使用 Java(Apache POI)解析 Word 文檔,包括獲取標題和內容

部分摘自https://blog.csdn.net/jane_feng/article/details/81221550

import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
 
import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
 
public class ParseWordUtil {
    private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
    // word整體樣式
    private static CTStyles wordStyles = null;
 
    /**
     * Reads the document-wide styles of a .docx template and stores them in the
     * static {@code wordStyles} field.
     *
     * <p>Checked failures (missing file, I/O error, XML parsing error) are logged
     * and leave {@code wordStyles} untouched.
     *
     * @param filepath path of the .docx template to read
     */
    public static void getWordStyle(String filepath) {
        InputStream in = null;
        try {
            // Open the template document.
            in = new FileInputStream(filepath);
            XWPFDocument template = new XWPFDocument(in);
            // Remember the template's overall style definitions.
            wordStyles = template.getStyle();
        } catch (FileNotFoundException e) {
            LOGGER.error("未找到文件",e);
        } catch (IOException e) {
            LOGGER.error("",e);
        } catch (XmlException e) {
            LOGGER.error("XML轉換異常",e);
        } finally {
            // Release the file handle on every path, including failures.
            close(in);
        }
    }
 
    /**
     * Extracts the titles of a Word document, selecting the parser from the file
     * extension.
     *
     * @param filepath path of the Word file
     * @return the result of {@code getWordTitles2007} for .docx files, otherwise the
     *         result of {@code getWordTitlesAndContext2003} in titles-only mode
     * @throws IOException if the file cannot be read
     */
    public static List<String> getWordTitles(String filepath) throws IOException {
        String extension = getWordVersion(filepath);
        // Compare case-insensitively so "REPORT.DOCX" is not handed to the binary .doc parser.
        if (".docx".equalsIgnoreCase(extension)) {
            return getWordTitles2007(filepath);
        }
        // Mode 1 = titles only (2 = body only, 3 = titles and body).
        return getWordTitlesAndContext2003(filepath, 1);
    }
 
    /**
     * Extracts the text of a Word document, selecting the parser from the file
     * extension.
     *
     * @param filepath path of the Word file
     * @return the result of {@code getParagraphText2007} for .docx files, otherwise the
     *         result of {@code getWordTitlesAndContext2003} in titles-and-body mode
     * @throws Exception if the file cannot be read or parsed
     */
    public static List<String> getWordText(String filepath) throws Exception {
        String extension = getWordVersion(filepath);
        // Compare case-insensitively so "REPORT.DOCX" is not handed to the binary .doc parser.
        if (".docx".equalsIgnoreCase(extension)) {
            return getParagraphText2007(filepath);
        }
        // Mode 3 = titles and body (every paragraph).
        return getWordTitlesAndContext2003(filepath, 3);
    }
 
    /**
     * Returns the extension of the given file, including the leading dot
     * (for example {@code ".doc"} or {@code ".docx"}).
     *
     * <p>Only the last path segment is inspected, so dots in directory names are
     * ignored. The extension is returned with its original letter case.
     *
     * @param filepath path of the file
     * @return the extension starting at the last dot of the file name, or an empty
     *         string when the file name contains no dot
     */
    public static String getWordVersion(String filepath) {
        String filename = new File(filepath).getName();
        int dotIndex = filename.lastIndexOf('.');
        if (dotIndex < 0) {
            // No extension: substring(-1) would throw StringIndexOutOfBoundsException.
            return "";
        }
        return filename.substring(dotIndex);
    }
 
    /**
     * Reads a binary (.doc) Word document and returns its paragraphs, optionally
     * filtered by the name of each paragraph's style.
     *
     * <p>A paragraph counts as a title when its style name contains "標題" and as
     * body text when its style name contains "正文". Paragraphs whose style cannot
     * be resolved appear only in the unfiltered result.
     *
     * @param path path of the .doc file
     * @param type 1: titles only; 2: body text only; any other value (including
     *             {@code null}): every paragraph
     * @return the selected paragraph texts in document order
     * @throws IOException if the file cannot be read
     */
    public static List<String> getWordTitlesAndContext2003(String path, Integer type) throws IOException {
        List<String> all = new ArrayList<String>();
        List<String> titles = new ArrayList<String>();
        List<String> body = new ArrayList<String>();

        InputStream is = new FileInputStream(path);
        try {
            HWPFDocument doc = new HWPFDocument(is);
            Range range = doc.getRange();
            // The style sheet does not change while iterating, so look it up once.
            StyleSheet styleSheet = doc.getStyleSheet();
            int numStyles = styleSheet.numStyles();
            int paragraphCount = range.numParagraphs();

            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                String text = paragraph.text();
                all.add(text); // titles + body

                // Skip classification when the style index is outside the style sheet.
                int styleIndex = paragraph.getStyleIndex();
                if (styleIndex >= numStyles) {
                    continue;
                }
                StyleDescription style = styleSheet.getStyleDescription(styleIndex);
                String styleName = (style == null) ? null : style.getName();
                if (styleName == null) {
                    continue;
                }
                if (styleName.contains("標題")) {
                    titles.add(text);
                } else if (styleName.contains("正文")) {
                    body.add(text);
                }
            }
        } finally {
            // Release the file handle even when parsing fails.
            close(is);
        }

        // A null type is treated like "everything" instead of failing on unboxing.
        if (type != null) {
            if (type == 1) {
                return titles;
            }
            if (type == 2) {
                return body;
            }
        }
        return all;
    }
 
    /**
     * Reads a .docx document and returns the text of its non-empty paragraphs.
     *
     * <p>NOTE(review): despite its name this method does not filter by heading
     * style; every paragraph with non-empty text is returned.
     *
     * @param path path of the .docx file
     * @return the text of each non-empty paragraph in document order
     * @throws IOException if the file cannot be read
     */
    public static List<String> getWordTitles2007(String path) throws IOException {
        List<String> list = new ArrayList<String>();
        InputStream is = new FileInputStream(path);
        try {
            XWPFDocument doc = new XWPFDocument(is);
            for (XWPFParagraph para : doc.getParagraphs()) {
                String text = para.getText();
                // NOTE(review): the "r" comparison is kept as-is; it may have been meant
                // as "\r" (lone carriage return) - confirm before changing.
                if (text != null && !"".equals(text) && !"r".equals(text)) {
                    // Trace through the class logger rather than printing to stdout.
                    LOGGER.debug(text.trim());
                    list.add(text);
                }
            }
        } finally {
            // Release the file handle even when parsing fails.
            close(is);
        }
        return list;
    }
 
    /**
     * Reads a .docx document and returns one formatted string per paragraph.
     *
     * <p>Each entry is the paragraph text with line feeds and tabs removed,
     * prefixed with two spaces and terminated with a line feed. Table contents
     * are not included.
     *
     * @param filePath path of the .docx file
     * @return the formatted paragraph texts in document order
     * @throws Exception if the file cannot be read or parsed
     */
    public static List<String> getParagraphText2007(String filePath) throws Exception {
        List<String> context = new ArrayList<String>();
        InputStream is = new FileInputStream(filePath);
        try {
            XWPFDocument doc = new XWPFDocument(is);
            for (XWPFParagraph para : doc.getParagraphs()) {
                // Literal replacement; no regular expression is needed here.
                String flattened = para.getText().replace("\n", "").replace("\t", "");
                context.add("  " + flattened + "\n");
            }
        } finally {
            // Release the file handle even when parsing fails.
            close(is);
        }
        return context;
    }
 
    /**
     * Writes comparison results into a two-column table of a new .docx file.
     *
     * <p>Row {@code i} is filled from {@code object.get(i)}. Rows whose mark is not 0
     * are shaded red. The left cell holds {@code getData()} unless the mark is 2
     * (added), in which case it is empty. The right cell holds {@code getDataAds()}
     * when the mark is 3 (modified), is empty when the mark is 1 (deleted), and
     * holds {@code getData()} otherwise.
     *
     * @param size       number of table rows to create
     * @param object     comparison results, one entry per table row
     * @param returnPath path of the .docx file to write; created if it does not exist
     * @throws IllegalArgumentException if {@code object} is null or has fewer entries
     *                                  than the table has rows
     * @throws Exception if the document cannot be written
     */
    public static void writeTable(int size, List<MatchingObject> object, String returnPath) throws Exception {
        XWPFDocument doc = new XWPFDocument();
        // Make sure the new document has a styles part.
        doc.createStyles();
        // Create the table with the requested number of rows and two columns.
        XWPFTable table = doc.createTable(size, 2);
        List<XWPFTableRow> rows = table.getRows();
        if (object == null || object.size() < rows.size()) {
            throw new IllegalArgumentException(
                    "object must contain at least " + rows.size() + " entries");
        }

        // Reuse the table properties POI already created; adding a second tblPr/tblW
        // element would produce duplicate elements in the document XML.
        CTTbl ctTbl = table.getCTTbl();
        CTTblPr tablePr = ctTbl.getTblPr();
        if (tablePr == null) {
            tablePr = ctTbl.addNewTblPr();
        }
        CTTblWidth width = tablePr.isSetTblW() ? tablePr.getTblW() : tablePr.addNewTblW();
        // Use an explicit width type so the value is not overridden by "auto".
        width.setType(STTblWidth.DXA);
        width.setW(BigInteger.valueOf(9000));

        for (int i = 0; i < rows.size(); i++) {
            XWPFTableRow row = rows.get(i);
            MatchingObject entry = object.get(i);
            // Row height.
            row.setHeight(400);

            List<XWPFTableCell> cells = row.getTableCells();
            for (int j = 0; j < cells.size(); j++) {
                XWPFTableCell cell = cells.get(j);
                if (entry.getMark() != 0) {
                    // Highlight every row that is not "unchanged" in red.
                    cell.setColor("ff0000");
                }
                // setColor may already have created the cell properties; reuse them.
                CTTc ctTc = cell.getCTTc();
                CTTcPr cellPr = ctTc.isSetTcPr() ? ctTc.getTcPr() : ctTc.addNewTcPr();
                cellPr.addNewVAlign().setVal(STVerticalJc.CENTER);

                if (j == 0) {
                    cellPr.addNewTcW().setW(BigInteger.valueOf(4500));
                    if (entry.getMark() == 2) { // added: nothing on the left side
                        cell.setText("");
                    } else { // unchanged, deleted, modified
                        cell.setText(entry.getData());
                    }
                } else if (j == 1) {
                    cellPr.addNewTcW().setW(BigInteger.valueOf(4500));
                    if (entry.getMark() == 3) { // modified
                        cell.setText(entry.getDataAds());
                    } else if (entry.getMark() == 1) { // deleted: nothing on the right side
                        cell.setText("");
                    } else { // unchanged, added
                        cell.setText(entry.getData());
                    }
                }
            }
        }

        // The file is created automatically when it does not exist.
        OutputStream os = new FileOutputStream(returnPath);
        try {
            doc.write(os);
        } finally {
            // Release the file handle even when writing fails.
            close(os);
        }
    }
 
    /**
     * Writes a sample document that reuses the template styles to the default
     * location {@code D://myDoc1.docx}.
     *
     * @throws IOException if the file cannot be written
     * @throws IllegalStateException if no template styles have been loaded
     */
    public static void formatDoc() throws IOException {
        formatDoc("D://myDoc1.docx");
    }

    /**
     * Writes a sample document (two headings and one body paragraph) that reuses
     * the styles held in {@code wordStyles}.
     *
     * @param outputPath path of the .docx file to write
     * @throws IOException if the file cannot be written
     * @throws IllegalStateException if {@code wordStyles} has not been populated,
     *                               e.g. because {@code getWordStyle} was not
     *                               called or failed
     */
    public static void formatDoc(String outputPath) throws IOException {
        if (wordStyles == null) {
            throw new IllegalStateException(
                    "Template styles not loaded; call getWordStyle(filepath) first");
        }
        XWPFDocument doc = new XWPFDocument();
        // Key step: replace the new document's styles with the template styles.
        XWPFStyles newStyles = doc.createStyles();
        newStyles.setStyles(wordStyles);

        // Heading using the template style with id "1".
        XWPFParagraph para1 = doc.createParagraph();
        para1.setStyle("1");
        para1.createRun().setText("標題 1");

        // Heading using the template style with id "2".
        XWPFParagraph para2 = doc.createParagraph();
        para2.setStyle("2");
        para2.createRun().setText("標題 2");

        // Body paragraph with the default style.
        XWPFParagraph paraX = doc.createParagraph();
        paraX.createRun().setText("正文");

        FileOutputStream fos = new FileOutputStream(outputPath);
        try {
            doc.write(fos);
        } finally {
            // Release the file handle even when writing fails.
            fos.close();
        }
    }
 
    /**
     * Writes a sample document with custom heading styles to the default location
     * {@code D:/myDoc2.docx}.
     *
     * @throws IOException if the file cannot be written
     */
    public static void writeSimpleDocxFile() throws IOException {
        writeSimpleDocxFile("D:/myDoc2.docx");
    }

    /**
     * Writes a sample document (two headings and one body paragraph) whose heading
     * styles are defined programmatically instead of being copied from a template.
     *
     * @param outputPath path of the .docx file to write
     * @throws IOException if the file cannot be written
     */
    public static void writeSimpleDocxFile(String outputPath) throws IOException {
        XWPFDocument docxDocument = new XWPFDocument();

        // Register the heading styles under the names Word itself uses for headings,
        // otherwise the heading levels may get mixed up.
        addCustomHeadingStyle(docxDocument, "標題 1", 1);
        addCustomHeadingStyle(docxDocument, "標題 2", 2);

        // Level-1 heading.
        XWPFParagraph heading1 = docxDocument.createParagraph();
        heading1.createRun().setText("標題 1");
        heading1.setStyle("標題 1");

        // Level-2 heading.
        XWPFParagraph heading2 = docxDocument.createParagraph();
        heading2.createRun().setText("標題 2");
        heading2.setStyle("標題 2");

        // Body paragraph with the default style.
        XWPFParagraph bodyParagraph = docxDocument.createParagraph();
        bodyParagraph.createRun().setText("正文");

        FileOutputStream fos = new FileOutputStream(outputPath);
        try {
            docxDocument.write(fos);
        } finally {
            // Release the file handle even when writing fails.
            fos.close();
        }
    }
 
    /**
     * Adds a paragraph style that acts as a heading of the given level.
     *
     * @param docxDocument document that receives the style
     * @param strStyleId   value used both as the style id and as the style name
     * @param headingLevel heading level, 1 (top level) to 9
     * @throws IllegalArgumentException if {@code headingLevel} is outside 1..9
     */
    private static void addCustomHeadingStyle(XWPFDocument docxDocument, String strStyleId, int headingLevel) {
        if (headingLevel < 1 || headingLevel > 9) {
            throw new IllegalArgumentException("headingLevel must be between 1 and 9: " + headingLevel);
        }

        CTStyle ctStyle = CTStyle.Factory.newInstance();
        ctStyle.setStyleId(strStyleId);

        CTString styleName = CTString.Factory.newInstance();
        styleName.setVal(strStyleId);
        ctStyle.setName(styleName);

        // Lower number means the style is more prominent in the formats bar.
        CTDecimalNumber uiPriority = CTDecimalNumber.Factory.newInstance();
        uiPriority.setVal(BigInteger.valueOf(headingLevel));
        ctStyle.setUiPriority(uiPriority);

        CTOnOff enabled = CTOnOff.Factory.newInstance();
        ctStyle.setUnhideWhenUsed(enabled);
        // The style shows up in the formats bar.
        ctStyle.setQFormat(enabled);

        // w:outlineLvl is zero-based (0 = outline level 1), so heading level N maps to N - 1.
        CTDecimalNumber outlineLevel = CTDecimalNumber.Factory.newInstance();
        outlineLevel.setVal(BigInteger.valueOf(headingLevel - 1));
        CTPPr ppr = CTPPr.Factory.newInstance();
        ppr.setOutlineLvl(outlineLevel);
        ctStyle.setPPr(ppr);

        XWPFStyle style = new XWPFStyle(ctStyle);
        style.setType(STStyleType.PARAGRAPH);

        // createStyles() returns the existing styles part when one is already defined.
        XWPFStyles styles = docxDocument.createStyles();
        styles.addStyle(style);
    }
 
 
    /**
     * Closes an input stream quietly: a {@code null} stream is ignored and a
     * failure to close is logged instead of being propagated.
     *
     * @param is the input stream to close, may be {@code null}
     */
    private static void close(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            LOGGER.error("流關閉異常",e);
        }
    }
 
    /**
     * Closes an output stream quietly: a {@code null} stream is ignored and a
     * failure to close is logged instead of being propagated.
     *
     * @param os the output stream to close, may be {@code null}
     */
    private static void close(OutputStream os) throws Exception {
        if (os == null) {
            return;
        }
        try {
            os.close();
        } catch (IOException e) {
            LOGGER.error("流關閉異常",e);
        }
    }
}

不過運行時可能會報以下錯誤:

Exception in thread "main" java.lang.NoSuchMethodError: org.apache.poi.util.POILogger.log(ILjava/lang/Object;)V
	at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.parseRelationshipsPart(PackageRelationshipCollection.java:313)
	at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.<init>(PackageRelationshipCollection.java:162)
	at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.<init>(PackageRelationshipCollection.java:130)
	at org.apache.poi.openxml4j.opc.PackagePart.loadRelationships(PackagePart.java:559)
	at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:112)
	at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:83)
	at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:128)
	at org.apache.poi.openxml4j.opc.ZipPackagePart.<init>(ZipPackagePart.java:78)
	at org.apache.poi.openxml4j.opc.ZipPackage.getPartsImpl(ZipPackage.java:218)
	at org.apache.poi.openxml4j.opc.OPCPackage.getParts(OPCPackage.java:662)
	at org.apache.poi.openxml4j.opc.OPCPackage.open(OPCPackage.java:269)
	at org.apache.poi.util.PackageHelper.open(PackageHelper.java:39)
	at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:121)
	at com.word.WordTest3.getWordTitles2007(WordTest3.java:47)
	at com.word.WordTest3.getWordTitles(WordTest3.java:29)
	at com.word.WordTest3.main(WordTest3.java:22)

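解決方案:該錯誤是由於 classpath 中 POI 各 jar 的版本不一致造成的,需保證 poi、poi-ooxml、poi-ooxml-schemas 使用同一版本(例如 3.17)。另外,解析 .doc 所用的 org.apache.poi.hwpf 位於 poi-scratchpad,也需引入相同版本。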

    <dependency>  
        <groupId>org.apache.poi</groupId>  
        <artifactId>poi-ooxml</artifactId>  
        <version>3.17</version>  
    </dependency>  
    <dependency>  
        <groupId>org.apache.poi</groupId>  
        <artifactId>poi-ooxml-schemas</artifactId>  
        <version>3.17</version>  
    </dependency>  
    <dependency>  
        <groupId>org.apache.poi</groupId>  
        <artifactId>poi</artifactId>  
        <version>3.17</version>  
    </dependency>  

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章