以下代碼部分摘自:https://blog.csdn.net/jane_feng/article/details/81221550
import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
public class ParseWordUtil {
// Log4j logger shared by the static helpers of this class.
private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
// Document-wide styles of the template; assigned by getWordStyle and applied by formatDoc.
private static CTStyles wordStyles = null;
/**
 * Loads the .docx template at {@code filepath} and caches its document-wide
 * styles in the static {@code wordStyles} field.
 *
 * <p>Failures are logged rather than thrown; in that case {@code wordStyles}
 * keeps whatever value it had before the call.
 *
 * @param filepath path of the template .docx file
 */
public static void getWordStyle(String filepath) {
    InputStream in = null;
    try {
        in = new FileInputStream(filepath);
        // Read the template document.
        XWPFDocument template = new XWPFDocument(in);
        // Keep the template's document-wide styles for later use.
        wordStyles = template.getStyle();
    } catch (FileNotFoundException e) {
        LOGGER.error("未找到文件",e);
    } catch (IOException e) {
        LOGGER.error("",e);
    } catch (XmlException e) {
        LOGGER.error("XML轉換異常",e);
    } finally {
        // Always release the file handle, even when opening or parsing fails.
        close(in);
    }
}
/**
 * Reads the titles of a Word document, choosing the parser from the file extension.
 *
 * <p>A {@code .docx} file (compared case-insensitively) is handed to
 * {@code getWordTitles2007}; anything else is handed to
 * {@code getWordTitlesAndContext2003} with type 1 (headings only).
 *
 * @param filepath path of the Word file
 * @return the list produced by the selected parser
 * @throws IOException if the file cannot be read
 */
public static List<String> getWordTitles(String filepath) throws IOException {
    String extension = getWordVersion(filepath);
    // Ignore case so that e.g. "REPORT.DOCX" is not sent to the binary .doc parser.
    if (".docx".equalsIgnoreCase(extension)) {
        return getWordTitles2007(filepath);
    }
    // 1: headings only; 2: body only; 3: headings and body
    return getWordTitlesAndContext2003(filepath, 1);
}
/**
 * Reads the text of a Word document, choosing the parser from the file extension.
 *
 * <p>A {@code .docx} file (compared case-insensitively) is handed to
 * {@code getParagraphText2007}; anything else is handed to
 * {@code getWordTitlesAndContext2003} with type 3 (headings and body).
 *
 * @param filepath path of the Word file
 * @return the list produced by the selected parser
 * @throws Exception if the file cannot be read or parsed
 */
public static List<String> getWordText(String filepath) throws Exception {
    String extension = getWordVersion(filepath);
    // Ignore case so that e.g. "REPORT.DOCX" is not sent to the binary .doc parser.
    if (".docx".equalsIgnoreCase(extension)) {
        return getParagraphText2007(filepath);
    }
    // 3: headings and body
    return getWordTitlesAndContext2003(filepath, 3);
}
/**
 * Returns the extension of the file named by {@code filepath}, including the
 * leading dot (for example {@code ".docx"}).
 *
 * <p>Only the last path element is inspected, so dots in directory names are
 * ignored. When the file name contains no dot an empty string is returned
 * instead of failing with a {@code StringIndexOutOfBoundsException}.
 * Callers in this class only distinguish the 2003 (.doc) and 2007 (.docx) formats;
 * Word 97 is not considered.
 *
 * @param filepath path of the file
 * @return the extension with its leading dot, or {@code ""} if there is none
 */
public static String getWordVersion(String filepath) {
    String name = new File(filepath).getName();
    int dot = name.lastIndexOf('.');
    if (dot < 0) {
        return "";
    }
    return name.substring(dot);
}
/**
 * Reads a Word 2003 (.doc) file and returns its paragraph texts.
 *
 * <p>Each paragraph is classified by the name of its paragraph style: a name
 * containing "標題" counts as a heading, a name containing "正文" counts as
 * body text. Paragraphs whose style cannot be resolved, or whose style name
 * matches neither, only appear in the combined list.
 *
 * @param path file path
 * @param type 1: headings only; 2: body only; any other value (including
 *             {@code null}): every paragraph in document order
 * @return the selected list of paragraph texts
 * @throws IOException if the file cannot be opened or parsed
 */
public static List<String> getWordTitlesAndContext2003(String path, Integer type) throws IOException {
    InputStream is = new FileInputStream(path);
    try {
        HWPFDocument doc = new HWPFDocument(is);
        Range range = doc.getRange();
        // The style sheet does not change while iterating, so look it up once.
        StyleSheet styleSheet = doc.getStyleSheet();
        int numStyles = styleSheet.numStyles();

        List<String> all = new ArrayList<String>();
        List<String> titles = new ArrayList<String>();
        List<String> context = new ArrayList<String>();

        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph p = range.getParagraph(i);
            String text = p.text();
            all.add(text); // headings + body

            // Skip classification when the style index does not point into the style sheet.
            int styleIndex = p.getStyleIndex();
            if (styleIndex < 0 || styleIndex >= numStyles) {
                continue;
            }
            StyleDescription style = styleSheet.getStyleDescription(styleIndex);
            String styleName = (style == null) ? null : style.getName();
            if (styleName == null) {
                continue;
            }
            if (styleName.contains("標題")) {
                titles.add(text);
            } else if (styleName.contains("正文")) {
                context.add(text);
            }
        }

        // equals() instead of == so that a null type does not fail on unboxing.
        if (Integer.valueOf(1).equals(type)) {
            return titles;
        }
        if (Integer.valueOf(2).equals(type)) {
            return context;
        }
        return all;
    } finally {
        // Release the file handle on every path, including parse failures.
        close(is);
    }
}
/**
 * Returns the text of every non-empty body paragraph of a Word 2007 (.docx) file.
 *
 * <p>Known limitation: despite its name, this method does not look at the
 * paragraph style, so non-heading paragraphs are returned as well.
 *
 * @param path path of the .docx file
 * @return paragraph texts in document order
 * @throws IOException if the file cannot be opened or parsed
 */
public static List<String> getWordTitles2007(String path) throws IOException {
    InputStream is = new FileInputStream(path);
    try {
        XWPFDocument doc = new XWPFDocument(is);
        List<String> list = new ArrayList<String>();
        for (XWPFParagraph para : doc.getParagraphs()) {
            String text = para.getText();
            // NOTE(review): the "r" comparison looks like a mangled "\r"; kept unchanged - confirm.
            if (text != null && !"".equals(text) && !"r".equals(text)) {
                // Trace output goes to the logger instead of stdout.
                LOGGER.debug(text.trim());
                list.add(text);
            }
        }
        return list;
    } finally {
        // Release the file handle on every path, including parse failures.
        close(is);
    }
}
/**
 * Reads the body paragraphs of a Word 2007 (.docx) file.
 *
 * <p>Every paragraph yields one entry: its text with line breaks and tabs
 * removed, prefixed with a space and terminated with a newline. Tables are
 * not visited by this method.
 *
 * @param filePath path of the .docx file
 * @return one formatted entry per paragraph, in document order
 * @throws Exception if the file cannot be opened or parsed
 */
public static List<String> getParagraphText2007(String filePath) throws Exception {
    InputStream is = new FileInputStream(filePath);
    try {
        XWPFDocument doc = new XWPFDocument(is);
        List<String> context = new ArrayList<String>();
        for (XWPFParagraph para : doc.getParagraphs()) {
            String flattened = para.getText().replaceAll("\\n", "").replaceAll("\\t", "");
            context.add(" " + flattened + "\n");
        }
        return context;
    } finally {
        // Release the file handle on every path, including parse failures.
        close(is);
    }
}
/**
 * Writes the comparison result into a two-column table of a new .docx file.
 *
 * <p>For each entry the first column shows {@code getData()} unless the mark
 * is 2 (added), in which case it is left blank. The second column shows
 * {@code getDataAds()} for mark 3 (modified), is blank for mark 1 (deleted),
 * and shows {@code getData()} otherwise. Cells of entries whose mark is not 0
 * are filled with red ("ff0000").
 *
 * <p>If {@code object} holds fewer entries than the table has rows, the
 * remaining rows are left empty.
 *
 * @param size number of table rows to create
 * @param object comparison results, one per row
 * @param returnPath path of the .docx file to write (created if missing)
 * @throws Exception if the document cannot be written
 */
public static void writeTable(int size, List<MatchingObject> object, String returnPath) throws Exception {
    XWPFDocument doc = new XWPFDocument();
    // Make sure the new document owns a styles part.
    doc.createStyles();
    // Create the table: one row per comparison entry, two columns.
    XWPFTable table = doc.createTable(size, 2);

    // Table properties: reuse the ones created together with the table, if any,
    // rather than appending a second <w:tblPr> element.
    CTTbl ctTbl = table.getCTTbl();
    CTTblPr tablePr = (ctTbl.getTblPr() != null) ? ctTbl.getTblPr() : ctTbl.addNewTblPr();
    // Table width, in twentieths of a point (dxa).
    CTTblWidth width = tablePr.isSetTblW() ? tablePr.getTblW() : tablePr.addNewTblW();
    width.setType(STTblWidth.DXA);
    width.setW(BigInteger.valueOf(9000));

    List<XWPFTableRow> rows = table.getRows();
    // Never index past the end of the result list.
    int rowCount = Math.min(rows.size(), object.size());
    for (int i = 0; i < rowCount; i++) {
        XWPFTableRow row = rows.get(i);
        MatchingObject item = object.get(i);
        // Row height.
        row.setHeight(400);

        List<XWPFTableCell> cells = row.getTableCells();
        int cellCount = cells.size();
        for (int j = 0; j < cellCount; j++) {
            XWPFTableCell cell = cells.get(j);
            if (item.getMark() != 0) {
                // Highlight every entry that is not unchanged.
                cell.setColor("ff0000"); // red
            }
            // Cell properties: setColor may already have created <w:tcPr>, so reuse it.
            CTTc ctTc = cell.getCTTc();
            CTTcPr cellPr = ctTc.isSetTcPr() ? ctTc.getTcPr() : ctTc.addNewTcPr();
            CTVerticalJc vAlign = cellPr.isSetVAlign() ? cellPr.getVAlign() : cellPr.addNewVAlign();
            vAlign.setVal(STVerticalJc.CENTER);

            if (j == 0) {
                CTTblWidth cellWidth = cellPr.isSetTcW() ? cellPr.getTcW() : cellPr.addNewTcW();
                cellWidth.setW(BigInteger.valueOf(4500));
                if (item.getMark() == 2) { // added: nothing on the first side
                    cell.setText("");
                } else { // unchanged, deleted, modified
                    cell.setText(item.getData());
                }
            } else if (j == 1) {
                CTTblWidth cellWidth = cellPr.isSetTcW() ? cellPr.getTcW() : cellPr.addNewTcW();
                cellWidth.setW(BigInteger.valueOf(4500));
                if (item.getMark() == 3) { // modified
                    cell.setText(item.getDataAds());
                } else if (item.getMark() == 1) { // deleted: nothing on the second side
                    cell.setText("");
                } else { // unchanged, added
                    cell.setText(item.getData());
                }
            }
        }
    }

    // The file is created automatically when it does not exist.
    OutputStream os = new FileOutputStream(returnPath);
    try {
        doc.write(os);
    } finally {
        // Close the stream even when writing fails.
        close(os);
    }
}
/**
 * Writes a sample document that reuses the template styles to the default
 * location {@code D://myDoc1.docx}.
 *
 * @throws IOException if the file cannot be written
 * @throws IllegalStateException if no template styles have been loaded
 */
public static void formatDoc() throws IOException {
    formatDoc("D://myDoc1.docx");
}

/**
 * Writes a sample document (two headings and one body paragraph) that
 * reuses the styles previously cached by {@code getWordStyle}.
 *
 * @param outputPath path of the .docx file to write
 * @throws IOException if the file cannot be written
 * @throws IllegalStateException if no template styles have been loaded
 */
public static void formatDoc(String outputPath) throws IOException {
    if (wordStyles == null) {
        // Fail with a clear message instead of an obscure NullPointerException.
        throw new IllegalStateException("Template styles not loaded; call getWordStyle(filepath) first");
    }
    // New Word document.
    XWPFDocument doc = new XWPFDocument();
    // Key step: replace the new document's styles with the ones read from the template.
    XWPFStyles newStyles = doc.createStyles();
    newStyles.setStyles(wordStyles);

    // Heading 1. Key step: style id "1" (level-1 outline in the template).
    XWPFParagraph para1 = doc.createParagraph();
    para1.setStyle("1");
    para1.createRun().setText("標題 1");

    // Heading 2. Key step: style id "2" (level-2 outline in the template).
    XWPFParagraph para2 = doc.createParagraph();
    para2.setStyle("2");
    para2.createRun().setText("標題 2");

    // Body text.
    XWPFParagraph paraX = doc.createParagraph();
    paraX.createRun().setText("正文");

    // Write the document to the file.
    FileOutputStream fos = new FileOutputStream(outputPath);
    try {
        doc.write(fos);
    } finally {
        // Close the stream even when writing fails.
        fos.close();
    }
}
/**
 * Writes a sample document using self-defined heading styles to the default
 * location {@code D:/myDoc2.docx}.
 *
 * @throws IOException if the file cannot be written
 */
public static void writeSimpleDocxFile() throws IOException {
    writeSimpleDocxFile("D:/myDoc2.docx");
}

/**
 * Writes a sample document (two headings and one body paragraph) whose
 * heading styles are registered through {@code addCustomHeadingStyle}.
 *
 * @param outputPath path of the .docx file to write
 * @throws IOException if the file cannot be written
 */
public static void writeSimpleDocxFile(String outputPath) throws IOException {
    XWPFDocument docxDocument = new XWPFDocument();
    // For Chinese Word it is best to use the heading names Word itself provides;
    // with arbitrary names the heading levels may get mixed up.
    addCustomHeadingStyle(docxDocument, "標題 1", 1);
    addCustomHeadingStyle(docxDocument, "標題 2", 2);

    // Heading 1.
    XWPFParagraph paragraph = docxDocument.createParagraph();
    paragraph.createRun().setText("標題 1");
    paragraph.setStyle("標題 1");

    // Heading 2.
    XWPFParagraph paragraph2 = docxDocument.createParagraph();
    paragraph2.createRun().setText("標題 2");
    paragraph2.setStyle("標題 2");

    // Body text.
    XWPFParagraph paragraphX = docxDocument.createParagraph();
    paragraphX.createRun().setText("正文");

    // Write the document to the file.
    FileOutputStream fos = new FileOutputStream(outputPath);
    try {
        docxDocument.write(fos);
    } finally {
        // Close the stream even when writing fails.
        fos.close();
    }
}
/**
 * Registers a paragraph style that acts as a heading of the given level.
 *
 * @param docxDocument document that receives the style
 * @param strStyleId used both as the style id and as the style name
 * @param headingLevel 1-based heading level; also used as the UI priority
 */
private static void addCustomHeadingStyle(XWPFDocument docxDocument, String strStyleId, int headingLevel) {
    CTStyle ctStyle = CTStyle.Factory.newInstance();
    ctStyle.setStyleId(strStyleId);

    CTString styleName = CTString.Factory.newInstance();
    styleName.setVal(strStyleId);
    ctStyle.setName(styleName);

    // Lower number > style is more prominent in the formats bar.
    CTDecimalNumber uiPriority = CTDecimalNumber.Factory.newInstance();
    uiPriority.setVal(BigInteger.valueOf(headingLevel));
    ctStyle.setUiPriority(uiPriority);

    CTOnOff on = CTOnOff.Factory.newInstance();
    ctStyle.setUnhideWhenUsed(on);
    // Style shows up in the formats bar.
    ctStyle.setQFormat(on);

    // Style defines a heading of the given level. w:outlineLvl is zero-based
    // (0 means outline level 1), so heading level N maps to N - 1.
    CTDecimalNumber outlineLevel = CTDecimalNumber.Factory.newInstance();
    outlineLevel.setVal(BigInteger.valueOf(Math.max(0, headingLevel - 1)));
    CTPPr ppr = CTPPr.Factory.newInstance();
    ppr.setOutlineLvl(outlineLevel);
    ctStyle.setPPr(ppr);

    XWPFStyle style = new XWPFStyle(ctStyle);
    // Is a null op if already defined.
    XWPFStyles styles = docxDocument.createStyles();
    style.setType(STStyleType.PARAGRAPH);
    styles.addStyle(style);
}
/**
 * Closes an input stream quietly: a {@code null} argument is ignored and a
 * failure while closing is logged instead of being propagated.
 *
 * @param is input stream to close, may be {@code null}
 */
private static void close(InputStream is) {
    if (is == null) {
        return;
    }
    try {
        is.close();
    } catch (IOException closeFailure) {
        LOGGER.error("流關閉異常",closeFailure);
    }
}
/**
 * Closes an output stream quietly: a {@code null} argument is ignored and a
 * failure while closing is logged instead of being propagated.
 *
 * <p>No checked exception can escape, so none is declared.
 *
 * @param os output stream to close, may be {@code null}
 */
private static void close(OutputStream os) {
    if (os != null) {
        try {
            os.close();
        } catch (IOException e) {
            LOGGER.error("流關閉異常",e);
        }
    }
}
}
不過運行時可能會報如下錯誤(NoSuchMethodError 通常表示 classpath 中 POI 各 jar 的版本不一致):
Exception in thread "main" java.lang.NoSuchMethodError: org.apache.poi.util.POILogger.log(ILjava/lang/Object;)V
at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.parseRelationshipsPart(PackageRelationshipCollection.java:313)
at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.<init>(PackageRelationshipCollection.java:162)
at org.apache.poi.openxml4j.opc.PackageRelationshipCollection.<init>(PackageRelationshipCollection.java:130)
at org.apache.poi.openxml4j.opc.PackagePart.loadRelationships(PackagePart.java:559)
at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:112)
at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:83)
at org.apache.poi.openxml4j.opc.PackagePart.<init>(PackagePart.java:128)
at org.apache.poi.openxml4j.opc.ZipPackagePart.<init>(ZipPackagePart.java:78)
at org.apache.poi.openxml4j.opc.ZipPackage.getPartsImpl(ZipPackage.java:218)
at org.apache.poi.openxml4j.opc.OPCPackage.getParts(OPCPackage.java:662)
at org.apache.poi.openxml4j.opc.OPCPackage.open(OPCPackage.java:269)
at org.apache.poi.util.PackageHelper.open(PackageHelper.java:39)
at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:121)
at com.word.WordTest3.getWordTitles2007(WordTest3.java:47)
at com.word.WordTest3.getWordTitles(WordTest3.java:29)
at com.word.WordTest3.main(WordTest3.java:22)
解決方案:保證 poi、poi-ooxml、poi-ooxml-schemas 使用同一版本(代碼中用到的 org.apache.poi.hwpf 位於 poi-scratchpad,也需要引入相同版本的該依賴),例如:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>