POI實現Word轉HTML文件

package cn.wgd.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.AbstractWordUtils;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import fr.opensagres.poi.xwpf.converter.core.IXWPFConverter;
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

/**
 * Converts Word documents to HTML, e.g. so that attachments can be previewed
 * in a browser.
 *
 * <p>Only Word input is handled by this class: files whose name ends in
 * {@code .docx} go through the xdocreport {@link XHTMLConverter}; every other
 * file is handed to POI's HWPF {@link WordToHtmlConverter}.
 *
 * <p>Background on image handling: https://www.cnblogs.com/feiruo/p/5924514.html
 *
 * <p>Besides the POI jars (written against poi 3.17) this example needs:
 * <ul>
 *   <li>fr.opensagres.poi.xwpf.converter.core-2.0.1.jar</li>
 *   <li>fr.opensagres.poi.xwpf.converter.xhtml-2.0.1.jar</li>
 *   <li>fr.opensagres.xdocreport.core-2.0.1.jar</li>
 *   <li>ooxml-schemas-1.3.jar, etc.</li>
 * </ul>
 *
 * <p>Note: this is a deliberately simple implementation. Documents that need
 * richer style handling require additional work by the caller.
 *
 * @author Kevin 2018-3-14
 */
public class ConvertWord2HtmlUtil {

    /** File-name suffix that selects the OOXML (docx) conversion path. */
    private static final String DOCX_SUFFIX = ".docx";

    /**
     * Command-line entry point.
     *
     * <p>Optional arguments override the built-in sample locations:
     * {@code args[0]} source document, {@code args[1]} target HTML file,
     * {@code args[2]} image directory. Any argument that is not supplied falls
     * back to the original hard-coded sample path.
     *
     * @param args optional {@code [source] [targetHtml] [imageDir]}
     * @throws IOException if reading the source or writing the output fails
     * @throws ParserConfigurationException if no DOM builder can be created
     * @throws TransformerException if serializing the HTML fails
     * @throws SAXException declared by {@link #word2007ToHtml(String, String, String)}
     */
    public static void main(String[] args) throws IOException, ParserConfigurationException, TransformerException, SAXException {
        String path = args.length > 0 ? args[0] : "D:\\testfile2html\\test.docx";
        String descPath = args.length > 1 ? args[1] : "D:\\testfile2html\\test.html";
        String imagePath = args.length > 2 ? args[2] : "D:\\testfile2html";
        word2007ToHtml(path, descPath, imagePath);
    }

    /**
     * Converts a binary Word document to an HTML file. Modelled on
     * {@code org.apache.poi.hwpf.converter.WordToHtmlConverter.main()}.
     *
     * <p>The result is serialized as indented, UTF-8 encoded HTML.
     *
     * @param path location of the source document; must not be {@code null}
     * @param descPath location of the HTML file to write
     * @throws NullPointerException if {@code path} is {@code null}
     * @throws IOException if the source cannot be read or the target cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created
     * @throws TransformerException if serializing the DOM fails
     */
    public static void word95T2007ToHtml(String path, String descPath)
            throws IOException, ParserConfigurationException, TransformerException {
        if (path == null) {
            throw new NullPointerException("路徑不能爲空!");
        }

        System.out.println("Converting " + path);
        System.out.println("Saving output to " + descPath);

        Document htmlDocument = process(new File(path));

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        // TODO set encoding from a command argument
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        // Write through a stream we own so that it is always flushed and
        // closed here, instead of leaving the file handle to the transformer.
        try (FileOutputStream out = new FileOutputStream(descPath)) {
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
        }
    }

    /**
     * Loads a Word document and converts it into an in-memory HTML DOM.
     * Taken from {@code org.apache.poi.hwpf.converter.WordToHtmlConverter}.
     *
     * @param docFile the Word file to load
     * @return the DOM document produced by the converter
     * @throws IOException if the file cannot be loaded
     * @throws ParserConfigurationException if no DOM builder can be created
     */
    static Document process(File docFile) throws IOException, ParserConfigurationException {
        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc(docFile);
        Document target = XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(target);
        wordToHtmlConverter.processDocument(wordDocument);
        return wordToHtmlConverter.getDocument();
    }

    /**
     * Converts a Word document ({@code .doc} or {@code .docx}) to an HTML file.
     *
     * <p>Files whose name ends in {@code .docx} (compared case-insensitively)
     * are converted with the xdocreport XHTML converter; all other files are
     * delegated to {@link #word95T2007ToHtml(String, String)}. If the source
     * file does not exist, a message is printed and nothing is converted.
     *
     * @param path location of the source document (doc or docx); must not be {@code null}
     * @param descPath location of the HTML file to write
     * @param imagePath base directory for extracted images; the image manager is
     *        configured with the sub-directory name {@code "image"}, so this should
     *        be the directory containing the HTML file for the images to be found
     * @throws NullPointerException if {@code path} is {@code null}
     * @throws IOException if the source cannot be read or the target cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created
     * @throws TransformerException if serializing the HTML fails
     * @throws SAXException kept in the signature for existing callers
     */
    public static void word2007ToHtml(String path, String descPath, String imagePath)
            throws IOException, ParserConfigurationException, TransformerException, SAXException {
        if (path == null) {
            throw new NullPointerException("路徑不能爲空!");
        }

        File sourceFile = new File(path);
        if (!sourceFile.exists()) {
            System.out.println("用戶文件不存在!");
            return;
        }

        if (!isDocx(path)) {
            word95T2007ToHtml(path, descPath);
            return;
        }

        // Resources are closed in reverse order even when the conversion fails.
        try (FileInputStream in = new FileInputStream(sourceFile);
                XWPFDocument document = new XWPFDocument(in)) {
            // HTML converter
            IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
            // Conversion options
            XHTMLOptions options = XHTMLOptions.create();
            // Image handling: the second argument is a sub-directory name that must
            // sit next to the HTML file, otherwise the images cannot be found.
            options.setImageManager(new ImageManager(new File(imagePath), "image"));

            try (FileOutputStream out = new FileOutputStream(descPath)) {
                converter.convert(document, out, options);
            }
        }
    }

    /** Returns whether {@code path} ends with the docx suffix, ignoring case. */
    private static boolean isDocx(String path) {
        int suffixStart = path.length() - DOCX_SUFFIX.length();
        // regionMatches returns false for a negative offset, i.e. for names
        // shorter than the suffix.
        return path.regionMatches(true, suffixStart, DOCX_SUFFIX, 0, DOCX_SUFFIX.length());
    }
}
發佈了98 篇原創文章 · 獲贊 115 · 訪問量 3萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章