Java Word 转换成 HTML 源码分享

2021 年 10 月 30 日

2716 次浏览

5682字数

代码如下

package com.cong.quartz.util;

import org.apache.commons.io.FileUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.List;

/**
 * word 转换成html
 */
public class WordToHtml {

    /**
     * docx转换成html
     */
    public static void Word2007ToHtml(String datestr,String id,String docpath,String htmlpath, String docname, String htmlname) throws IOException {
        ///mnt/ChinaApp/Tomcat/apache-tomcat-8-8060/webapps/PWebService/doc/202010/0e5b5829881647248a58c4fd94cb41e3.docx
        final String file = docpath + "/" + datestr + "/" + docname;
        File f = new File(file);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
        } else {
            if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

                // 1) 加载word文档生成 XWPFDocument对象
                InputStream in = new FileInputStream(f);
                XWPFDocument document = new XWPFDocument(in);

                // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
                ///mnt/ChinaApp/Tomcat/apache-tomcat-8-8060/webapps/PWebService/html/202010/80c0ce15d6e940408b157f00f10d523d/image5.png
                File htmlFolderFile = new File(htmlpath + "/" + datestr);
                if(!htmlFolderFile.exists()){
                    htmlFolderFile.mkdirs();
                }
                File imageFolderFile = new File(htmlpath + "/" + datestr+ "/" + id);
                if(!imageFolderFile.exists()){
                    imageFolderFile.mkdirs();
                }
                //XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
                XHTMLOptions options = XHTMLOptions.create();;
                options.setExtractor(new FileImageExtractor(imageFolderFile));
                //图片位置---这里需要改变
                options.URIResolver(new BasicURIResolver(id));
                options.setIgnoreStylesIfUnused(false);
                options.setFragment(true);

                // 2) Prepare XHTML options (here we set the IURIResolver to load images from a "word/media" folder)
                // 3) 将 XWPFDocument转换成XHTML
                ///mnt/ChinaApp/Tomcat/apache-tomcat-8-8060/webapps/PWebService/html/202010/00b4fe3d59ac486187f2f5173e359075.html
                String targetFileName = htmlpath + "/" + datestr + "/" + htmlname;
                OutputStreamWriter outputStreamWriter = new OutputStreamWriter(new FileOutputStream(targetFileName), "utf-8");
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, outputStreamWriter, options);
                //OutputStream out = new FileOutputStream(new File(htmlpath + "/" + datestr + "/" + htmlname));
                //XHTMLConverter.getInstance().convert(document, out, options);
            } else {
                System.out.println("Enter only MS Office 2007+ files");
            }
        }
    }

    /**
     * doc转换成html
     */
    public static void convert2Html(String datestr,String id,String docpath,String htmlpath,String docname, String htmlname)
            throws TransformerException, IOException,
            ParserConfigurationException {
        ///mnt/ChinaApp/Tomcat/apache-tomcat-8-8060/webapps/PWebService/doc/202010/0e5b5829881647248a58c4fd94cb41e3.docx
        File htmlFolderFile = new File(htmlpath + "/" + datestr);
        if(!htmlFolderFile.exists()){
            htmlFolderFile.mkdirs();
        }
        File imageFolderFile = new File(htmlpath + "/" + datestr+ "/" + id);
        if(!imageFolderFile.exists()){
            imageFolderFile.mkdirs();
        }
        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(docpath+"/"+datestr+"/"+docname));//WordToHtmlUtils.loadDoc(new FileInputStream(inputFile));
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        wordToHtmlConverter.setPicturesManager( new PicturesManager()
        {
            public String savePicture(byte[] content,
                                      PictureType pictureType, String suggestedName,
                                      float widthInches, float heightInches )
            {
                return htmlpath + "/" + datestr + "/" + id + "/" +suggestedName;
            }
        } );
        wordToHtmlConverter.processDocument(wordDocument);
        //save pictures
        List pics=wordDocument.getPicturesTable().getAllPictures();
        if(pics!=null){
            for(int i=0;i<pics.size();i++){
                Picture pic = (Picture)pics.get(i);
                System.out.println();
                try {
                    ///mnt/ChinaApp/Tomcat/apache-tomcat-8-8060/webapps/PWebService/html/202010/80c0ce15d6e940408b157f00f10d523d/image5.png
                    pic.writeImageContent(new FileOutputStream(htmlpath + "/" + datestr + "/" + id + "/" + pic.suggestFullFileName()));
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                }
            }
        }
        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(out);

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        //serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312");
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        out.close();

        String content = new String(out.toByteArray());
        String imgWebPath = htmlpath + "/" + datestr + "/" + id;
        content = content.replace(imgWebPath, id);
        String targetFilePath = htmlpath + "/" + datestr + "/" +htmlname;
        FileUtils.writeStringToFile(new File(targetFilePath), content, "utf-8");
        //writeFile(new String(out.toByteArray()), htmlpath + "/" + datestr + "/" +htmlname);
    }

    public static void writeFile(String content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(path);
            fos = new FileOutputStream(file);

            //    bw = new BufferedWriter(new OutputStreamWriter(fos,"GB2312"));
            bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
            bw.write(content);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
            }
        }
    }

}

Java Word 转换成 HTML 源码分享