一、简介
一般word文件后缀有doc、docx两种。docx是office word 2007以及以后版本文档的扩展名;doc是office word 2003文档保存的扩展名。对于这两种格式的word转换成html需要使用不同的方法。
对于docx格式的文档使用xdocreport进行转换。依赖如下:
<dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.xdocreport.document</artifactId> <version>1.0.5</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> <version>1.0.5</version> </dependency>
对于doc格式的文档使用poi进行转换。依赖如下:
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.12</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.12</version> </dependency>
二、示例
代码示例如下:
1 package com.test.word; 2 3 import java.io.File; 4 import java.io.FileInputStream; 5 import java.io.FileNotFoundException; 6 import java.io.FileOutputStream; 7 import java.io.IOException; 8 import java.io.InputStream; 9 import java.io.OutputStream; 10 11 import javax.xml.parsers.DocumentBuilderFactory; 12 import javax.xml.parsers.ParserConfigurationException; 13 import javax.xml.transform.OutputKeys; 14 import javax.xml.transform.Transformer; 15 import javax.xml.transform.TransformerException; 16 import javax.xml.transform.TransformerFactory; 17 import javax.xml.transform.dom.DOMSource; 18 import javax.xml.transform.stream.StreamResult; 19 20 import org.apache.poi.hwpf.HWPFDocument; 21 import org.apache.poi.hwpf.converter.PicturesManager; 22 import org.apache.poi.hwpf.converter.WordToHtmlConverter; 23 import org.apache.poi.hwpf.usermodel.PictureType; 24 import org.apache.poi.xwpf.converter.core.FileImageExtractor; 25 import org.apache.poi.xwpf.converter.core.FileURIResolver; 26 import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; 27 import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; 28 import org.apache.poi.xwpf.usermodel.XWPFDocument; 29 import org.junit.Test; 30 import org.w3c.dom.Document; 31 32 /** 33 * word 转换成html 34 */ 35 public class WordToHtml { 36 37 /** 38 * 2007版本word转换成html 39 * @throws IOException 40 */ 41 @Test 42 public void Word2007ToHtml() throws IOException { 43 String filepath = "C:/test/"; 44 String fileName = "滕王阁序2007.docx"; 45 String htmlName = "滕王阁序2007.html"; 46 final String file = filepath + fileName; 47 File f = new File(file); 48 if (!f.exists()) { 49 System.out.println("Sorry File does not Exists!"); 50 } else { 51 if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) { 52 53 // 1) 加载word文档生成 XWPFDocument对象 54 InputStream in = new FileInputStream(f); 55 XWPFDocument document = new XWPFDocument(in); 56 57 // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录) 58 File imageFolderFile = new File(filepath); 59 
XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile)); 60 options.setExtractor(new FileImageExtractor(imageFolderFile)); 61 options.setIgnoreStylesIfUnused(false); 62 options.setFragment(true); 63 64 // 3) 将 XWPFDocument转换成XHTML 65 OutputStream out = new FileOutputStream(new File(filepath + htmlName)); 66 XHTMLConverter.getInstance().convert(document, out, options); 67 68 //也可以使用字符数组流获取解析的内容 69 // ByteArrayOutputStream baos = new ByteArrayOutputStream(); 70 // XHTMLConverter.getInstance().convert(document, baos, options); 71 // String content = baos.toString(); 72 // System.out.println(content); 73 // baos.close(); 74 } else { 75 System.out.println("Enter only MS Office 2007+ files"); 76 } 77 } 78 } 79 80 /** 81 * /** 82 * 2003版本word转换成html 83 * @throws IOException 84 * @throws TransformerException 85 * @throws ParserConfigurationException 86 */ 87 @Test 88 public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException { 89 String filepath = "C:/test/"; 90 final String imagepath = "C:/test/image/"; 91 String fileName = "滕王阁序2003.doc"; 92 String htmlName = "滕王阁序2003.html"; 93 final String file = filepath + fileName; 94 InputStream input = new FileInputStream(new File(file)); 95 HWPFDocument wordDocument = new HWPFDocument(input); 96 WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); 97 //设置图片存放的位置 98 wordToHtmlConverter.setPicturesManager(new PicturesManager() { 99 public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { 100 File imgPath = new File(imagepath); 101 if(!imgPath.exists()){//图片目录不存在则创建 102 imgPath.mkdirs(); 103 } 104 File file = new File(imagepath + suggestedName); 105 try { 106 OutputStream os = new FileOutputStream(file); 107 os.write(content); 108 os.close(); 109 } catch (FileNotFoundException e) { 110 
e.printStackTrace(); 111 } catch (IOException e) { 112 e.printStackTrace(); 113 } 114 return imagepath + suggestedName; 115 } 116 }); 117 118 //解析word文档 119 wordToHtmlConverter.processDocument(wordDocument); 120 Document htmlDocument = wordToHtmlConverter.getDocument(); 121 122 File htmlFile = new File(filepath + htmlName); 123 OutputStream outStream = new FileOutputStream(htmlFile); 124 125 //也可以使用字符数组流获取解析的内容 126 // ByteArrayOutputStream baos = new ByteArrayOutputStream(); 127 // OutputStream outStream = new BufferedOutputStream(baos); 128 129 DOMSource domSource = new DOMSource(htmlDocument); 130 StreamResult streamResult = new StreamResult(outStream); 131 132 TransformerFactory factory = TransformerFactory.newInstance(); 133 Transformer serializer = factory.newTransformer(); 134 serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); 135 serializer.setOutputProperty(OutputKeys.INDENT, "yes"); 136 serializer.setOutputProperty(OutputKeys.METHOD, "html"); 137 138 serializer.transform(domSource, streamResult); 139 140 //也可以使用字符数组流获取解析的内容 141 // String content = baos.toString(); 142 // System.out.println(content); 143 // baos.close(); 144 outStream.close(); 145 } 146 }
运行生成文件结果如下: