poi读取docx中的文字和图片(自己应用)
1 package com.fry.poiDemo.dao;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.io.FileOutputStream;
6 import java.io.IOException;
7 import java.io.PrintStream;
8 import java.util.List;
9
10 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
11 import org.apache.poi.xwpf.usermodel.XWPFDocument;
12 import org.apache.poi.xwpf.usermodel.XWPFPictureData;
13
14 public class Word {
15 // maven太好用了
16 // 读取srcFile源word文件docx文字
17 // 读取srcFile源word文件docx中的image图片并且存放在文件夹imageFile中
18 public String readDocxImage(String srcFile, String imageFile) {
19 String path = srcFile;
20 File file = new File(path);
21 try {
22 // 用XWPFWordExtractor来获取文字
23 FileInputStream fis = new FileInputStream(file);
24 XWPFDocument document = new XWPFDocument(fis);
25 XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(document);
26 String text = xwpfWordExtractor.getText();
27 // System.out.println(text);
28 //将获取到的文字存放到对应文件名中的txt文件中
29 String temp[]=srcFile.split("\/");
30 String temp1=temp[temp.length-1];
31 String temp3[]=temp1.split("\.");
32 String txtFileName="myRes//txt//"+temp3[0]+".txt";
33 PrintStream ps = new PrintStream(txtFileName);
34 ps.println(text);
35
36
37 // 用XWPFDocument的getAllPictures来获取所有的图片
38 List<XWPFPictureData> picList = document.getAllPictures();
39 for (XWPFPictureData pic : picList) {
40 // System.out.println(pic.getPictureType() + file.separator + pic.suggestFileExtension() + file.separator
41 // + pic.getFileName());
42 byte[] bytev = pic.getData();
43 // System.out.println(bytev.length);
44 // 大于1000bites的图片我们才弄下来,消除word中莫名的小图片的影响
45 if (bytev.length > 300) {
46 FileOutputStream fos = new FileOutputStream(imageFile + pic.getFileName());
47 fos.write(bytev);
48 }
49 }
50 fis.close();
51 return text;
52 } catch (IOException e) {
53 e.printStackTrace();
54 }
55 return null;
56 }
57 }