• HWPFDocument读取doc,wps文档(含图片读取)


    使用HWPFDocument对象读取03版doc文件或wps文件

    导包:需要引入 Apache POI 的 poi 与 poi-scratchpad 两个依赖(HWPF 相关的类位于 poi-scratchpad 中)

     代码:

    1、图片工具类

     1 package com.poi.test;
     2 
     3 import java.util.ArrayList;
     4 import java.util.HashMap;
     5 import java.util.HashSet;
     6 import java.util.List;
     7 import java.util.Map;
     8 import java.util.Set;
     9 
    10 import org.apache.poi.hwpf.HWPFDocument;
    11 import org.apache.poi.hwpf.model.PicturesTable;
    12 import org.apache.poi.hwpf.usermodel.CharacterRun;
    13 import org.apache.poi.hwpf.usermodel.Picture;
    14 import org.apache.poi.hwpf.usermodel.Range;
    15 
    16 /**
    17  * Provides access to the pictures both by offset, iteration over the
    18  * un-claimed, and peeking forward
    19  */
    20 public class PicturesSource {//这个类是poi官网找的
    21     private PicturesTable picturesTable;
    22     private Set<Picture> output = new HashSet<Picture>();
    23     private Map<Integer, Picture> lookup;
    24     private List<Picture> nonU1based;
    25     private List<Picture> all;
    26     private int pn = 0;
    27 
    28     public PicturesSource(HWPFDocument doc) {
    29         picturesTable = doc.getPicturesTable();
    30         all = picturesTable.getAllPictures();
    31 
    32         // Build the Offset-Picture lookup map
    33         lookup = new HashMap<Integer, Picture>();
    34         for (Picture p : all) {
    35             lookup.put(p.getStartOffset(), p);
    36         }
    37 
    38         // Work out which Pictures aren't referenced by
    39         //  a u0001 in the main text
    40         // These are u0008 escher floating ones, ones
    41         //  found outside the normal text, and who
    42         //  knows what else...
    43         nonU1based = new ArrayList<Picture>();
    44         nonU1based.addAll(all);
    45         Range r = doc.getRange();
    46         for (int i = 0; i < r.numCharacterRuns(); i++) {
    47             CharacterRun cr = r.getCharacterRun(i);
    48             if (picturesTable.hasPicture(cr)) {
    49                 Picture p = getFor(cr);
    50                 int at = nonU1based.indexOf(p);
    51                 nonU1based.set(at, null);
    52             }
    53         }
    54     }
    55 
    56     private boolean hasPicture(CharacterRun cr) {
    57         return picturesTable.hasPicture(cr);
    58     }
    59 
    60     private void recordOutput(Picture picture) {
    61         output.add(picture);
    62     }
    63 
    64     private boolean hasOutput(Picture picture) {
    65         return output.contains(picture);
    66     }
    67 
    68     private int pictureNumber(Picture picture) {
    69         return all.indexOf(picture) + 1;
    70     }
    71 
    72     public Picture getFor(CharacterRun cr) {
    73         return lookup.get(cr.getPicOffset());
    74     }
    75 
    76     /**
    77      * Return the next unclaimed one, used towards the end
    78      */
    79     private Picture nextUnclaimed() {
    80         Picture p = null;
    81         while (pn < nonU1based.size()) {
    82             p = nonU1based.get(pn);
    83             pn++;
    84             if (p != null)
    85                 return p;
    86         }
    87         return null;
    88     }
    89 }

    2、处理图片和段落文字

     1 package com.poi.test;
     2 
     3 import java.io.ByteArrayOutputStream;
     4 import java.io.File;
     5 import java.io.FileInputStream;
     6 
     7 import org.apache.poi.hwpf.HWPFDocument;
     8 import org.apache.poi.hwpf.model.PicturesTable;
     9 import org.apache.poi.hwpf.usermodel.CharacterRun;
    10 import org.apache.poi.hwpf.usermodel.Paragraph;
    11 import org.apache.poi.hwpf.usermodel.Picture;
    12 import org.apache.poi.hwpf.usermodel.Range;
    13 
    14 public class PoiForWord {
    15     /**
    16      * 使用HWPFDocument解析word文档
    17      * wps按doc处理即可
    18      */
    19     public void parseDocByHWPFDocument(){
    20         try(FileInputStream is = new FileInputStream(new File("c:\a.wps"));HWPFDocument document = new HWPFDocument(is);){
    21             ByteArrayOutputStream baos = new ByteArrayOutputStream();//字节流,用来存储图片
    22             PicturesSource pictures = new PicturesSource(document);
    23             PicturesTable pictureTable = document.getPicturesTable();
    24             
    25             Range r = document.getRange();//区间
    26             for(int i=0;i<r.numParagraphs();i++){
    27                 Paragraph p = r.getParagraph(i);//段落
    28                 int fontSize = p.getCharacterRun(0).getFontSize();//字号,字号和是否加粗可用来当做标题或者某一关键标识的判断
    boolean isBold = p.getCharacterRun(0).isBold();//是否加粗
    29 String paragraphText = p.text();//段落文本 30 31 //以下代码解析图片,这样获取的图片是在文档流中的,是和文本按顺序解析的,可以很好的解决图片定位问题 32 for(int j=0;j<p.numCharacterRuns();j++){ 33 CharacterRun cr = p.getCharacterRun(j);//字符 34 if(pictureTable.hasPicture(cr)){ 35 Picture picture = pictures.getFor(cr); 36 //如果是在页面显示图片,可转换为base64编码的图片 37 picture.writeImageContent(baos);//将图片写入字节流 38 // String base64Image = "<img src='data:image/png;base64,"+new BASE64Encoder().encode(baos.toByteArray())+"'/>"; 39 } 40 } 41 } 42 }catch(Exception e){ 43 e.printStackTrace(); 44 } 45 } 46 47 }

    3、处理表格

     1 /**
     2      * 使用HWPFDocument解析word文档
     3      * wps按doc处理即可
     4      */
     5     @Test
     6     public void parseDocTableByHWPFDocument(){
     7         try(FileInputStream is = new FileInputStream(new File("d:\b.doc"));HWPFDocument document = new HWPFDocument(is);){
     8             Range r = document.getRange();//区间
     9             for(int i=0;i<r.numParagraphs();i++){
    10                 Paragraph p = r.getParagraph(i);//段落
    11                 String text = p.text();
    12                 
    13                 if(text.indexOf("序号")!=-1){//解析表格需要从表格第一个单元格获取表格,另一种表格的方式是直接获取所有表格,但是无法判断表格在文档中的位置
    14                     Table table = r.getTable(p);
    15                     
    16                     int numRows = table.numRows();//获取行数
    17                     
    18                     for(int j=0;j<numRows;j++){
    19                         TableRow row = table.getRow(j);
    20                         int numCells = row.numCells();//当前行列数
    21                         for(int k=0;k<numCells;k++){
    22                             TableCell cell = row.getCell(k);
    23                             System.out.print(cell.text()+" @ ");
    24                         }
    25                         System.out.println();
    26                     }
    27                 }
    28             }
    29         }catch(Exception e){
    30             e.printStackTrace();
    31         }
    32     }

     单元格文本(cell.text())末尾带有单元格结束符(\u0007),输出时通常显示为"?",可通过字符串替换或截取将其去掉

    另一种解析的方式,只支持解析文本内容,且无法获取字号和加粗等字体格式

    1 WordExtractor extor = new WordExtractor(is);
    2             String[] paragraphText = extor.getParagraphText();
  • 相关阅读:
    OLED的相关信息
    FilterDispatcher is depredated!plesae use the new filters
    lua c函数注册器
    一个简易版本的lua debugger实现
    【Unity Shaders】Using Textures for Effects——让sprite sheets动起来
    GDAL库中WFS服务中含有中文不能获取数据的问题
    golang:使用timingwheel进行大量ticker的优化
    扩展GDAL,支持CNSDTF格式(一)
    理解WebKit和Chromium: Android 4.4 上的Chromium WebView
    学习tornado:异步
  • 原文地址:https://www.cnblogs.com/-mystery/p/12874051.html
Copyright © 2020-2023  润新知