最近由于工作需要,调研了一下使用 Apache POI 获取 Word 文档字体信息的方法,在这里记录一下。
首先,Word 文档格式分为 doc 和 docx 两种,分别使用 HWPFDocument 和 XWPFDocument 对文档进行解析。话不多说,直接贴代码:
- 解析doc格式
1 import java.io.FileInputStream; 2 import java.io.IOException; 3 4 import org.apache.poi.hwpf.HWPFDocument; 5 import org.apache.poi.hwpf.usermodel.CharacterRun; 6 import org.apache.poi.hwpf.usermodel.Paragraph; 7 import org.apache.poi.hwpf.usermodel.Range; 8 import org.apache.poi.poifs.filesystem.POIFSFileSystem; 9 10 public class fontTest { 11 public static void main(String[] args) throws IOException { 12 String filePath = "***.doc"; 13 POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filePath)); 14 HWPFDocument document = new HWPFDocument(fs); 15 Range head = document.getHeaderStoryRange(); 16 for(int i = 0;i<head.numCharacterRuns();i++){ 17 System.out.println(head.getCharacterRun(i).text()); 18 } 19 System.out.println("==============================================="); 20 Range range = document.getRange(); 21 for (int i = 0; i < range.numParagraphs(); i++) { 22 Paragraph para = range.getParagraph(i);// 获取第i段 23 int count = 0; 24 while (true) { 25 CharacterRun run = para.getCharacterRun(count);// 此characterrun并非一个字符,而是一类字符,例如“数据挖掘”,前两个字为加粗,后两个字不加粗,那么“数据” 和“挖掘”会存在两个characterrun中 26 System.out.println("color ------" + run.getColor());// 字体颜色 27 System.out.println("font name---" + run.getFontName());// 字体类型 28 System.out.println("font size---" + run.getFontSize());// 字体大小 29 System.out.println("text -------" + run.text());// 文本信息 30 System.out.println("bold -------" + run.isBold());// 是否加粗 31 System.out.println("italic -----" + run.isItalic());// 是否斜体字 32 System.out.println("algnment ---" + para.getJustification());//对齐方式,0为左对齐,1为居中,2为右对齐,3为左右对齐 33 count++; 34 System.out.println("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"); 35 if (run.getEndOffset() == para.getEndOffset()) {//到达段落末尾 36 break; 37 } 38 } 39 System.out.println("--------------分段--------------"); 40 } 41 } 42 }
- 解析docx格式
1 package baidu; 2 3 import java.io.FileInputStream; 4 import java.io.InputStream; 5 import java.util.List; 6 7 import org.apache.poi.xwpf.usermodel.XWPFDocument; 8 import org.apache.poi.xwpf.usermodel.XWPFHeader; 9 import org.apache.poi.xwpf.usermodel.XWPFParagraph; 10 import org.apache.poi.xwpf.usermodel.XWPFRun; 11 12 public class fontdocx { 13 public static void main(String[] args) throws Exception { 14 String filePath = "***.docx"; 15 InputStream istream = new FileInputStream(filePath); 16 XWPFDocument docx = new XWPFDocument(istream); 17 // 获取页眉内容 18 List<XWPFHeader> list = docx.getHeaderList(); 19 for (XWPFHeader head : list) { 20 List<XWPFParagraph> paralist = head.getParagraphs(); 21 for (XWPFParagraph para : paralist) { 22 System.out.println(para.getText()); 23 } 24 // System.out.println(head.getText()); 25 } 26 List<XWPFParagraph> paraGraph = docx.getParagraphs(); 27 for (XWPFParagraph para : paraGraph) { 28 List<XWPFRun> run_array = para.getRuns(); 29 for (int i = 0; i < run_array.size(); i++) { 30 System.out.println("alignment --" 31 + para.getAlignment().toString());// 对齐方式 32 System.out 33 .println("text -------" + run_array.get(i).getText(0));// 文本内容 34 System.out.println("font name --" 35 + run_array.get(i).getFontFamily());// 字体类型 36 System.out 37 .println("color ------" + run_array.get(i).getColor());// 字体颜色 38 System.out.println("font size --" 39 + run_array.get(i).getFontSize());// 字体大小 40 } 41 } 42 } 43 }
就抽取内容的准确度而言,对于未经过格式转换的文档,抽取结果的准确度还是可以接受的。
在这里介绍的抽取内容比较少,欢迎大家补充!