/** * com.jiaoyiping.pdstest.TestTika.java * Copyright (c) 2009 Hewlett-Packard Development Company, L.P. * All rights reserved. */ package com.jiaoyiping.pdstest; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.mail.RFC822Parser; import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; /** * <pre> * Desc: * @author 焦一平 * @refactor 焦一平 * @date 2014年12月4日 下午1:31:09 * @version 1.0 * @see * REVISIONS: * Version Date Author Description * ------------------------------------------------------------------- * 1.0 2014年12月4日 焦一平 1. Created this class. * </pre> */ public class TestTika { //解析PDF @Test public void testPdf() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new PDFParser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\Linux Shell脚本攻略.pdf"))); OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时: "+used+"秒"); } //解析Word @Test public void testWrod() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new OfficeParser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\我的微盘\文档\参考文档\jBPM5_用户指南中文版.doc"))); OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时:"+used+"秒"); } //解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式) //使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试 @Test public void testEmail() throws Exception{ Long start = System.currentTimeMillis(); Parser parser = new RFC822Parser(); InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\Users\Administrator\Downloads\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml"))); OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\Users\Administrator\Desktop\result.txt"))); Metadata meta = new Metadata(); meta.add(Metadata.CONTENT_ENCODING, "utf-8"); ContentHandler iHandler = new BodyContentHandler(os); parser.parse(is, iHandler, meta, new ParseContext()); Long end = System.currentTimeMillis(); Long used = (end-start)/1000; System.out.println("耗时:"+used+"秒"); } }