1. 说明
将pdf中的文字读取处理还有一些限制:1. 文档的安全属性不能过于严格 2. 不能存在图片。
2. 直接贴相关的源码
有两种读取方式,maven对应的pom文件
<dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.8</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.0.6</version>
</dependency>
</dependencies>
2.1 pdfbox
/**
* PdfboxUtil.java
*/
package com.hsm.pdfTest;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* @author hsm
*/
public class PdfboxUtil {
private static String PDFPATH = "D:/Maven权威指南中文版.pdf";
private static String FILEPATH = "D:/Maven权威指南中文版.doc";
public static void main(String[] args) throws Exception {
String content=getPdfContent(PDFPATH);
toFile(content,FILEPATH);
}
/**
* 获取pdf的内容<br/>
* @param pdfPath
* @return
* @throws Exception
*/
private static String getPdfContent(String pdfPath) throws Exception {
boolean sort = false;// 是否排序
int startPage = 1;// 开始提取页数
int endPage = Integer.MAX_VALUE; // 结束提取页数
String content = null;//暂时存放pdf内容
InputStream input = null;
File pdfFile = new File(pdfPath);
PDDocument document = null;
try {
input = new FileInputStream(pdfFile);
// 加载 pdf 文档
PDFParser parser = new PDFParser(input);
parser.parse();
document = parser.getPDDocument();
// 获取内容信息
PDFTextStripper pts = new PDFTextStripper();
pts.setSortByPosition(sort);
endPage = document.getNumberOfPages();
System.out.println("Total Page: " + endPage);
pts.setStartPage(startPage);
pts.setEndPage(endPage);
try {
content = pts.getText(document);
}catch(Exception e) {
throw e;
}
System.out.println("Get PDF Content ...");
}catch(Exception e){
throw e;
} finally {
if (null != input)
input.close();
if (null != document)
document.close();
}
return content;
}
private static void toFile(String content,String filePath) {
try {
File f = new File(filePath);
if (!f.exists()) {
f.createNewFile();
}
System.out.println("Write PDF Content to txt file ...");
BufferedWriter output = new BufferedWriter(new FileWriter(f));
output.write(content);
output.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
2.2 itext
package com.hsm.pdfTest;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
/**
* @author hsm
*/
public class ItextpdfUtil {
private static String PDFPATH = "D:/Maven权威指南中文版.pdf";
private static String FILEPATH = "D:/Maven权威指南中文版.doc";
public static void main(String[] args) {
String content=getPdfContent(PDFPATH);
System.out.println(content);
toFile(PDFPATH,FILEPATH);
}
/**
* 获取pdf的内容
* @param pdfPath
* @return
*/
private static String getPdfContent(String pdfPath) {
PdfReader reader = null;
StringBuffer buff = new StringBuffer();
try {
reader = new PdfReader(pdfPath);
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
int num = reader.getNumberOfPages();// 获得页数
TextExtractionStrategy strategy;
for (int i = 1; i <= num; i++) {
strategy = parser.processContent(i,
new SimpleTextExtractionStrategy());
buff.append(strategy.getResultantText());
}
} catch (IOException e) {
e.printStackTrace();
}
return buff.toString();
}
/**
* 将对应的pdf文件读到指定的文件中
* @param pdfPath
* @param filePath
*/
private static void toFile(String pdfPath, String filePath) {
PrintWriter writer = null;
PdfReader reader = null;
try {
writer = new PrintWriter(new FileOutputStream(filePath));
reader = new PdfReader(pdfPath);
int num = reader.getNumberOfPages();// 获得页数
System.out.println("Total Page: " + num);
StringBuffer content = new StringBuffer(""); // 存放读取出的文档内容
for (int i = 1; i <= num; i++) {
// 读取第i页的文档内容
content.append(PdfTextExtractor.getTextFromPage(reader, i));
}
writer.write(content.toString());// 写入文件内容
writer.flush();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}