• 使用itextpdf提取pdf内容


      1 package test;
      2 
      3 import java.io.FileInputStream;
      4 import java.io.IOException;
      5 import java.util.ArrayList;
      6 import java.util.List;
      7 import java.util.regex.Pattern;
      8 
      9 import org.apache.pdfbox.cos.COSDocument;
     10 import org.apache.pdfbox.pdfparser.PDFParser;
     11 import org.apache.pdfbox.util.PDFTextStripper;
     12 
     13 import com.itextpdf.text.pdf.PdfReader;
     14 import com.itextpdf.text.pdf.parser.PdfTextExtractor;
     15 
     16 public class UploadUtils {
     17     
     18     private final static Pattern pattern = Pattern.compile("\d+");
     19     private final static int stateParaOverFlag = 800;
     20     private final static int thankParaOverFlag = 800;
     21     
     22     /**
     23      * 读取pdf参考文献内容
     24      * 
     25      * @param s
     26      * @return
     27      */
     28     public String readPdf(String filePath) {
     29         StringBuilder buffer = new StringBuilder();
     30         FileInputStream fis = null;
     31         PdfReader pdfReader = null;
     32         COSDocument cosDocument = null;
     33         String[] paragraphs = null;
     34         PDFParser p;
     35         boolean addBool = true;
     36         boolean judgeState = false;
     37         boolean judgeThank = false;
     38         StringBuilder tempSb = new StringBuilder();
     39         try {
     40             fis = new FileInputStream(filePath);
     41             p = new PDFParser(fis);
     42             p.parse();
     43             cosDocument = p.getDocument();
     44             // 加密文档判断
     45             if (cosDocument.isEncrypted()) {
     46                 StringBuilder tempContent = new StringBuilder();
     47                 pdfReader = new PdfReader(filePath);
     48                 int i = pdfReader.getNumberOfPages();
     49                 for (int j = 1; j <= i; j++) {
     50                     tempContent.append(PdfTextExtractor.getTextFromPage(pdfReader, j));
     51                 }
     52                 paragraphs = tempContent.toString().split("
    ");
     53             } else {
     54                 PDFTextStripper ts = new PDFTextStripper();
     55                 paragraphs = ts.getText(p.getPDDocument()).split("
    ");
     56             }
     57             boolean mark = false;
     58             List<Integer> errornum = new ArrayList<Integer>();
     59             int flag = 0;
     60             int endRange = paragraphs.length * 70 / 100;
     61             int rangeFlag = 0;
     62             for (String lineContent : paragraphs) {
     63                 if (judgeState) {
     64                     tempSb.append(lineContent);
     65                     if (tempSb.length() >= stateParaOverFlag) {
     66                         judgeState = false;
     67                         addBool = true;
     68                         tempSb.delete(0, tempSb.length() - 1);
     69                     }
     70                 }
     71                 if (judgeThank) {
     72                     tempSb.append(lineContent);
     73                     if (tempSb.length() >= thankParaOverFlag) {
     74                         judgeThank = false;
     75                         addBool = true;
     76                         tempSb.delete(0, tempSb.length() - 1);
     77                     }
     78                 }
     79                 if (addBool) {
     80                     buffer.append(lineContent);
     81                 }
     82                 if (mark && rangeFlag >= endRange) {
     83                     if (lineContent.length() < 5) {
     84                         errornum.add(++flag);
     85                         rangeFlag++;
     86                         continue;
     87                     }
     88                     if (pattern.matcher(lineContent.substring(0, 5)).find()) {
     89                         if (flag != 0) {
     90                             flag = 0;
     91                             errornum.clear();
     92                         }
     93                     } else {
     94                         errornum.add(++flag);
     95                     }
     96                     if (errornum.size() > 2) {
     97                         mark = false;
     98                     }
     99                 }
    100                 rangeFlag++;
    101             }
    102         } catch (Exception e) {
    103             e.printStackTrace();
    104         } finally {
    105             if (fis != null) {
    106                 try {
    107                     fis.close();
    108                 } catch (IOException e) {
    109                     e.printStackTrace();
    110                 } finally {
    111                     fis = null;
    112                 }
    113             }
    114             if (pdfReader != null) {
    115                 pdfReader.close();
    116             }
    117             if (cosDocument != null) {
    118                 try {
    119                     cosDocument.close();
    120                 } catch (IOException e) {
    121                     e.printStackTrace();
    122                 } finally {
    123                     cosDocument = null;
    124                 }
    125             }
    126         }
    127         return buffer.toString();
    128     }
    129     
    130     public static boolean isBlank(CharSequence cs) {
    131         int strLen;
    132         if (cs == null || (strLen = cs.length()) == 0) {
    133             return true;
    134         }
    135         for (int i = 0; i < strLen; i++) {
    136             if (Character.isWhitespace(cs.charAt(i)) == false) {
    137                 return false;
    138             }
    139         }
    140         return true;
    141     }
    142     
    143     public static void main(String[] args) {
    144         // System.err.println(new UploadUtils()
    145         // .readPdf("/opt/fileCache/2014/125/13/shuangping_D7037870CF4FC5C421A3E5359DCF8BBE.pdf"));
    146         System.err.println(new UploadUtils().readPdf("E:\MyWork\guyezhai\pdf提取\路径依赖视角下高校新专业建设的策略创新(1).pdf"));
    147         
    148     }
    149     
    150 }

    其中用到的jar包:

    bcpkix-jdk15on-1.47.jar
    bcprov-jdk15on-1.49.jar
    commons-logging-1.1.2.jar
    fontbox-1.8.2.jar
    icu4j-4.0.1.jar
    itextpdf-5.4.3.jar
    jempbox-1.8.2.jar
    pdfbox-1.8.2.jar
  • 相关阅读:
    C语言I博客作业03
    C语言I博客作业06
    C语言I博客作业01
    C语言I博客作业04
    C语言I博客作业05
    How kNN algorithm works(kNN算法原理讲解)
    《机器学习笔记》环境配置(Windows64位)
    git 命令使用
    关于推荐系统中的冷启动问题探讨(Approaching the Cold Start Problem in Recommender Systems)
    WebForms UnobtrusiveValidationMode 需要“jquery”ScriptResourceMapping。请添加一个名为 jquery (区分大小写)的 ScriptResourceMapping。
  • 原文地址:https://www.cnblogs.com/guyezhai/p/5091204.html
Copyright © 2020-2023  润新知