• 使用 PDFBox 识别 PDF 中的表格并导出为 Excel


    1、继承 PageDrawer 和 PDFRenderer,在渲染页面时记录表格框线(填充路径)的坐标

    2、根据相邻坐标划定矩形区域,用 PDFTextStripperByArea 提取区域内的文字

    3、通过 EasyExcel 将提取到的文字写入 Excel 表格

    /**
     * Page drawer that records the position of every filled path it is asked to draw.
     *
     * <p>{@link #fillPath(int)} does not delegate to the superclass: it stores the
     * integer bounding-box origin of the current path in {@link #COORDINATE_LIST}
     * and then resets the path.
     */
    public class MyPageDrawer extends PageDrawer {

        /** Static, so shared by every instance; one entry is appended per {@code fillPath} call. */
        static final List<Coordinate> COORDINATE_LIST = new ArrayList<>();

        /** Height of the page bounding box, captured at construction time. */
        double pageHeight;

        /** The page most recently passed to {@link #processPage(PDPage)}. */
        PDPage pdPage;

        MyPageDrawer(PageDrawerParameters parameters) throws IOException {
            super(parameters);
            pageHeight = parameters.getPage().getBBox().getHeight();
        }

        /** Remembers the page being processed, then lets the superclass process it. */
        @Override
        public void processPage(PDPage aPage) throws IOException {
            pdPage = aPage;
            super.processPage(aPage);
        }

        /**
         * Records the current path's bounding-box origin and discards the path.
         *
         * @param windingRule ignored by this override
         */
        @Override
        public void fillPath(int windingRule) {
            Shape pathBounds = getLinePath().getBounds2D();
            int originX = pathBounds.getBounds().getLocation().x;
            // The stored y is the truncated page height minus the bounding box's y.
            int flippedY = (int) pageHeight - pathBounds.getBounds().getLocation().y;
            COORDINATE_LIST.add(new Coordinate(originX, flippedY));
            getLinePath().reset();
        }
    }

    /**
     * Renderer whose only customisation is to draw pages with {@link MyPageDrawer}.
     */
    public class MyPDFRenderer extends PDFRenderer {

        MyPDFRenderer(PDDocument document) {
            super(document);
        }

        /**
         * Supplies the drawer used for each rendered page.
         *
         * @param parameters drawer parameters handed in by the renderer
         * @return a new {@link MyPageDrawer} built from {@code parameters}
         * @throws IOException if the drawer cannot be constructed
         */
        @Override
        protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
            PageDrawer drawer = new MyPageDrawer(parameters);
            return drawer;
        }
    }



    public class App {
    public static void main(String[] args) throws Exception {
    String fileName = "E:\download\test\2020年12月北京工程造价信息.pdf"; //这里先手动把绝对路径的文件夹给补上。
    readPDF(fileName);
    }
    /**
    * 读PDF文件,使用了pdfbox开源项目
    * @param fileName
    */
    public static void readPDF(String fileName) {
    File file = new File(fileName);
    FileInputStream in = null;
    try {
    in = new FileInputStream(fileName);
    // 新建一个PDF解析器对象
    PDFParser parser = new PDFParser(new RandomAccessFile(file,"rw"));
    // 对PDF文件进行解析
    parser.parse();
    // 获取解析后得到的PDF文档对象
    PDDocument pdfdocument = parser.getPDDocument();
    System.out.println("NumberOfPages:"+ pdfdocument.getNumberOfPages());

    PDFRenderer renderer = new MyPDFRenderer(pdfdocument);

    int pageNum=12;
    BufferedImage image = renderer.renderImage(pageNum);
    ImageIO.write(image, "PNG", new File("test.png"));

    // System.out.println("SEG_LINETO_LIST...");
    // MyPageDrawer.SEG_LINETO_LIST.stream().forEach(System.out::println);

    String resultFileName = "simpleWrite" + System.currentTimeMillis() + ".xlsx";
    EasyExcel.write(resultFileName).sheet().doWrite(judgeCoordinate(MyPageDrawer.COORDINATE_LIST, pdfdocument, pageNum));
    } catch (Exception e) {
    System.out.println("读取PDF文件" + file.getAbsolutePath() + "生失败!" + e);
    e.printStackTrace();
    } finally {
    if (in != null) {
    try {
    in.close();
    } catch (IOException e1) {
    }
    }
    }
    }
    /**
    * 去重排序
    *
    * @param coordinateList
    * @param document
    * @return
    */
    private static List<List<String>> judgeCoordinate(List<Coordinate> coordinateList, PDDocument document,int pageNum) {
    //去除pdf边界
    coordinateList=coordinateList.stream().filter(coordinate -> !(coordinate.getX()<38||coordinate.getY()<70||coordinate.getY()>780||coordinate.getX()>558)).collect(Collectors.toList());
    // 去重 按y,x排序 从左上角开始计算
    coordinateList = coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX)).collect(Collectors.toList());
    System.out.println("去重,排序后,分组前...");
    coordinateList.stream().forEach(System.out::println);
    // 去除相近元素
    for(int a=0;a<coordinateList.size();a++){
    Coordinate coordinateStart = coordinateList.get(a);
    for (int j = a+1; j < coordinateList.size(); j++) {
    Coordinate coordinateC = coordinateList.get(j);
    if (Math.abs(coordinateStart.getY()-coordinateC.getY()) <=2) {
    if(Math.abs(coordinateC.getX()-coordinateStart.getX())<=2){
    coordinateList.remove(j);
    j--;
    }else {
    int y=coordinateStart.getY()>coordinateC.getY()?coordinateC.getY():coordinateStart.getY();
    coordinateC.setY(y);
    }
    }else {
    break;
    }
    }
    }
    //需要重新排序
    coordinateList=coordinateList.stream().sorted(Comparator.comparing(Coordinate::getY).thenComparing(Coordinate::getX))
    .collect(Collectors.toList());

    Map<Integer, List<Coordinate>> groupList = coordinateList.stream()
    .collect(Collectors.groupingBy(Coordinate::getY));
    Map<Integer, List<Coordinate>> result =new LinkedHashMap<>();
    groupList.entrySet().stream().sorted(Map.Entry.<Integer, List<Coordinate>>comparingByKey())
    .forEachOrdered(e -> result.put(e.getKey(), e.getValue()));
    System.out.println("总行数:"+result.size());

    List<List<Coordinate>> resultRow = result.values().stream()
    .collect(Collectors.toList());
    resultRow=resultRow.stream().filter(item-> (item.size()>1)).collect(Collectors.toList());
    System.out.println("去重,排序,分组后...");

    resultRow.stream().forEach(System.out::println);
    List<List<String>> mapList = new ArrayList<>();
    for (int k = 0; k < resultRow.size()-1; k++) {
    Map<String,String> map = new HashMap<>();
    List<String> listRow=new ArrayList<>();
    boolean nullData=false;
    for (int i = 0; i < resultRow.get(k).size()-1; i++) {
    Coordinate coordinateStart=resultRow.get(k).get(i);
    List<Coordinate> nextRow=resultRow.get(k+1);
    if(nextRow.size()>i+1){
    Coordinate coordinateEnd=nextRow.get(i+1);
    int width=coordinateEnd.getX() - coordinateStart.getX();
    int height=coordinateEnd.getY() - coordinateStart.getY();
    //左上角 为原始点 向右 加宽向下加高
    try {
    String info = readRectangleInfo(coordinateStart.getX(), coordinateStart.getY(),
    width,height, document,pageNum);
    info = info.replaceAll(" | ", "");
    map.put("column"+i,info);
    if(info==null||info.length()==0){
    nullData=true;
    }else {
    nullData=false;
    listRow.add(info);
    }
    }catch (Exception e){
    e.printStackTrace();
    }
    }
    }
    if(!nullData){
    mapList.add(listRow);
    }
    }
    Gson gson = new Gson();
    String mapListString = gson.toJson(mapList);
    System.out.println(mapListString);

    return mapList;
    }
    private static String readRectangleInfo(int x, int y, int width, int height, PDDocument document
    , int pageNum) throws Exception {
    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    stripper.setSortByPosition(true);
    Rectangle rect = new Rectangle(x, y, width, height);
    stripper.addRegion("rect", rect);
    PDPage firstPage = document.getPage(pageNum);
    stripper.extractRegions(firstPage);
    return stripper.getTextForRegion("rect");
    }
    }


    <!-- Maven dependencies for the PDF-to-Excel example above. -->
    <dependencies>

    <!-- PDFBox: provides PDDocument, PDFRenderer, PageDrawer and PDFTextStripperByArea used by the example. -->
    <dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.22</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
    <!-- NOTE(review): not referenced directly by the example code; presumably already pulled in transitively by pdfbox - confirm. -->
    <dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.22</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
    <!-- NOTE(review): not referenced by the example code and on a different version line (1.8.x) than pdfbox - confirm it is needed. -->
    <dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>jempbox</artifactId>
    <version>1.8.16</version>
    </dependency>
    <!-- Gson: used only to print the extracted rows as JSON. -->
    <dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.0</version>
    </dependency>

    <!-- EasyExcel: writes the extracted rows to the .xlsx file. -->
    <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>easyexcel</artifactId>
    <version>2.2.7</version>
    </dependency>

    </dependencies>
  • 相关阅读:
    Leetcode No.108 Convert Sorted Array to Binary Search Tree(c++实现)
    Leetcode No.88 Merge Sorted Array(c++实现)
    Leetcode No.66 Plus One(c++实现)
    pandas数据排序(series排序 & DataFrame排序)
    pandas的settingwithWaring报警
    pandas对缺失值的处理
    pandas的数据统计函数
    pandas 新增数据列(直接赋值、apply,assign、分条件赋值)
    pandas数据查询(数值、列表、区间、条件、函数)
    pandas数据读取(DataFrame & Series)
  • 原文地址:https://www.cnblogs.com/CaptainLin/p/14298026.html
Copyright © 2020-2023  润新知