• Java使用PDFBox操作PDF文件获取总页数、文章内容、缩略图


    
    一、依赖
    
    <!-- PDFBox is what this article uses to count pages and render thumbnails. -->
    <!-- NOTE(review): none of the Java imports in this article reference com.sleepycat; -->
    <!-- confirm whether the "je" dependency below is actually required. -->
    <!-- https://mvnrepository.com/artifact/com.sleepycat/je -->
    <dependency>
        <groupId>com.sleepycat</groupId>
        <artifactId>je</artifactId>
        <version>5.0.73</version>
    </dependency>
    
    <!-- Apache PDFBox: PDF loading, text extraction and page rendering. -->
    <!-- NOTE(review): the Java code also uses lombok's @Slf4j, which is not declared here; -->
    <!-- presumably Lombok and an SLF4J binding come from elsewhere in the project. -->
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.8</version>
    </dependency>

    二、实现代码

    
    
    import lombok.extern.slf4j.Slf4j;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.rendering.ImageType;
    import org.apache.pdfbox.rendering.PDFRenderer;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import javax.imageio.IIOImage;
    import javax.imageio.ImageIO;
    import javax.imageio.ImageWriter;
    import javax.imageio.stream.ImageOutputStream;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.Iterator;
    @Slf4j
    public class PdfUtil {
    
    
        /**
         * 通过PDFbox获取文章总页数
         *
         * @param filePath:文件路径
         * @return
         * @throws IOException
         */
        public static int getNumberOfPages(String filePath) throws IOException, InterruptedException {
                    File file = new File(filePath);
                    PDDocument pdDocument = PDDocument.load(new File(filePath));
                    int pages = pdDocument.getNumberOfPages();
                    pdDocument.close();
                    return pages;
            }
        }
    
        /**
         * 通过PDFbox获取文章内容
         *
         * @param filePath
         * @return
         */
        public static String getContent(String filePath) throws IOException {
            PDFParser pdfParser = new PDFParser(new org.apache.pdfbox.io.RandomAccessFile(new File(filePath), "rw"));
            pdfParser.parse();
            PDDocument pdDocument = pdfParser.getPDDocument();
            String text = new PDFTextStripper().getText(pdDocument);
            pdDocument.close();
    
            return text;
        }
    
        /**
         * 通过PDFbox生成文件的缩略图
         *
         * @param filePath:文件路径
         * @param outPath:输出图片路径
         * @throws IOException
         */
        public static void getThumbnails(String filePath, String outPath) throws IOException {
            // 利用PdfBox生成图像
            PDDocument pdDocument = PDDocument.load(new File(filePath));
            PDFRenderer renderer = new PDFRenderer(pdDocument);
    
            // 构造图片
            BufferedImage img_temp = renderer.renderImageWithDPI(0, 30, ImageType.RGB);
            // 设置图片格式
            Iterator<ImageWriter> it = ImageIO.getImageWritersBySuffix("png");
            // 将文件写出
            ImageWriter writer = (ImageWriter) it.next();
            ImageOutputStream imageout = ImageIO.createImageOutputStream(new FileOutputStream(outPath));
            writer.setOutput(imageout);
            writer.write(new IIOImage(img_temp, null, null));
            img_temp.flush();
            imageout.flush();
            imageout.close();
            //Warning: You did not close a PDF Document
            pdDocument.close();
        }
    }

    三、测试类--Main

    
    
    import java.io.IOException;
    /**
     * @author Mr.lu
     * @Title: Main
     * @ProjectName DocCloud
     * @Description: TODO
     * @date 2018/11/6:22:17
     */
    public class Main {
        public static void main(String[] args) throws IOException, InterruptedException {
            int numberOfPages = getNumberOfPages("D:\Desktop\DocCloud\testDir\hadoopClientCode.pdf");
            System.out.println(numberOfPages);
            String content = getContent("");
            System.out.println(content);
           getThumbnails("D:\Desktop\DocCloud\testDir\hadoopClientCoed.pdf",
                    "D:\Desktop\DocCloud\testDir\hadoopClientCoed.pdf.png");
        }
    }
    

    1>首先测试获取PDF文件的总页数,在控制台可以看到输出的页数

    2>测试获取PDF文件的内容,在控制台可以看到--你自己PDF文件中的内容

    3>测试生成PDF缩略图

    缩略图的大小,可以通过修改代码中 renderImageWithDPI 的 DPI 参数(示例中为 30)来调整

  • 相关阅读:
    《UNIX环境高级编程》笔记--UNIX标准化及实现
    SPOJ1811最长公共子串问题(后缀自动机)
    一个leetcode解题报告类目,代码很简洁
    字符压缩题目
    求最佳会议地点
    实现树的横向指针
    lower_bound与upper_bound
    求到所有房子距离和最小的新房子
    增加限制条件的矩阵求和
    切分数组来得到一定的和
  • 原文地址:https://www.cnblogs.com/pigdata/p/10305576.html
Copyright © 2020-2023  润新知