• pdf转图片,提取文字,提取图片


    1、使用pdfbox

            <dependency>
                <groupId>org.apache.pdfbox</groupId>
                <artifactId>pdfbox</artifactId>
                <version>2.0.21</version>
            </dependency>

    2、code

    import org.apache.pdfbox.cos.COSName;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.pdmodel.PDResources;
    import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
    import org.apache.pdfbox.rendering.PDFRenderer;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.IOException;
    
    /**
     * @Author: xu.dm
     * @Date: 2020/10/27 17:06
     * @Version: 1.0
     * @Description: pdf转图片,提取文字,提取图片
     **/
    public class PdfUtil {
        /**
         * 转换全部的pdf
         *
         * @param filename PDF文件全路径
         * @param type     图片类型
         */
        public static void pdf2png(String filename, String type) {
            // 将pdf装图片 并且自定义图片得格式大小
            File file = new File(filename);
            String parentPath = file.getParent();
            String name = file.getName();
            try {
                PDDocument doc = PDDocument.load(file);
                PDFRenderer renderer = new PDFRenderer(doc);
                int pageCount = doc.getNumberOfPages();
                for (int i = 0; i < pageCount; i++) {
                    BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI
                    ImageIO.write(image, type, new File(parentPath + "/" + name + "_" + (i + 1) + "." + type));
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
    
        /**
         * 自由确定起始页和终止页
         *
         * @param filename     PDF文件全路径
         * @param indexOfStart 开始页  开始转换的页码,从0开始
         * @param indexOfEnd   结束页  停止转换的页码,-1为全部
         * @param type         图片类型
         */
        public static void pdf2png(String filename, int indexOfStart, int indexOfEnd, String type) {
            // 将pdf装图片 并且自定义图片得格式大小
            File file = new File(filename);
            String parentPath = file.getParent();
            String name = file.getName();
    
            try {
                PDDocument doc = PDDocument.load(file);
                PDFRenderer renderer = new PDFRenderer(doc);
                int pageCount = doc.getNumberOfPages();
                for (int i = indexOfStart; i < indexOfEnd; i++) {
                    BufferedImage image = renderer.renderImageWithDPI(i, 144); // Windows native DPI
                    ImageIO.write(image, type, new File(parentPath + "\" + name + "_" + (i + 1) + "." + type));
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
        /**
         * 提取pdf中的文字,例如:用word转存pdf,那么里面文字就可以提取,如果是图片转存pdf不能提取
         * @param filename pdf全路径
         */
        public static String extractText(String filename) {
            File file = new File(filename);
            try {
                PDDocument doc = PDDocument.load(file);
                int pages = doc.getNumberOfPages();
                // 读文本内容
                PDFTextStripper stripper = new PDFTextStripper();
                // 设置按顺序输出
                stripper.setSortByPosition(true);
                stripper.setStartPage(1);
                stripper.setEndPage(pages);
                return stripper.getText(doc);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }
    
        /**
         * 从PDF中提取图片
         * @param filename pdf全路径
         * @param type 图片类型,后缀
         */
        public static void extractImage(String filename, String type) {
            File file = new File(filename);
            String parentPath = file.getParent();
            String name = file.getName();
            try {
                PDDocument doc = PDDocument.load(file);
                int pages = doc.getNumberOfPages();
                int j = 0;
                for (int i = 0; i < pages; i++) {
                    PDPage page = doc.getPage(i);
                    PDResources resources = page.getResources();
                    Iterable<COSName> xObjectNames = resources.getXObjectNames();
                    if (xObjectNames == null) continue;
                    for (COSName cosName : xObjectNames) {
                        if (resources.isImageXObject(cosName)) {
                            PDImageXObject image = (PDImageXObject) resources.getXObject(cosName);
                            BufferedImage bufferedImage = image.getImage();
                            ImageIO.write(bufferedImage, type, new File(parentPath + "\" + name + "_" + (++j) + "." + type));
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
  • 相关阅读:
    Hadoop基础(五十四):基于centos搭建Hadoop3.x完全分布式运行模式
    FLINK基础(111): DS算子与窗口(22)窗口 (8) 自定义窗口(3)清理器(EVICTORS)
    FLINK基础(110): DS算子与窗口(21)窗口 (6) 自定义窗口(2)触发器(Triggers)
    FLINK基础(109): DS算子与窗口(20)窗口 (5) 自定义窗口(1) 窗口分配器(window assigners)
    neutron-dhcp-agent
    Firecracker 线程
    kata-containers Compile And Installed
    katacontainer debug
    katka-container搭建
    git切换分支
  • 原文地址:https://www.cnblogs.com/asker009/p/13890171.html
Copyright © 2020-2023  润新知