• pdfminer模块批量处理PDF文件


    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
    import os
     
     
    class PdfForString(object):
        """Batch-extract the text of every PDF in one directory with pdfminer.

        The pipeline is a chain of generators:
        ``for_string`` (paths) -> ``pdf_analysis`` (page iterables) ->
        ``iter_text`` (text of each horizontal text box) -> ``get_string``
        (prints each text).
        """

        def __init__(self, pdf_dir=None):
            """Collect the PDF names and build the shared pdfminer objects.

            Args:
                pdf_dir: Directory that holds the PDF files.  When omitted, the
                    ``PDF`` directory located two levels above this source file
                    is used.
            """
            if pdf_dir is None:
                # abspath() first: with a relative __file__ the double dirname()
                # collapses to '' and the result would point at '/PDF'.
                project_root = os.path.dirname(
                    os.path.dirname(os.path.abspath(__file__)))
                pdf_dir = os.path.join(project_root, 'PDF')
            # The same directory is used for listing and for building paths, so
            # the names found here are always opened where they were found.
            self.pdf_dir = pdf_dir
            self.pdf_list = self._list_pdf_names(pdf_dir)
            # Resource manager shared by the device and the interpreter.
            self.src = PDFResourceManager()
            # Device object that aggregates each processed page into a layout.
            self.device = PDFPageAggregator(self.src, laparams=LAParams())
            # Interpreter that feeds page content to the device.
            self.inter = PDFPageInterpreter(self.src, self.device)

        @staticmethod
        def _list_pdf_names(directory):
            """Return the sorted names of the regular ``*.pdf`` files in *directory*."""
            return sorted(
                name for name in os.listdir(directory)
                if name.lower().endswith('.pdf')
                and os.path.isfile(os.path.join(directory, name))
            )

        def for_string(self):
            """Yield the path of every PDF listed in ``self.pdf_list``."""
            for name in self.pdf_list:
                yield os.path.join(self.pdf_dir, name)

        def pdf_analysis(self):
            """Yield, for each PDF, the iterable returned by ``get_pages()``.

            The underlying file stays open only while this generator is
            suspended on the ``yield``; it is closed as soon as the consumer
            asks for the next document (or the generator is closed).  Consume
            the pages of one document before advancing to the next.
            """
            for path in self.for_string():
                with open(path, 'rb') as pdf_file:
                    parser = PDFParser(pdf_file)  # parser bound to the open file
                    document = PDFDocument()
                    # Parser and document must be linked to each other.
                    parser.set_document(document)
                    document.set_parser(parser)
                    # NOTE(review): pdfminer3k usage examples also call
                    # document.initialize() at this point -- confirm whether it
                    # is required for the PDFs processed here.
                    yield document.get_pages()

        def iter_text(self):
            """Yield the text of every ``LTTextBoxHorizontal`` on every page."""
            for pages in self.pdf_analysis():
                for page in pages:
                    self.inter.process_page(page)
                    layout = self.device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            yield str(element.get_text())

        def get_string(self):
            """Print the text of every horizontal text box of every PDF."""
            for text in self.iter_text():
                print(text)
     
     
    # Run the batch extraction only when executed as a script, so that
    # importing this module has no side effects.
    if __name__ == '__main__':
        PdfForString().get_string()

    --------转自屁桃

  • 相关阅读:
    还是模块
    模块
    Django之中间件和Auth模块
    Django之form表单组件、cookie与session
    ORM表查询之F查询和Q查询以及事务
    django之单表和多表查询
    django之模板层
    Django之路由
    Django之前戏
    前端之Bootstrap框架
  • 原文地址:https://www.cnblogs.com/lilei1996/p/11947702.html
Copyright © 2020-2023  润新知