"""Print the text of every PDF found in a folder, using the legacy pdfminer API."""
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
import os


class PdfForString(object):
    """Walk a folder of PDFs and print the text of each horizontal text box.

    The three generator/consumer stages are chained:
    ``for_string`` (paths) -> ``pdf_analysis`` (page iterators) ->
    ``get_string`` (layout analysis and printing).
    """

    def __init__(self, pdf_dir=None):
        """List the PDF folder and build the shared pdfminer objects.

        Args:
            pdf_dir: Optional folder holding the PDFs. When given, it is used
                both to list the file names and to build their full paths.
                When omitted, the legacy behaviour is kept: names are listed
                from ``r'E:StockExchangePDF'`` and paths are resolved against
                ``<parent of this script's folder>/PDF``.

        Raises:
            OSError: If the folder to list does not exist or is unreadable.
        """
        self.pdf_dir = pdf_dir
        # NOTE(review): the legacy defaults list one folder but open files
        # from another; they only agree if both point at the same place.
        # The literal also looks like 'E:\StockExchange\PDF' with its
        # backslashes lost - confirm, or pass ``pdf_dir`` explicitly.
        self.pdf_list = os.listdir(
            r'E:StockExchangePDF' if pdf_dir is None else pdf_dir
        )
        # Resource manager shared by the device and the interpreter.
        self.src = PDFResourceManager()
        # Device that aggregates each processed page into a layout object.
        self.device = PDFPageAggregator(self.src, laparams=LAParams())
        # Interpreter that renders pages onto the device.
        self.inter = PDFPageInterpreter(self.src, self.device)

    def for_string(self):
        """Yield the full path of every entry in ``self.pdf_list``."""
        base_dir = self.pdf_dir
        if base_dir is None:
            # Legacy location: a 'PDF' folder next to this script's parent.
            base_dir = os.path.dirname(os.path.dirname(__file__)) + '/PDF'
        for pdf in self.pdf_list:
            yield os.path.join(base_dir, pdf)

    def pdf_analysis(self):
        """Yield, for each PDF path, an iterator over that document's pages.

        The underlying file stays open only while the consumer is working on
        the yielded pages; it is closed as soon as the next document is
        requested (or the generator is closed). Pages must therefore be
        consumed before advancing to the next item.
        """
        for path in self.for_string():
            with open(path, 'rb') as pd_file:
                parser = PDFParser(pd_file)
                document = PDFDocument()
                # Parser and document must be linked in both directions.
                parser.set_document(document)
                document.set_parser(parser)
                # The legacy pdfminer workflow requires initialize() (empty
                # password by default) before pages are read; without it
                # encrypted documents are never deciphered.
                document.initialize()
                yield document.get_pages()

    def get_string(self):
        """Print the text of every ``LTTextBoxHorizontal`` on every page."""
        for pages in self.pdf_analysis():
            for page in pages:
                self.inter.process_page(page)
                layout = self.device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        print(element.get_text())


# Runs the extraction as soon as the module is executed or imported.
PdfForString().get_string()
# -------- Reposted from 屁桃