一:下载所需要的库
1 :pdfminer 安装库命令 pip install pdfminer3k
pdfminer3k 是 pdfminer 的 Python 3 移植版。PDFMiner 是从 PDF 文档中提取信息的工具。与其他 PDF 相关工具不同,它完全专注于获取和分析文本数据。PDFMiner 可以获取页面中文本的确切位置,以及字体、线条等其他信息。它包含一个 PDF 转换器,可以将 PDF 文件转换为其他文本格式(如 HTML)。它还有一个可扩展的 PDF 解析器,可用于文本分析之外的其他用途。
2: docx 安装库命令 pip install python-docx
Python DocX 目前是 Python OpenXML 的一部分,你可以用它打开 Word 2007 及以后版本的文档,而用它保存的文档可以在 Microsoft Office 2007/2010、Microsoft Mac Office 2008、Google Docs、OpenOffice.org 3 和 Apple iWork '08 中打开。
# Script 1: download a PDF over HTTP, extract its text with pdfminer3k and
# write the text to a local file.
import warnings
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from docx import Document
from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import (
    PDFPageInterpreter,
    PDFResourceManager,
    PDFTextExtractionNotAllowed,
    process_pdf,
)
from pdfminer.pdfparser import PDFDocument, PDFParser

warnings.filterwarnings("ignore")

# Not used by this script; kept because the original listing creates it.
document = Document()


def readPDF(pdfFile):
    """Return the text content of a PDF as a single string.

    Args:
        pdfFile: Binary file-like object holding the PDF. It is handed to
            ``process_pdf`` unchanged and is not closed here.

    Returns:
        Everything the ``TextConverter`` wrote into its in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Release the converter even when parsing fails.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()


def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name``, replacing any existing file.

    The file is written as UTF-8 so that non-ASCII text extracted from a PDF
    does not depend on the platform's default encoding.

    Example:
        save_to_file('mobiles.txt', 'your contents str')

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)


def main():
    """Fetch the sample PDF, extract its text and save it to ``c.csv``."""
    url = "http://pythonscraping.com/pages/warandpeace/chapter1.pdf"
    # The context manager closes the HTTP response once the text is extracted.
    with urlopen(url) as pdfFile:
        outputString = readPDF(pdfFile)
    # The extracted text is written verbatim; the file name is the one used by
    # the original script.
    save_to_file('c.csv', outputString)


if __name__ == '__main__':
    main()
使用docx 保存为word
# Script 2: read a local PDF with pdfminer3k and save every text element as a
# bulleted paragraph in a Word document via python-docx.
import os
import warnings

from docx import Document
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import (
    PDFPageInterpreter,
    PDFResourceManager,
    PDFTextExtractionNotAllowed,
)
from pdfminer.pdfparser import PDFDocument, PDFParser

warnings.filterwarnings("ignore")

# Word document that main() fills and saves.
document = Document()

# Path of the PDF to convert. A plain path (rather than a descriptor from
# os.open(..., os.O_RDWR)) means nothing is opened at import time and the file
# only needs to be readable.
file_name = '/Users/dudu/Desktop/test1/a.pdf'


def main():
    """Convert the PDF at ``file_name`` into ``a.docx``.

    Each layout object that exposes ``get_text`` becomes one bulleted
    paragraph; non-breaking spaces in the text are replaced by normal spaces.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    with open(file_name, 'rb') as fn:
        # Link parser and document to each other, then initialise the
        # document (empty password) before reading pages.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only some layout objects carry text.
                if not hasattr(out, "get_text"):
                    continue
                # '\xa0' is the non-breaking space character.
                content = out.get_text().replace('\xa0', ' ')
                # Look the style up by name; lookup by style id ('ListBullet')
                # is deprecated in python-docx.
                document.add_paragraph(content, style='List Bullet')

    document.save('a' + '.docx')
    print('处理完成')


if __name__ == '__main__':
    main()
加下面的公众号,我会定期发一些资料。