• Python读取PDF文件内容


     1 import os
     2 from pdfminer.pdfparser import PDFParser
     3 from pdfminer.pdfdocument import PDFDocument
     4 from pdfminer.pdfpage import PDFPage
     5 from pdfminer.pdfpage import PDFTextExtractionNotAllowed
     6 from pdfminer.pdfinterp import PDFResourceManager
     7 from pdfminer.pdfinterp import PDFPageInterpreter
     8 from pdfminer.pdfdevice import PDFDevice
     9 from pdfminer.layout import *
    10 from pdfminer.converter import PDFPageAggregator
    11 
    12 
    13 import os
    14 import pdb
    15 
    16 #inputFile = r'D:\用户目录\桌面\340xxxxxxxxxxxxxxxxxx0.pdf'
    17 
    18 
    19 def decode_text(s):
    20     """
    21     Decodes a PDFDocEncoding string to Unicode.
    22     Adds py3 compatability to pdfminer's version.
    23     """
    24     if type(s) == bytes and s.startswith(b'xfexff'):
    25         return six.text_type(s[2:], 'utf-16be', 'ignore')
    26     else:
    27         ords = (ord(c) if type(c) == str else c for c in s)
    28         return ''.join(PDFDocEncoding[o] for o in ords)
    29 
    30 
    31 
    32 def get_msgs(inputFile):
    33     msgs = []
    34     fp = open(inputFile, 'rb')
    35     #来创建一个pdf文档分析器
    36     parser = PDFParser(fp)  
    37     #创建一个PDF文档对象存储文档结构
    38     document = PDFDocument(parser)
    39     # 检查文件是否允许文本提取
    40     if not document.is_extractable:
    41         raise PDFTextExtractionNotAllowed
    42     else:
    43         # 创建一个PDF资源管理器对象来存储共赏资源
    44         rsrcmgr=PDFResourceManager()
    45         # 设定参数进行分析
    46         laparams=LAParams()
    47         # 创建一个PDF设备对象
    48         # device=PDFDevice(rsrcmgr)
    49         device=PDFPageAggregator(rsrcmgr,laparams=laparams)
    50         # 创建一个PDF解释器对象
    51         interpreter=PDFPageInterpreter(rsrcmgr,device)
    52      
    53         # 处理每一页
    54         for page in PDFPage.create_pages(document):
    55      
    56             interpreter.process_page(page)
    57      
    58             # 接受该页面的LTPage对象
    59             layout=device.get_result()
    60      
    61             for x in layout:
    62                 
    63                 if(isinstance(x,LTTextBoxHorizontal)):
    64                     #print(x.get_text().strip())
    65                     
    66                     msgs.append(x.get_text().strip())
    67                     
    68         return msgs
    69                    
    70 
    71 
    72     #print(msgs[5][5:]+ '	' + msgs[4][4:])
    73     
    74 
    75 
    76 if __name__ == "__main__":
    77     names = os.listdir('.')
    78     for i in names:
    79         if os.path.splitext(i)[-1] == '.pdf':
    80             #print(i)
    81             msg = get_msgs(i)
    82             #print(msg)
    83             ms = msg[5][5:]+ '	' + msg[4][4:]
    84             with open('学生信息表.txt','a') as f:
    85                 f.write(ms+'
    ')
    86     
  • 相关阅读:
    技术检验
    Linux 系统命令总结
    ftp服务器的搭建
    Win10优秀软件推荐
    Mac软件推荐
    博客主题美化
    无人机开发之四:Pixhawk开发环境搭建
    无人机开发之三:飞行器入门理论知识
    无人机开发之二:Pixhawk硬件架构
    无人机开发之一:Pixhawk与Arduino简述
  • 原文地址:https://www.cnblogs.com/chillytao-suiyuan/p/11858433.html
Copyright © 2020-2023  润新知