• Python 分析 PDF 文件(基于 pdf2htmlEX.exe,Python 3.6)


    from html.parser import HTMLParser
    import json
    import re
    from openpyxl import Workbook
    from openpyxl.utils import get_column_letter
    from itertools import islice
    import subprocess
    import os
    import shutil




    def runApp(command, message=''):
        """Run *command* through the shell and return its captured output.

        *message* is encoded with the default codec and written to the child's
        stdin.  Every attempt is given 30 seconds; an attempt that raises any
        ``Exception`` (including a timeout) is retried, three attempts in total.

        NOTE(review): the command is executed with ``shell=True``, so callers
        must not build it from untrusted input.

        Args:
            command: Shell command line to execute.
            message: Text sent to the child's standard input.

        Returns:
            A ``(stdout, stderr)`` tuple of ``bytes``.

        Raises:
            Exception: ``"Error <command>"`` after the third failed attempt; the
                last underlying error is attached as ``__cause__``.
        """
        last_error = None
        for _attempt in range(3):
            proc = None
            try:
                proc = subprocess.Popen(command, shell=True,
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
                return proc.communicate(message.encode(), timeout=30)
            except Exception as exc:
                last_error = exc
                if proc is not None and proc.poll() is None:
                    # communicate() leaves the child running when it times out,
                    # so stop it and drain the pipes before the next attempt.
                    proc.kill()
                    try:
                        proc.communicate(timeout=5)
                    except (subprocess.TimeoutExpired, OSError, ValueError):
                        # Best effort only: a grandchild started by the shell
                        # may still hold the pipes open.
                        pass
        raise Exception("Error %s" % command) from last_error

    class MyHTMLParser(HTMLParser):
        """Collect form fields from the text nodes of a pdf2htmlEX HTML page.

        Text is ignored until an element with ``id="page-container"`` has been
        opened.  After that ``handle_data`` walks through these states
        (``processState``):

        * 0 - gather ``key: value`` pairs into ``fdata["meta"]`` until a text
          containing "经费项目" is seen (stored under "项目").
        * 1 - wait for a text whose first token contains "合计金额(小写):" and
          store its last token as ``fdata["item"]["total"]``.
        * 2 - wait for a text containing "结算信息".
        * 3 - store rows whose first token is "0".."99" in ``fdata["card"]``;
          a first token containing "预约报销日期" stores the date and finishes.
        * 4 - done, everything else is ignored.
        """

        # First tokens ("0".."99") that mark a numbered row while in state 3.
        _ROW_LABELS = frozenset(str(i) for i in range(100))

        # Suffixes that mark a token as a metadata key.
        # NOTE(review): both entries are the same ASCII colon here; one may
        # originally have been a full-width colon - confirm against real input.
        _KEY_SUFFIXES = (":", ":")

        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []
            self.handledtags = ['div']
            self.processState = 0
            self.fdata = {"meta": {}, "item": {}, "card": {}}
            self.stpdf = False      # True once the page container was opened
            self.x3 = False         # True after a start tag whose class has "x3"
            self.lastmeta = ""      # most recent metadata key seen in state 0
            self.itemNum = 0        # never incremented by this class
            self.cardNum = 0        # number of numbered rows stored in state 3

        def _attr(self, attrlist, attrname):
            """Return the value of *attrname* in *attrlist*, or "" if absent."""
            for name, value in attrlist:
                if name == attrname:
                    return value
            return ""

        def handle_starttag(self, tag, attrs):
            """Track the page container and elements carrying the "x3" class."""
            if self._attr(attrs, 'id') == "page-container":
                self.stpdf = True
            # An attribute written without a value is reported as None, which
            # would make the substring test raise TypeError.
            if "x3" in (self._attr(attrs, 'class') or ""):
                self.x3 = True

        def _handle_meta(self, data, tokens):
            """State 0: record ``key: value`` pairs and continuation text."""
            meta = self.fdata["meta"]
            if tokens and tokens[0].endswith(self._KEY_SUFFIXES):
                key = None
                for token in tokens:
                    if token.endswith(self._KEY_SUFFIXES):
                        key = token[:-1]
                        self.lastmeta = key
                    else:
                        # Several values after one key: the last one wins.
                        meta[key] = token
            elif "经费项目" in data:
                meta["项目"] = data[4:]
                self.processState = 1
            elif self.x3:
                self.x3 = False
                # Text in an "x3" element continues the previous value.
                if self.lastmeta and meta.get(self.lastmeta):
                    meta[self.lastmeta] += data

        def handle_data(self, data):
            """Feed one text node through the state machine described above.

            Whitespace-only text nodes are ignored in states 1 and 3 instead of
            raising IndexError.
            """
            if not self.stpdf:
                return
            tokens = data.split()

            if self.processState == 0:
                self._handle_meta(data, tokens)
                return

            if self.processState == 1:
                if tokens and "合计金额(小写):" in tokens[0]:
                    self.fdata["item"]["total"] = tokens[-1]
                    self.processState = 2

            # The checks below are deliberately not ``elif``: a text node that
            # advances the state is also examined by the following state.
            if self.processState == 2 and "结算信息" in data:
                self.processState = 3

            if self.processState == 3 and tokens:
                head = tokens[0]
                if head in self._ROW_LABELS:
                    self.fdata["card"][head] = tokens[1:]
                    self.cardNum += 1
                elif "预约报销日期" in head:
                    self.fdata["card"]["date"] = " ".join(tokens[1:])
                    self.processState = 4


    def _read_converted_html(html_path):
        """Return the HTML at *html_path* with each <span> element replaced by a space."""
        with open(html_path, encoding="UTF-8") as handle:
            return re.sub("<span.+?</span>", " ", handle.read())


    def _fill_sheet(ws, fdata, merge_rows):
        """Write the header row and the parsed values of one form into *ws*.

        Columns A-F hold one value each (row 2, merged down over *merge_rows*
        rows when that is non-zero); the "结算信息" header spans the next three
        columns and its rows are written below, at most four cells per row.

        Raises:
            KeyError: if a required field is missing from *fdata*.
        """
        scalar_columns = [
            ("编号", fdata["meta"]["报销单号"]),
            ("项目负责人", fdata["meta"]["项目负责人"]),
            ("项目", fdata["meta"]["项目"]),
            ("报销事由", fdata["meta"]["报销事由"]),
            ("费用合计", fdata["item"]["total"]),
            ("预约报销日期", fdata["card"]["date"]),
        ]
        for col, (label, value) in enumerate(scalar_columns, start=1):
            letter = get_column_letter(col)
            ws["%s1" % letter] = label
            if merge_rows:
                ws.merge_cells("%s2:%s%d" % (letter, letter, 1 + merge_rows))
            ws["%s2" % letter] = value

        first_card_col = len(scalar_columns) + 1
        ws.cell(row=1, column=first_card_col, value="结算信息")
        ws.merge_cells("%s1:%s1" % (get_column_letter(first_card_col),
                                    get_column_letter(first_card_col + 2)))

        row_labels = {str(i) for i in range(100)}
        card_rows = [cells for label, cells in fdata["card"].items()
                     if label in row_labels]
        # NOTE(review): rows start at 3 although the merged cells start at
        # row 2; kept as in the original script - confirm whether intended.
        for offset, cells in enumerate(card_rows):
            sheet_row = 3 + offset
            if not cells:
                ws.cell(row=sheet_row, column=first_card_col, value="null")
                continue
            # Slicing avoids an IndexError for rows with 1 or 3 tokens.
            for shift, text in enumerate(cells[:4]):
                ws.cell(row=sheet_row, column=first_card_col + shift, value=text)


    def _fit_column_widths(ws):
        """Size every column that has content to its longest value (capped at 42)."""
        longest = {}
        for row in ws.iter_rows():
            for col, cell in enumerate(row, start=1):
                if cell.value:
                    longest[col] = max(longest.get(col, 0), len(str(cell.value)))
        for col, length in longest.items():
            ws.column_dimensions[get_column_letter(col)].width = min(42, length * 1.7)


    def _convert_pdf(pdf_path):
        """Convert one PDF to HTML, parse it and save the result next to the PDF.

        Any exception (conversion, parsing, missing field, saving) propagates to
        the caller.
        """
        html_path = pdf_path[:-4] + ".html"
        # NOTE(review): openpyxl writes the OOXML (.xlsx) format; the ".xls"
        # suffix is kept so existing output names do not change.
        xlsx_path = pdf_path + ".xls"

        # pdf2htmlEX writes into the current directory unless --dest-dir is
        # given, while the HTML is read back from the PDF's own directory.
        dest_dir = os.path.dirname(pdf_path) or "."
        _, stderr_output = runApp('pdf2htmlEX --dest-dir "%s" "%s"' % (dest_dir, pdf_path))
        try:
            html_code = _read_converted_html(html_path)
        except Exception:
            # Show what the converter reported before giving up on this file.
            print(stderr_output)
            raise

        parser = MyHTMLParser()
        parser.feed(html_code)
        parser.close()

        wb = Workbook()
        ws = wb.active
        _fill_sheet(ws, parser.fdata, max(parser.cardNum, parser.itemNum))
        _fit_column_widths(ws)
        wb.save(xlsx_path)


    if __name__ == '__main__':
        addressPDF = "E:/totally/FinancePDF_travel/"
        badFileDir = "E:/totally/bad_file/"
        n = 0

        for fileNAME in os.listdir(addressPDF):
            if os.path.splitext(fileNAME)[1] != '.pdf':
                continue
            try:
                _convert_pdf(addressPDF + fileNAME)
            except Exception as exc:
                # One bad file must not stop the batch: report it and keep a
                # copy in the bad-file directory for manual inspection.
                n += 1
                print(str(n) + '.' + '无法解析' + fileNAME + '文件')
                print(repr(exc))
                os.makedirs(badFileDir, exist_ok=True)
                shutil.copyfile(addressPDF + fileNAME, badFileDir + fileNAME)
                print('已复制' + fileNAME + '文件')
     
  • 相关阅读:
    利用if else判断几点是什么时间段
    【UML】活动图介绍
    【UML】类图介绍
    jQuery Ajax跨域问题简易解决方案
    ASP.NET MVC @Html.Label的问题
    Mysql Show ProcessList命令
    【ASP.NET MVC 学习笔记】- 20 ASP.NET Web API
    【ASP.NET MVC 学习笔记】- 19 REST和RESTful Web API
    【ASP.NET MVC 学习笔记】- 18 Bundle(捆绑)
    【ASP.NET MVC 学习笔记】- 17 Model验证
  • 原文地址:https://www.cnblogs.com/setname/p/8417808.html
Copyright © 2020-2023  润新知