分享一个电子发票信息提取工具(Python)

电子发票太多，想统计一下总额非常困难，网上的工具又不好用，于是花了 2 个小时自己实现了一个。测试过中石油、京东开具的电子发票，效果还行；部分发票的名称提取会失败，但不影响金额统计。有需要的小伙伴自己拿去改吧。

import cmd
import sys
import json
import pdfplumber
import os
from pprint import pprint


class FapiaoShell(cmd.Cmd):
    """Interactive shell that extracts fields from electronic VAT invoice PDFs.

    The single command, ``load <directory>``, walks the directory for ``*.pdf``
    files, parses the first page of each with pdfplumber, and writes the
    per-file fields plus the grand total to ``结果.json`` in the directory's
    parent folder.

    Extraction is best-effort: a field that cannot be located is simply
    omitted from the result instead of aborting the whole batch.
    """

    intro = '欢迎使用发票提取工具，输入?(help)获取帮助消息和命令列表，CTRL+C退出程序。\n'
    prompt = '\n输入命令: '
    doc_header = "详细文档 (输入 help <命令>):"
    misc_header = "友情提示:"
    undoc_header = "没有帮助文档:"
    nohelp = "*** 没有命令(%s)的帮助信息 "

    def __init__(self):
        super().__init__()

    def do_load(self, arg):
        # cmd.Cmd prints this docstring as the `help load` text, so it is
        # user-facing and stays in the UI language. It is a raw string because
        # the example path ends in a backslash (an invalid escape otherwise).
        r""" 加载发票 例如：load D:\ """
        if not os.path.isdir(arg):
            print('参数必须是目录!')
            return

        # Resolve to an absolute path BEFORE changing directory: a relative
        # `arg` would otherwise be looked up against the new cwd by os.walk,
        # and os.path.dirname() of a bare name is '' which os.chdir rejects.
        target = os.path.abspath(arg)
        os.chdir(os.path.dirname(target))

        pdfs = []
        for root, _, files in os.walk(target):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                # Paths are reported relative to the new cwd (target's parent).
                fpth = os.path.relpath(os.path.join(root, fn))
                print(f'发现pdf文件: {fpth}')
                pdfs.append(fpth)

        pdf_ctxs = self._parse_pdfs(pdfs)

        amount_sum = 0
        for fpth, info in pdf_ctxs:
            amount = self._to_amount(info.get('总计'))
            if amount is None:
                # Non-invoice PDF or unrecognised layout: keep it in the
                # output but leave it out of the sum.
                print(f'跳过(未识别到总计): {fpth}')
                continue
            amount_sum += amount

        total = {
            '内容': pdf_ctxs,
            '发票数': len(pdf_ctxs),
            # Round away binary floating-point noise (e.g. 0.1 + 0.2).
            '总计': round(amount_sum, 2),
        }

        print('\n保存到 结果.json...')

        with open("结果.json", 'w', encoding='utf-8') as json_file:
            json.dump(total,
                      json_file,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4,
                      separators=(', ', ': '))

        print('完成!')

    @staticmethod
    def _to_amount(text):
        """Convert an amount string such as ``'1,234.50'`` to float.

        Returns None when `text` is empty or is not a number.
        """
        if not text:
            return None
        cleaned = text.replace(',', '').replace('¥', '').replace('￥', '')
        try:
            return float(cleaned.strip())
        except ValueError:
            return None

    def _parse_pdfs(self, pdfs):
        """Parse every PDF path in `pdfs`.

        Returns a list of ``(path, info)`` tuples with exactly one entry per
        input path. `info` is an empty dict when the file is not an
        electronic VAT invoice or could not be read.
        """
        result = []
        for fpth in pdfs:
            info = {}
            try:
                with pdfplumber.open(fpth) as pdf:
                    if pdf.pages:
                        info = self._parse_page(pdf.pages[0])
            except Exception as exc:
                # Batch boundary: one unreadable file must not abort the
                # whole run. The failure is reported, not swallowed.
                print(f'解析失败: {fpth} ({exc})')
                info = {}
            result.append((fpth, info))
        return result

    def _parse_page(self, page):
        """Extract invoice fields from one pdfplumber page.

        Returns an empty dict if the page text lacks the invoice title.
        """
        text = page.extract_text() or ''
        if '增值税电子普通发票' not in text:
            return {}

        info = {}
        info.update(self._extrace_from_words(page.extract_words()))
        tables = page.extract_tables()
        if tables:
            info.update(self._extrace_from_table(tables[0]))
        return info

    @staticmethod
    def _pack_words(words):
        """Group word texts into vertical bands ("packs").

        Words are first bucketed by the integer midpoint of their ``top`` and
        ``bottom`` coordinates. Buckets whose midpoint lies within 10 units of
        the bucket that opened the current pack are merged into that pack.
        """
        lines = {}
        for word in words:
            pos = (int(word['top']) + int(word['bottom'])) // 2
            lines.setdefault(pos, []).append(word['text'])

        packs = []
        pack_start = None
        for pos in sorted(lines):
            if packs and pos - pack_start <= 10:
                packs[-1].extend(lines[pos])
                continue
            packs.append(list(lines[pos]))
            pack_start = pos
        return packs

    @staticmethod
    def _day_before_marker(token):
        """Return the (at most two) digits right before '日' in `token`.

        Returns None when there is no '日' or no digit precedes it. This
        rejects the '开票日期:' label, which itself contains '日'.
        """
        head, marker, _ = token.partition('日')
        if not marker:
            return None
        digits = ''
        for ch in reversed(head):
            if not ch.isdigit() or len(digits) == 2:
                break
            digits = ch + digits
        return digits or None

    def _extrace_from_words(self, words):
        """Extract header/footer fields from pdfplumber word dicts.

        `words` is an iterable of dicts with ``text``, ``top`` and ``bottom``
        keys. Returns a dict holding whichever of 标题, 发票代码, 发票号码,
        开票日期, 机器编号, 校验码, 收款人 and 开票人 were found.
        """
        info = {}

        for pack in self._pack_words(words):
            digit_tokens = [tok for tok in pack if tok.isdigit()]

            for idx, token in enumerate(pack):
                if '电子普通发票' in token:
                    info['标题'] = token

                elif '发票代码:' in token:
                    info['发票代码'] = token.split(':')[1]

                elif '发票号码:' in token:
                    info['发票号码'] = token.split(':')[1]

                elif '开票日期:' in token:
                    year = token.split(':')[1]
                    # A month has at most two digits; prefer such tokens so a
                    # long number in the same band is not mistaken for it.
                    short = [tok for tok in digit_tokens if len(tok) <= 2]
                    months = short or digit_tokens
                    days = [day for day in map(self._day_before_marker, pack)
                            if day is not None]
                    if months and days:
                        info['开票日期'] = f'{year}-{months[0]}-{days[0]}'

                elif '机器编号:' in token:
                    long_digits = [tok for tok in digit_tokens
                                   if len(tok) > 10]
                    if long_digits:
                        info['机器编号'] = long_digits[0]

                elif '码:' in token:
                    # Check code: the value after the colon plus up to three
                    # following tokens (fewer if the band is truncated).
                    parts = [token.split(':')[1]] + pack[idx + 1:idx + 4]
                    info['校验码'] = ' '.join(parts)

                elif '收款人:' in token:
                    info['收款人'] = token.split(':')[1]

                elif '开票人:' in token:
                    info['开票人'] = token.split(':')[1]

        return info

    @staticmethod
    def _party_fields(cells, prefix, collect_password=False):
        """Read name and tax number of a buyer/seller row.

        `prefix` is prepended to the result keys. When `collect_password` is
        true, every 27-character line is also gathered under '密码'.
        """
        found = {}
        for cell in cells:
            if not cell:
                continue
            for line in cell.splitlines():
                if '名        称:' in line:
                    found[prefix + '名称'] = line.split(':')[1]
                elif len(line) == 18 and line.isalnum():
                    found[prefix + '税号'] = line
                elif collect_password and len(line) == 27:
                    found.setdefault('密码', []).append(line)
        return found

    def _extrace_from_table(self, table):
        """Extract fields from the main invoice table.

        `table` is a list of rows (lists of cell strings or None) and is
        expected to have four rows: buyer, item details, total, seller.
        Returns a dict of the fields found; an empty dict when the table does
        not have that shape, so callers can always ``update()`` with it.
        """
        info = {}
        if not table or len(table) != 4:
            return info

        buyer_row, detail_row, total_row, seller_row = table

        # Buyer
        info.update(self._party_fields(buyer_row, '购买方',
                                       collect_password=True))

        # Item details: the first matching header line decides the cell type.
        for cell in detail_row:
            if not cell:
                continue

            lines = cell.splitlines()
            for line in lines:
                if '货物或应税劳务、服务名称' in line:
                    info['商品'] = lines[1:-1]
                    break

                if '金  额' in line:
                    # Last line is the column sum; drop the currency sign.
                    info['总金额'] = lines[-1][1:]
                    break

                if '税  额' in line:
                    info['总税额'] = lines[-1][1:]
                    break

        # Grand total: take what follows the currency sign, wherever the sign
        # sits in the line (it may be preceded by the amount in words).
        for cell in total_row:
            if not cell:
                continue

            for line in cell.splitlines():
                for sign in ('¥', '￥'):
                    if sign in line:
                        info['总计'] = line[line.index(sign) + 1:]
                        break

        # Seller
        info.update(self._party_fields(seller_row, '销售方'))

        return info


if __name__ == '__main__':
    # Script entry point: run the interactive shell until the user presses
    # CTRL+C, then print a farewell instead of a traceback.
    try:
        shell = FapiaoShell()
        shell.cmdloop()
    except KeyboardInterrupt:
        print('\n\n再见！')
相关阅读:
whatweb tree
testUrl
ParseUrl
whatweb wordpress.rb
LeetCode: Binary Tree Level Order Traversal 解题报告
 LeetCode: Minimum Path Sum 解题报告
 Lintcode: Sort Colors II 解题报告
 LeetCode: Validate Binary Search Tree 解题报告
 LeetCode: Longest Common Prefix 解题报告
 LeetCode: Maximum Subarray 解题报告
原文地址：https://www.cnblogs.com/wuyaSama/p/10768002.html