• camelot工具进行pdf表格解析重建


    camelot 内置了生成 HTML 文件的方法,但表格数据在转换成 pandas.DataFrame 的过程中会丢失跨行、跨列的结构信息,因此生成的 HTML 表格没有跨行跨列结构。

    于是我在输出部分选择直接手写 HTML 表格:根据每个单元格缺失的右边框/下边框推断 colspan 和 rowspan。

    import html
    import os

    import camelot
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    
    # def listdir(path, list_name):  # pass in the list that collects the file paths
    #     for file in os.listdir(path):
    #         file_path = os.path.join(path, file)
    #         if os.path.isdir(file_path):
    #             listdir(file_path, list_name)
    #         else:
    #             list_name.append(file_path)
    # Batch mode (multiple files)
    # filenames=[r'E:pdf_download']
    # listdir('E:pdf_download',filenames)
    # for onefile in filenames:
    #     filename=onefile.split(".", )[0]


    def compute_spans(cells):
        """Derive HTML span information from a grid of camelot-style cells.

        A cell whose ``right`` edge is missing is treated as merged with its
        right-hand neighbour; a cell whose ``bottom`` edge is missing is
        treated as merged with the cell below it.

        Args:
            cells: list of rows, each a list of objects exposing boolean
                ``right`` and ``bottom`` attributes (e.g. ``Table.cells``).

        Returns:
            A ``(covered, colspan, rowspan)`` tuple of 2-D numpy arrays shaped
            like the grid. ``covered[r, c]`` is True when the cell is swallowed
            by a merged region that starts above or to the left of it and must
            therefore not be emitted; ``colspan`` / ``rowspan`` hold the span
            of every cell that *is* emitted.
        """
        n_rows = len(cells)
        n_cols = max((len(row) for row in cells), default=0)
        covered = np.zeros((n_rows, n_cols), dtype=bool)
        colspan = np.ones((n_rows, n_cols), dtype=int)
        rowspan = np.ones((n_rows, n_cols), dtype=int)

        for r, row in enumerate(cells):
            for c in range(len(row)):
                if covered[r, c]:
                    continue
                # Grow to the right while the current right-most cell of the
                # region has no right border (never past the last column).
                width = 1
                while c + width < len(row) and not row[c + width - 1].right:
                    width += 1
                # Grow downwards while the current bottom-most cell of the
                # region has no bottom border (never past the last row).
                height = 1
                while r + height < n_rows and not cells[r + height - 1][c].bottom:
                    height += 1
                colspan[r, c] = width
                rowspan[r, c] = height
                # Mask the whole rectangle, not just its first row and first
                # column; otherwise the interior cells of a region that spans
                # several rows AND columns would be emitted as extra <td>s.
                covered[r:r + height, c:c + width] = True
                covered[r, c] = False
        return covered, colspan, rowspan


    def table_to_html(cells):
        """Render a grid of camelot-style cells as an HTML ``<table>`` string.

        Merged cells are written once with ``colspan`` / ``rowspan``
        attributes, and cell text is HTML-escaped.

        Args:
            cells: list of rows, each a list of objects exposing ``text``,
                ``right`` and ``bottom`` attributes.

        Returns:
            The complete ``<table>...</table>`` markup as a string.
        """
        covered, colspan, rowspan = compute_spans(cells)
        parts = ['<table border="1" class="dataframe">\n  <tbody>']
        for r, row in enumerate(cells):
            parts.append('\n    <tr>')
            for c, cell in enumerate(row):
                if covered[r, c]:
                    continue
                attrs = ''
                if colspan[r, c] > 1:
                    attrs += ' colspan="%d"' % colspan[r, c]
                if rowspan[r, c] > 1:
                    attrs += ' rowspan="%d"' % rowspan[r, c]
                parts.append('\n      <td%s>%s</td>' % (attrs, html.escape(cell.text)))
            parts.append('\n    </tr>')
        # Close the elements opened in the header so the output is well formed.
        parts.append('\n  </tbody>\n</table>\n')
        return ''.join(parts)


    if __name__ == "__main__":
        # Single file
        onefile = r'1202007288.pdf'
        print("loading...", onefile)
        # strip_text removes spaces, dots and newlines from the cell text.
        tables = camelot.read_pdf(onefile, pages='28', strip_text=' .\n',
                                  line_scale=80, split_text=True)

        for onetable in tables:
            out_path = (onefile + '-page' + str(onetable.page)
                        + '-table-' + str(onetable.order) + '.html')
            # Write UTF-8 explicitly and declare it, so the result does not
            # depend on the platform's default encoding.
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write('<meta charset="utf-8">\n')
                f.write(table_to_html(onetable.cells))
  • 相关阅读:
    学习笔记—二进制和精度问题
    学习笔记—Buffer的常用方法与实现
    学习笔记—Node中第三方模块
    学习笔记—npm的介绍与使用
    .NET中序列化(一)
    .NET中序列化(二)
    JavaScript在多浏览器下杂谈1for循环
    .NET中序列化(三)
    DLCB额度切分
    DLCB解决问题的思路
  • 原文地址:https://www.cnblogs.com/wind-chaser/p/10690083.html
Copyright © 2020-2023  润新知