#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract the consolidated balance sheet table from a PDF into an Excel file."""
import time

import pandas as pd
import pdfplumber
from tqdm import tqdm

# Settings handed to ``page.extract_table``: cell boundaries are taken from
# the ruling lines drawn in the PDF ("lines" strategy) on both axes.
# NOTE(review): some pdfplumber releases may expect ``text_keep_blank_chars``
# instead of ``keep_blank_chars`` -- confirm against the installed version.
_TABLE_SETTINGS = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "explicit_vertical_lines": [],
    "explicit_horizontal_lines": [],
    "snap_tolerance": 3,
    "join_tolerance": 3,
    "edge_min_length": 3,
    "min_words_vertical": 3,
    "min_words_horizontal": 1,
    "keep_blank_chars": False,
    "text_tolerance": 3,
    "text_x_tolerance": None,
    "text_y_tolerance": None,
    "intersection_tolerance": 1,
    "intersection_x_tolerance": None,
    "intersection_y_tolerance": None,
}


def get_balance_table(file):
    """Collect the rows of the consolidated balance sheet found in a PDF.

    Pages are scanned in order. Collection starts on the first page whose
    text contains both '合并资产负债表' and '编制单位', and stops after the
    page whose text contains both '负债和所有者权益总计' and
    '所有者权益合计'. For every page in that range the table returned by
    ``page.extract_table`` is appended to the result.

    Args:
        file: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        A list of table rows (each row is a list of cell values) in page
        order; empty when the start marker is never found.
    """
    collecting = False
    rows = []
    with pdfplumber.open(file) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            try:
                # A page without a text layer may give no text at all;
                # normalise to "" so the marker checks below cannot raise.
                text = page.extract_text() or ""
                if '合并资产负债表' in text and '编制单位' in text:
                    collecting = True
                if not collecting:
                    continue

                table = page.extract_table(_TABLE_SETTINGS)
                # extract_table gives None when the page has no table;
                # skip it instead of failing on extend(None), so the end
                # marker check below still runs for this page.
                if table:
                    rows.extend(table)

                if '负债和所有者权益总计' in text and '所有者权益合计' in text:
                    break
            except Exception as e:
                # Best effort: a page that cannot be parsed is reported and
                # skipped so the remaining pages are still processed.
                print(f"page {page_no}: {e}")
    return rows


def main():
    """Extract the balance sheet from ``1.pdf``, save it to ``1.xlsx`` and report timing."""
    time_format = "%Y-%m-%d %H:%M:%S"
    start_time = time.time()
    print(f"time start:{time.strftime(time_format, time.localtime(start_time))}")

    data = get_balance_table("1.pdf")
    df = pd.DataFrame(data)
    if df.empty:
        # With no rows the frame has no columns, so selecting the first
        # column would raise IndexError; report and skip the export.
        print("no balance sheet table found, nothing written")
    else:
        # Drop rows whose first cell is missing.
        df = df[df.iloc[:, 0].notnull()]
        df.to_excel("1.xlsx")

    end_time = time.time()
    print(f"time end:{time.strftime(time_format, time.localtime(end_time))}")
    print(f"take: {int(end_time - start_time)} S")


if __name__ == "__main__":
    main()