• 关键信息读写脚本


    记录一个小的脚本

    """
        Function: extract the key info
        Author: dyx
        DateTime: 20200805
    """
    import pandas as pd
    
    # Workbook handed to AnalysisTool.get_info as the input to analyse.
    current_file = './data/original.xlsx'
    # Workbook handed to AnalysisTool.get_info as the output destination.
    to_save = './data/abc.xlsx'
    
    
    class AnalysisTool():
        """Extract key fields from text messages stored in an Excel sheet.

        ``get_info`` reads the ``sms_message`` and ``messageid`` columns, splits
        every message into its fields and writes the result to a new workbook.
        ``ner_deal_data`` breaks the part of each remark line that precedes ``]``
        into name / group / amount.
        """

        # Amount unit searched for in Chinese messages.
        # NOTE(review): this literal was an empty string in the source, which made
        # ``str.split('')`` raise ValueError for every row and left the 'million'
        # branch unreachable. '万' is inferred from the neighbouring '0 万' filter
        # and the parallel English branch -- confirm against real data.
        CN_AMOUNT_UNIT = '万'

        # Substrings that identify the group token of a remark, and the single
        # characters that terminate an amount in a remark.
        # NOTE(review): these literals are empty strings in the source. They are
        # kept as-is so behaviour is unchanged: with empty group markers every
        # token matches (the last token wins) and no character is ever treated
        # as an amount unit. Restore the real values once they are known.
        GROUP_MARKERS = ('', '')
        MONEY_UNITS = ('', '')

        def get_info(self, analy_file, save_file):
            """Parse every message in *analy_file* and save the fields to *save_file*.

            Args:
                analy_file: Excel workbook with ``sms_message`` and ``messageid``
                    columns.
                save_file: path of the Excel workbook to write.

            Raises:
                KeyError: if one of the two expected columns is missing.
                IndexError: if a message has fewer than three blank-line separated
                    sections or fewer than six lines in its first section.

            A failure while writing the workbook is reported on stdout and not
            re-raised, matching the original best-effort behaviour.
            """
            df = pd.read_excel(analy_file)
            texts = df['sms_message'].tolist()
            message_ids = df['messageid']

            parsed = [self._parse_message(text) for text in texts]
            # The last parsed field is the raw remark line.
            remarks = self.ner_deal_data([fields[-1] for fields in parsed])

            records = []
            for message_id, fields, remark in zip(message_ids, parsed, remarks):
                # ner_deal_data yields a one-element list for remarks without ']';
                # those (and plain note lines) carry no name/group/amount.
                if len(remark) < 3 or '备注' in remark[0] or 'Notes' in remark[0]:
                    remark_fields = (None, None, None)
                else:
                    remark_fields = (remark[0], remark[1], remark[2])
                records.append(
                    (message_id,) + fields[:-1] + remark_fields + (remark,))

            # NOTE(review): several column labels are empty strings in the source
            # and '金额' appears twice; they are kept unchanged here.
            df = pd.DataFrame(
                data=records,
                columns=['item1', 'item2', '编号', '', '', '金额', '单位',
                         '信息', '', '日期', '姓名', '组名', '金额', '备注'])
            try:
                df.to_excel(save_file)
            except Exception as exc:
                # Best effort: report the failure instead of aborting the script.
                print('exe is wrong:', exc)
            else:
                print('OK. analysis result has generate!')

        def _parse_message(self, text):
            """Split one message into its fields.

            Returns a tuple ``(group, group_id, group_name, save_type, money_num,
            money_type, gust, save_money, put_date, remark)``. ``money_num``,
            ``money_type`` and ``put_date`` are None when the message does not
            contain them.
            """
            sections = text.split('\n\n')
            if len(sections) == 4:
                # Fold a fourth section back into the third one.
                sections = [sections[0], sections[1],
                            sections[2] + '\n' + sections[3]]

            header = sections[0].split('\n')
            # Pad so that a short name line yields None instead of IndexError.
            name_info = (header[2].split(' ') + [None, None, None])[:3]
            save_get_info = header[3]
            gust_info = header[4].split(': ')[-1]
            remark_info = header[5]

            # NOTE(review): the '0 ...' substring filters also match amounts that
            # merely end in 0 (e.g. '10 million(s)') -- confirm this is intended.
            save_money = []
            for line in sections[1].split('\n'):
                if ('0 万' not in line and '存结' not in line
                        and self.CN_AMOUNT_UNIT in line):
                    save_money.append(line)
                if ('0 million(s)' not in line
                        and 'currency balance' not in line
                        and 'million' in line):
                    save_money.append(line)

            put_date = None
            for line in sections[2].split('\n'):
                if '入数日期' in line or 'Input Date' in line:
                    parts = line.split(': ')
                    if len(parts) > 1:
                        put_date = parts[1]

            detail = save_get_info.split(': ')[-1]
            # First word is the transaction type, the rest holds the amount.
            save_type, _, amount_text = detail.partition(' ')

            money_num = None
            money_type = None
            if self.CN_AMOUNT_UNIT and self.CN_AMOUNT_UNIT in save_get_info:
                pieces = amount_text.split(self.CN_AMOUNT_UNIT)
                money_num = pieces[0].strip() + self.CN_AMOUNT_UNIT
                money_type = pieces[-1].strip()
            elif 'million' in save_get_info:
                pieces = amount_text.split('million(s)')
                money_num = pieces[0].strip() + 'million(s)'
                money_type = pieces[-1].strip()

            return (name_info[0], name_info[1], name_info[2], save_type,
                    money_num, money_type, gust_info, save_money, put_date,
                    remark_info)

        def ner_deal_data(self, data):
            """Split each remark line into ``[name, group, money]``.

            Only the text before the first ``]`` is inspected. A line without
            ``]`` is returned unchanged as the one-element list ``[line]``.

            Args:
                data: iterable of remark strings.

            Returns:
                A list with one list per input line.
            """
            all_list = []
            for line in data:
                head, bracket, _ = line.partition(']')
                if not bracket:
                    all_list.append([line])
                    continue
                all_list.append([
                    self._trailing_name(head),
                    self._group_token(head),
                    self._money_token(head),
                ])
            return all_list

        @staticmethod
        def _trailing_name(head):
            """Return the stripped text after the last digit, '' if no digit."""
            for index in range(len(head) - 1, -1, -1):
                if head[index].isdigit():
                    return head[index + 1:].strip()
            return ''

        def _group_token(self, head):
            """Return the last space-separated token containing a group marker."""
            group = ''
            for token in head.split(' '):
                if any(marker in token for marker in self.GROUP_MARKERS):
                    group = token
            return group

        def _money_token(self, head):
            """Return the digit run closed by a money unit, else the trailing run.

            Any character that is neither a digit, '.', nor a unit following at
            least one digit resets the run, so text ending in a non-digit
            yields ''.
            """
            money = ''
            for char in head:
                if char.isdigit() or char == '.':
                    money += char
                elif money and char in self.MONEY_UNITS:
                    money += char
                    break
                else:
                    money = ''
            return money
    
    
    if __name__ == '__main__':
        # Run the extraction once with the module-level input and output paths.
        AnalysisTool().get_info(current_file, to_save)
  • 相关阅读:
    MVC知识总结(前序)
    MySql 安装
    django【ORM】model字段类型
    gmail注册时“此电话号码无法用于进行验证”
    Python3 re模块正则表达式中的re.S
    django【ORM】 通过外键字段找对应类
    Django【进阶】modelform
    python3-字符编码
    python3-可变和不可变数据类型
    Django【设计】同功能不同实现模式的兼容性
  • 原文地址:https://www.cnblogs.com/demo-deng/p/13439998.html
Copyright © 2020-2023  润新知