• Python 利用已有 NER 模型进行数据清洗合并


    # -*- coding: utf-8 -*-
    from kashgari.corpus import DataReader
    import re
    from tqdm import tqdm
    
    
    def cut_text(text, lenth):
        """Split ``text`` into consecutive chunks of at most ``lenth`` characters.

        Every chunk except possibly the last has exactly ``lenth`` characters,
        and concatenating the chunks always reproduces ``text``.

        Plain slicing is used instead of a ``'.{n}'`` regular expression
        because ``.`` does not match newline characters, which made the regex
        version drop or duplicate characters for text containing ``\\n``.
        No empty trailing chunk is produced when ``len(text)`` is an exact
        multiple of ``lenth``.

        Args:
            text: The string to split.
            lenth: Maximum chunk size; must be a positive integer.

        Returns:
            A list of substrings of ``text`` in their original order. The list
            is empty when ``text`` is empty.

        Raises:
            ValueError: If ``lenth`` is smaller than 1.
        """
        if lenth < 1:
            raise ValueError('lenth must be a positive integer')
        return [text[start:start + lenth] for start in range(0, len(text), lenth)]
    
    
    def _predict_labels(ner_model, tokens, max_len=100):
        """Return the tags ``ner_model`` predicts for one tokenised sentence.

        Sentences longer than ``max_len`` tokens are predicted in consecutive
        windows of ``max_len`` tokens and the per-window tags are concatenated.
        The token list itself is sliced (rather than a joined string), so the
        tags stay aligned with the tokens even when a token is longer than one
        character.

        Args:
            ner_model: Object whose ``predict`` method takes a list of token
                sequences and returns one tag sequence per input sequence.
            tokens: The tokens of a single sentence.
            max_len: Maximum number of tokens sent to the model per call.

        Returns:
            A list of predicted tags for the sentence.
        """
        if len(tokens) <= max_len:
            return list(ner_model.predict([tokens])[0])

        predicted = []
        for start in range(0, len(tokens), max_len):
            window = list(tokens[start:start + max_len])
            predicted.extend(ner_model.predict([window])[0])
        return predicted


    def clean_data(source_file, target_file, ner_model):
        """Enrich a CoNLL file's labels with an existing NER model's predictions.

        The file is read with ``DataReader().read_conll_format_file``. For every
        token whose current label is ``'O'`` and for which the model predicts a
        tag starting with ``'B'`` or ``'I'``, the label is replaced by the
        predicted tag; labels that are already set are left untouched. The
        result is appended to ``target_file`` as ``"<token> <label>"`` lines,
        with one empty line between sentences and no trailing newline.

        Args:
            source_file: Path of the CoNLL-format file to read.
            target_file: Path of the file to append the result to (opened in
                ``'a'`` mode, UTF-8).
            ner_model: Model whose ``predict`` method takes a list of token
                sequences and returns one tag sequence per input sequence.

        Returns:
            None. A completion line is printed once the file has been written.
        """
        data_x, data_y = DataReader().read_conll_format_file(source_file)

        for idx, tokens in enumerate(tqdm(data_x)):
            labels = data_y[idx]
            predicted = _predict_labels(ner_model, tokens)
            # Bound by the shorter sequence so that a model returning fewer
            # tags than there are tokens cannot raise an IndexError.
            for jdx, tag in enumerate(predicted[:len(labels)]):
                if labels[jdx] == 'O' and tag.startswith(('B', 'I')):
                    labels[jdx] = tag

        # The context manager closes the file even if a write fails.
        with open(target_file, 'a', encoding="utf-8") as f:
            for idx, tokens in enumerate(data_x):
                if idx != 0:
                    # Extra newline: produces the blank line between sentences.
                    f.write('\n')
                for jdx, token in enumerate(tokens):
                    line = token + ' ' + data_y[idx][jdx]
                    # Every line except the very first is preceded by a
                    # newline, so the output never ends with one.
                    if idx != 0 or jdx != 0:
                        line = '\n' + line
                    f.write(line)

        # NOTE(review): this re-reads source_file, not target_file, so it only
        # confirms that tokens and sentence count are unchanged in memory;
        # it does not validate what was written - confirm that is intended.
        data_x2, data_y2 = DataReader().read_conll_format_file(source_file)
        print(data_x == data_x2, len(data_y) == len(data_y2), '数据清洗完成')
    
    # -*- coding: utf-8 -*-
    import kashgari
    from data_tools import clean_data
    # Load the saved NER model that will supply the additional labels.
    model_path = 'time_ner.h5'
    time_ner = kashgari.utils.load_model(model_path)

    # Merge the model's predictions into the dev set and append the result
    # to a file named example.dev in the current working directory.
    clean_data(
        './data/example.dev',
        'example.dev',
        time_ner,
    )
    
  • 相关阅读:
    windows消息定义
    17种正则表达式
    DirectX程序例子
    C#调用WINDOWS API 要点
    提取网页中的超级链接
    基于消息驱动的C#Windows程序
    C#使用事件
    C#启动进程的方法
    C#注册表操作方法
    HighLight.net 2.0 版本源码
  • 原文地址:https://www.cnblogs.com/gmhappy/p/11863935.html
Copyright © 2020-2023  润新知