• python文本处理(结巴分词并去除符号)


    import re
    import jieba.analyse
    import codecs
    import pandas as pd
    
    def simplification_text(xianbingshi, output_path=None):
        """Extract the text found between ``<b>`` and ``<e>`` markers.

        Each line of the input file is stripped and scanned for spans that sit
        between a ``<b>`` marker and the nearest following ``<e>`` marker.
        Every span is written to the output file on its own line, in the order
        in which it was found.

        Args:
            xianbingshi: Path of the UTF-8 input file to scan.
            output_path: Path of the UTF-8 file to write. When omitted, the
                original hard-coded location is used.

        Returns:
            None. The extracted spans are written to ``output_path``.
        """
        if output_path is None:
            # NOTE(review): this default looks like a Windows path whose
            # backslashes were lost; confirm the intended location.
            output_path = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write.txt'

        # Compiled once instead of being re-resolved for every input line.
        marker_span = re.compile('(?<=<b>).*?(?=<e>)')

        extracted = []
        with codecs.open(xianbingshi, 'r', 'utf8') as f:
            for line in f:
                extracted.extend(marker_span.findall(line.strip()))

        with codecs.open(output_path, 'w', 'utf8') as f:
            for span in extracted:
                f.write(span + '\n')
    def jieba_text(input_path=None, output_path='word.txt'):
        """Segment a text file with jieba and write its unique words.

        The whole input file is read, segmented with ``jieba.cut``, and each
        token is stripped of surrounding whitespace. Duplicate tokens are
        dropped, keeping the first occurrence and the original order, and the
        remaining tokens are written one per line.

        Args:
            input_path: Path of the UTF-8 text file to segment. When omitted,
                the original hard-coded location is used.
            output_path: Path of the UTF-8 file to write; defaults to
                ``word.txt`` in the current working directory.

        Returns:
            None. The unique tokens are written to ``output_path``.
        """
        if input_path is None:
            # NOTE(review): this default looks like a Windows path whose
            # backslashes were lost; confirm the intended location.
            input_path = r"C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29xianbingshi_write.txt"

        # Read inside a context manager so the handle is always closed.
        with open(input_path, encoding='utf-8') as f:
            data = f.read()

        # cut_all=False selects jieba's accurate mode.
        tokens = (token.strip() for token in jieba.cut(data, cut_all=False))

        # dict keys keep insertion order, so this retains the first occurrence
        # of every token without building a DataFrame.
        unique_words = list(dict.fromkeys(tokens))

        with codecs.open(output_path, 'w', 'utf8') as w:
            for word in unique_words:
                w.write(word + '\n')
    def word_messy(word, output_path='word.txt'):
        """Refine a word list by removing numeric and ASCII-alphanumeric noise.

        Every line of the input file is passed through one substitution that
        removes, anchored at the start of the line:

        * a positive decimal number such as ``3.14``;
        * a line made up only of ASCII letters and digits;
        * a line made up only of digits;
        * an optionally negative integer or decimal such as ``-2.5``;
        * a leading run of 4 to 40 ASCII letters and digits.

        The resulting lines are sorted and written one per line. A line whose
        whole content was removed is still written, as an empty line.

        Args:
            word: Path of the UTF-8 word-list file to read.
            output_path: Path of the UTF-8 file to write; defaults to
                ``word.txt`` in the current working directory.

        Returns:
            None. The refined lines are written to ``output_path``.
        """
        # The digit and literal-dot escapes are required: without them the
        # first alternative would also eat words such as "3天".
        noise = re.compile(
            r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?"
        )

        # The input is consumed completely before the output is opened, so
        # reading and writing the same file is safe.
        with codecs.open(word, 'r', 'utf8') as f:
            refined = [noise.sub('', line) for line in f]
        refined.sort()

        with codecs.open(output_path, 'w', 'utf8') as w:
            for line in refined:
                w.write(line.strip("\n") + '\n')
    
    if __name__ == '__main__':
        # Script entry point: only the marker-extraction step is run here;
        # jieba_text and word_messy are not invoked.
        # NOTE(review): the path below looks like a Windows path whose
        # backslashes were lost; confirm the intended location.
        source_path = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoquxianbingshi_sub_sen_all(1).txt'
        simplification_text(source_path)
  • 相关阅读:
    单词 统计
    第十周学习记录
    梦断代码阅读笔记03
    梦断代码阅读笔记02
    梦断代码阅读笔记01
    用户模板和用户场景
    第九周学习记录
    分享好友-分享朋友圈
    生命周期函数-页面刷新
    底部导航的设置
  • 原文地址:https://www.cnblogs.com/yiwoqu/p/11542002.html
Copyright © 2020-2023  润新知