import re import jieba.analyse import codecs import pandas as pd def simplification_text(xianbingshi): """提取文本""" xianbingshi_simplification = [] with codecs.open(xianbingshi,'r','utf8') as f: for line in f : line = line.strip() line_write = re.findall('(?<=<b>).*?(?=<e>)',line) for line in line_write: xianbingshi_simplification.append(line) with codecs.open(r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoqucodexianbingshi_write.txt','w','utf8') as f: for line in xianbingshi_simplification: f.write(line + ' ') def jieba_text(): """""" word_list = [] data = open(r"C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29xianbingshi_write.txt", encoding='utf-8').read() seg_list = jieba.cut(data, cut_all=False) # 精确模式 for i in seg_list: word_list.append(i.strip()) data_quchong = pd.DataFrame({'a':word_list}) data_quchong.drop_duplicates(subset=['a'],keep='first',inplace=True) word_list = data_quchong['a'].tolist() with codecs.open('word.txt','w','utf8')as w: for line in word_list: w.write(line + ' ') def word_messy(word): """词语提炼""" word_sub_list = [] with codecs.open(word,'r','utf8') as f: for line in f: line_sub = re.sub("^[1-9]d*.d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?d+)(.d+)?$|^[A-Za-z0-9]{4,40}.*?",'',line) word_sub_list.append(line_sub) word_sub_list.sort() with codecs.open('word.txt','w','utf8')as w: for line in word_sub_list: w.write(line.strip(" ") + ' ') if __name__ == '__main__': xianbingshi = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29yiwoquxianbingshi_sub_sen_all(1).txt' # simplification_text(xianbingshi) # word = r'C:UsersAdministrator.SC-201812211013PycharmProjectsuntitled29word.txt' simplification_text(xianbingshi)