(1) Read the contents of the source text file
content = "" try:
fo = open(filename)
print("读取文件名:", filename)
for line in fo.readlines():
content += line.strip() print("字数:", len(content))
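Step (2) below calls readFile and writeFile helpers that the excerpt never defines. A minimal sketch of how the loop above could be wrapped into those two functions, assuming a with-statement and utf-8 encoding (neither is shown in the original):

def readFile(filename):
    # Sketch only: read the whole file and strip line breaks, as in step (1).
    content = ""
    with open(filename, encoding="utf-8") as fo:
        for line in fo:
            content += line.strip()
    return content

def writeFile(filename, content):
    # Hypothetical counterpart used in step (2) to save the segmentation result.
    with open(filename, "w", encoding="utf-8") as fo:
        fo.write(content)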
(2) Use the jieba segmentation library to segment the Chinese text
import re
import jieba

rawContent = readFile(rawFileName)
r = r'[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+'  # digits, whitespace, punctuation
rawContent = re.sub(r, " ", rawContent)
seg_list = jieba.cut(rawContent, cut_all=False)  # precise mode
writeFile(dataFileName, " ".join(seg_list))
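For reference, jieba's two cutting modes behave differently on the same sentence. This example is the one from jieba's own README; the exact split can vary with the dictionary version:

import jieba

sentence = "我来到北京清华大学"
print("/".join(jieba.cut(sentence, cut_all=False)))  # precise mode: 我/来到/北京/清华大学
print("/".join(jieba.cut(sentence, cut_all=True)))   # full mode: 我/来到/北京/清华/清华大学/华大/大学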
(3) Count word frequencies in the segmentation result
word_lst = []
word_dict = {}
with open(dataFileName) as wf, open(sortFileName, 'w') as wf2, open(tmpFileName, 'w') as wf3:
    for word in wf:
        word_lst.append(word.split(' '))
    # Tally every token; wf2 receives the sorted result in step (4).
    for item in word_lst:
        for item2 in item:
            if item2 not in word_dict:
                word_dict[item2] = 1
            else:
                word_dict[item2] += 1
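The same tally can be done with the standard library's collections.Counter; a minimal one-pass sketch using the same variable names as above:

from collections import Counter

with open(dataFileName) as wf:
    # Split each line on spaces and count every token in one pass.
    word_dict = Counter(token for line in wf for token in line.split(' '))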
(4) Write the word-frequency results to a new txt file

word_items.sort(reverse=True)
for item in word_items:
    wf2.write(item.label + ' ' + str(item.times) + '\n')
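The excerpt never shows how word_items is built from word_dict. One plausible reconstruction is a small container class whose ordering follows the frequency, so that sort(reverse=True) puts the most frequent words first; the WordItem name and its fields are assumptions chosen to match the item.label and item.times accesses above:

class WordItem:
    # Hypothetical container matching item.label / item.times above.
    def __init__(self, label, times):
        self.label = label
        self.times = times

    def __lt__(self, other):
        # list.sort only needs __lt__; compare by frequency.
        return self.times < other.times

word_items = [WordItem(label, times) for label, times in word_dict.items()]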