# coding:utf-8
# Segment a corpus with jieba: strip the <content>...</content> wrapper from
# each line, cut it into words, drop stopwords and single-character tokens,
# and write the surviving tokens space-separated to the output file.
import jieba

STOPWORDS_PATH = '/Test/orgindata/stopwords.txt'
CORPUS_PATH = '/Test/orgindata/corpus.txt'
OUTPUT_PATH = '/Test/process2/corpus-seg.txt'


def _load_stopwords(path):
    """Return the set of stopwords read from *path*, one word per line."""
    with open(path, encoding='utf-8') as f:
        return {word.strip() for word in f}


def _segment_line(line, stopwords):
    """Segment one corpus line.

    Strips the leading '<content>' (9 chars) and trailing '</content>' plus
    newline (11 chars), cuts the payload with jieba, and returns the tokens
    that are not stopwords and are longer than one character, each followed
    by a single space. Returns '' when nothing survives the filters.
    """
    payload = line[9:-11]
    kept = [w for w in jieba.cut(payload) if w not in stopwords and len(w) > 1]
    # join is O(n); the original built the string with quadratic `+=`.
    return ''.join(w + ' ' for w in kept)


def main():
    stopwords = _load_stopwords(STOPWORDS_PATH)
    with open(CORPUS_PATH, encoding='utf-8') as src, \
         open(OUTPUT_PATH, 'w+', encoding='utf-8') as dst:
        pending = []  # segmented results accumulated since the last flush
        index = 0     # count of lines that produced a non-trivial result
        for line in src:
            # The original stopped at the first line of length <= 4
            # (e.g. a blank/terminator line); behavior preserved.
            if len(line) <= 4:
                break
            result = _segment_line(line, stopwords)
            if len(result) > 4:
                pending.append(result)
                index += 1
                # Flush every 100 kept lines to bound memory use.
                if index % 100 == 0:
                    dst.write(''.join(pending) + ' ')
                    pending = []
                    print('line ' + str(index))
        # BUGFIX: the original never wrote the tail — any batch of fewer
        # than 100 lines remaining at EOF was silently discarded.
        if pending:
            dst.write(''.join(pending) + ' ')
    print('处理完成')


if __name__ == '__main__':
    main()