Python 文本分析 笔记
中文停用词处理
自行下载 stopwords.txt,代码如下:
def stopwordslist(filepath):
    """Load a stopword list from a UTF-8 text file.

    Args:
        filepath: Path to a file containing one stopword per line.

    Returns:
        list[str]: The stripped stopwords, in file order.
    """
    # 'with' guarantees the file handle is closed; the original leaked it.
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def seg_sentence(sentence, stopwords_path='/root/stopwords.txt'):
    """Segment a Chinese sentence with jieba and remove stopwords.

    Args:
        sentence: The input sentence; leading/trailing whitespace is stripped.
        stopwords_path: Path of the stopword file. New optional parameter;
            defaults to the path the original hard-coded, so existing
            callers are unaffected.

    Returns:
        str: The kept words, each followed by a single space (matching the
        original output format, including the trailing space); '' when
        every token is filtered out.
    """
    words = jieba.cut(sentence.strip())
    # Build a set once: O(1) membership per word instead of the original
    # O(n) scan of a list for every segmented token.
    stopwords = set(stopwordslist(stopwords_path))
    # join avoids the original quadratic '+=' string building; the ' '
    # filter reproduces the original exclusion of single-space tokens.
    return ''.join(w + ' ' for w in words
                   if w not in stopwords and w != ' ')