1、读入文本内容
def load_corpus(root_dir):
    """Read every file under *root_dir* into a two-column DataFrame.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A DataFrame with columns ``filePath`` and ``content`` (the UTF-8
        decoded file text with leading/trailing whitespace stripped),
        indexed 1..n in the order the files were visited.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    rows = []
    for root, _dirs, files in os.walk(root_dir):
        for name in files:
            # os.path.join replaces the hand-built separator, which was an
            # unterminated string literal ('\') in the original.
            filePath = os.path.join(root, name)
            # The context manager closes the file even if read() raises.
            with codecs.open(filePath, 'r', 'utf-8') as f:
                rows.append([filePath, f.read().strip()])

    # Build the frame once instead of growing it row by row with .loc.
    frame = pandas.DataFrame(rows, columns=['filePath', 'content'])
    # Keep the 1-based row labels the original loop produced.
    frame.index = range(1, len(frame) + 1)
    return frame


# NOTE(review): this path looks like it lost its backslashes when the
# snippet was pasted -- confirm the intended directory.
corpos = load_corpus(r'H:19113117 - 副本')
2、将手动分完词的文本进行词频统计
# Explode every document into one (filePath, segment) row per token.
# The texts are pre-segmented by hand with '/' as the token separator.
filePaths = []
segments = []
for filePath, content in corpos.itertuples(index=False):
    tokens = content.split('/')
    segments.extend(tokens)
    filePaths.extend([filePath] * len(tokens))

segmentDF = pandas.DataFrame({'filePath': filePaths, 'segments': segments})

# Count occurrences of each segment within each file.  The original
# ``.agg({"计数": numpy.size})`` relied on dict-based renaming, which
# pandas 1.0 removed; ``size()`` plus ``reset_index(name=...)`` yields the
# same three columns: filePath, segments, 计数.
segStat = (
    segmentDF.groupby(by=["filePath", "segments"])["segments"]
    .size()
    .reset_index(name="计数")
)
3、计算tf值
# Term-by-document count matrix: one row per segment, one column per file,
# 0 where a segment does not occur in a file.
textVector = segStat.pivot_table(
    index='segments',
    values='计数',
    columns='filePath',
    fill_value=0)

# Log-scaled term frequency: 1 + ln(count) where the term occurs, else 0.
# Taking the log only of positive counts avoids log(0) = -inf (and the
# accompanying RuntimeWarning) that the original produced for absent terms.
# ``to_numpy`` replaces ``as_matrix``, which pandas 1.0 removed.
_counts = textVector.to_numpy(dtype=float)
_present = _counts > 0
tF = numpy.zeros_like(_counts)
tF[_present] = 1 + numpy.log(_counts[_present])
4、计算IDF
def handle(x):
    """Return the smoothed inverse document frequency for one term.

    Computes ``1 + ln(N / (df + 1))`` where ``N`` is ``len(corpos)`` and
    ``df`` is the number of entries of *x* that are greater than zero.

    Args:
        x: Per-document counts of a single term (one column of the
            transposed count table).

    Returns:
        The IDF weight as a float.
    """
    doc_freq = numpy.sum(x > 0)
    # float() forces true division even where int / int would floor.
    return 1 + numpy.log(float(len(corpos)) / (doc_freq + 1))


# Transpose so that each column is one term; apply() then calls handle()
# once per term with that term's counts across all documents.
zhuan = textVector.T
# ``to_numpy`` replaces ``as_matrix``, which pandas 1.0 removed.
iDF = zhuan.apply(handle).to_numpy()
# Column vector (one row per term) so it broadcasts across document
# columns; -1 infers the vocabulary size instead of hard-coding 8889.
iDF = iDF.reshape(-1, 1)
5、计算tfidf
# Element-wise product of the TF matrix and the IDF weights; NumPy
# broadcasting applies when the two arrays differ in shape.
TFIDF = numpy.multiply(tF, iDF)
# Wrap the result in a DataFrame with default integer row/column labels.
tFIDF_DF = pandas.DataFrame(data=TFIDF)
6、将每个文本中tfidf值排名前100的词和相应的tfidf值输出
# Display label for each document, taken from the pivot table's column
# order so that label i always describes column i of the TF-IDF matrix.
# (The original re-walked the directory, whose order need not match.)
# The file name is the last path component with its final four characters
# removed (e.g. a '.txt' suffix), as in the original.
file = [
    path.replace('\\', '/').rsplit('/', 1)[-1][0:-4]
    for path in textVector.columns
]

for i, label in enumerate(file):
    # ``sort_values`` replaces ``Series.order``, which pandas removed.
    top = tFIDF_DF.loc[:, i].sort_values(ascending=False).head(100)
    # top.index holds the row positions of the highest-scoring terms; the
    # original looked words up by the post-reset_index 0..99 labels and so
    # always printed the first 100 vocabulary entries instead.
    tagis = textVector.index[top.index]
    print(label)
    for word, value in zip(tagis, top):
        print(word, value)