"""Train an LDA topic model on comma-separated documents read from input.txt.

Each line of ``input.txt`` is treated as one document whose tokens are
separated by commas.  The script:

1. builds a gensim dictionary and bag-of-words corpus from the documents,
2. prints the TF-IDF weighted corpus and the raw bag-of-words corpus,
3. trains an LDA model on the bag-of-words corpus,
4. writes the topics to ``topics_result3.txt``,
5. writes, for every document, its most probable topic id and that
   probability to ``documents_result.txt``.
"""
import codecs

# Topic model
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from gensim.corpora import Dictionary

NUM_TOPICS = 20   # number of LDA topics to train and to report
NUM_WORDS = 20    # number of words listed per topic in the topics file
PASSES = 100      # number of training passes over the corpus

# Read the documents: one document per line, tokens separated by commas.
# NOTE(review): no encoding is passed, so the platform default is used, as in
# the original script -- confirm the encoding of input.txt and set it
# explicitly if it is known.
te = []
with codecs.open('input.txt', 'r') as fp:
    for line in fp:
        # Drop the line terminator first; otherwise the last token of every
        # line would carry a trailing newline and be counted as a different
        # word from the same token appearing elsewhere.
        te.append(line.rstrip('\r\n').split(','))
print('输入文本数量:', len(te))

dictionary = corpora.Dictionary(te)
corpus = [dictionary.doc2bow(text) for text in te]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(list(corpus_tfidf))  # print the TF-IDF weights of the words
print(list(corpus))  # print the bag-of-words vectors of the documents

lda = LdaModel(corpus=corpus, id2word=dictionary,
               num_topics=NUM_TOPICS, passes=PASSES)
# One entry per document; each entry is indexed below as a sequence of
# (topic id, probability) pairs.
doc_topic = list(lda[corpus])
topics_r = lda.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS)

# The context managers guarantee the result files are flushed and closed.
with codecs.open('topics_result3.txt', 'w') as topic_name:
    for v in topics_r:
        topic_name.write(str(v) + ' ')

with codecs.open('documents_result.txt', 'w') as fp2:
    for t in doc_topic:
        print(t)
        if not t:
            # No topic reported for this document: nothing to pick from.
            continue
        # First pair with the highest probability.  Comparing on the
        # probability only avoids accidentally matching a topic id that
        # happens to equal the maximum probability.
        best = max(t, key=lambda pair: pair[1])
        # Write the dominant topic id and its probability.
        fp2.write(str(best[0]) + ' ' + str(best[1]) + ' ')