• LDA模型数据的可视化


     1 """
     2     执行lda2vec.ipnb中的代码
     3     模型LDA
     4     功能:训练好后模型数据的可视化
     5 """
     6 
     7 from lda2vec import preprocess, Corpus
     8 import matplotlib.pyplot as plt
     9 import numpy as np
    10 # %matplotlib inline
    11 import pyLDAvis
    12 try:
    13     import seaborn
    14 except:
    15     pass
    16 # 加载训练好的主题-文档模型,这里是查看数据使用。这里需要搞清楚数据的形式,还要去回看这个文件是怎么构成的
    17 npz = np.load(open('D:/my_AI/lda2vec-master/examples/twenty_newsgroups/lda2vec/topics.pyldavis.npz', 'rb'))
    18 # 数据
    19 dat = {k: v for (k, v) in npz.iteritems()}
    20 # 词汇表变成list
    21 dat['vocab'] = dat['vocab'].tolist()
    22 
    23 #####################################
    24 ##  主题-词汇
    25 #####################################
    26 # 主题个数为10
    27 top_n = 10
    28 # 主题对应10个最相关的词
    29 topic_to_topwords = {}
    30 for j, topic_to_word in enumerate(dat['topic_term_dists']):
    31     top = np.argsort(topic_to_word)[::-1][:top_n]               # 概率从大到小的下标索引值
    32     msg = 'Topic %i '  % j
    33     # 通过list的下标获取关键词
    34     top_words = [dat['vocab'][i].strip()[:35] for i in top]
    35     # 数据拼接
    36     msg += ' '.join(top_words)
    37     print(msg)
    38     # 将数据保存到字典里面
    39     topic_to_topwords[j] = top_words
    40 
    41 import warnings
    42 warnings.filterwarnings('ignore')
    43 prepared_data = pyLDAvis.prepare(dat['topic_term_dists'], dat['doc_topic_dists'],
    44                                  dat['doc_lengths'] * 1.0, dat['vocab'], dat['term_frequency'] * 1.0, mds='tsne')
    45 
    46 from sklearn.datasets import fetch_20newsgroups
    47 remove=('headers', 'footers', 'quotes')
    48 texts = fetch_20newsgroups(subset='train', remove=remove).data
    49 
    50 
    51 ##############################################
    52 ##  选取一篇文章,确定该文章有哪些主题
    53 ##############################################
    54 
    55 print(texts[1])
    56 tt = dat['doc_topic_dists'][1]
    57 msg = "{weight:02d}% in topic {topic_id:02d} which has top words {text:s}"
    58 # 遍历这20个主题,观察一下它的权重,权重符合的跳出来
    59 for topic_id, weight in enumerate(dat['doc_topic_dists'][1]):
    60     if weight > 0.01:
    61         # 权重符合要求,那么输出该主题下的关联词汇
    62         text = ', '.join(topic_to_topwords[topic_id])
    63         print (msg.format(topic_id=topic_id, weight=int(weight * 100.0), text=text))
    64 
    65 # plt.bar(np.arange(20), dat['doc_topic_dists'][1])
    66 
    67 print(texts[51])
    68 tt = texts[51]
    69 msg = "{weight:02d}% in topic {topic_id:02d} which has top words {text:s}"
    70 for topic_id, weight in enumerate(dat['doc_topic_dists'][51]):
    71     if weight > 0.01:
    72         text = ', '.join(topic_to_topwords[topic_id])
    73         print(msg.format(topic_id=topic_id, weight=int(weight * 100.0), text=text))
    74 
    75 
    76 # plt.bar(np.arange(20), dat['doc_topic_dists'][51])
  • 相关阅读:
    Git
    Shell-sed之替换字符
    Linux IO/NFS tunning 性能优化及检测
    利用Java Flight Recorder(JFR)诊断timing及内存问题
    Get/Post
    SQL-1
    HTTP协议简要
    nmap简单使用
    (C语言)买东西找零钱
    今日错误(C语言)(定义二维数组储存)
  • 原文地址:https://www.cnblogs.com/demo-deng/p/9707006.html
Copyright © 2020-2023  润新知