1.核心包
# jieba and pandas handle the data: the source data is stored as an .xls
# file and is loaded with pandas.
import jieba
from jieba import analyse
import pandas as pd

# scipy (image loading) and wordcloud are used to build the word cloud.
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is gone from current SciPy releases; matplotlib's
    # reader also returns the image as a numpy array.
    from matplotlib.pyplot import imread
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator

# matplotlib displays and saves the generated word-cloud image.
import matplotlib.pyplot as plt
2.过程
import io
import sys

import jieba
from jieba import analyse
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is gone from current SciPy releases; matplotlib's
    # reader also returns the image as a numpy array.
    from matplotlib.pyplot import imread

# Python 2 only: keep the original default-encoding override so implicit
# str/unicode conversions keep working there. Python 3 has neither
# reload() as a builtin nor sys.setdefaultencoding, so skip it.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')


# 1. Stop words
def stop_words():
    """Load the stop-word list from ``./百度停用词列表.txt``.

    Returns:
        set: one entry per line of the file, stripped of surrounding
        whitespace, as text (unicode) strings.
    """
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, which
    # would otherwise stay glued to the first stop word.
    with io.open(u'./百度停用词列表.txt', 'r', encoding='utf-8-sig') as f:
        return {line.strip() for line in f}


# 2. Segment the text and drop stop words
def remove_stopwords(stop_words):
    """Segment every row of the ``内容`` column and write it to ./weibo.txt.

    Reads ``./11.xls`` with pandas, cuts each non-empty cell of the
    ``内容`` column with jieba, removes the words found in *stop_words*
    and duplicate words within the same row, and writes the remaining
    words separated by single spaces.

    Args:
        stop_words: collection of words to drop (membership-tested).
    """
    source_data = pd.read_excel('./11.xls')
    # Empty cells are read as NaN (a float), which jieba cannot cut.
    content = source_data[u'内容'].dropna()

    # The context manager closes the file even if a row fails to process.
    with io.open('./weibo.txt', 'w', encoding='utf-8') as f:
        for line in content:
            # De-duplicate within the row but keep first-occurrence order,
            # so the output is reproducible from run to run.
            seen = set()
            kept = []
            for word in jieba.cut(line):
                if word in stop_words or word in seen:
                    continue
                seen.add(word)
                kept.append(word)
            # Separate words with spaces; without a separator the words
            # of one row run together into a single string.
            f.write(u' '.join(kept) + u' ')


# 3. Keyword weights
def get_frequency_words(file):
    """Return the top keywords of a text file with their TextRank weights.

    Args:
        file: path of a UTF-8 text file.

    Returns:
        dict: keyword -> weight for up to 400 keywords, as produced by
        ``jieba.analyse.textrank(..., withWeight=True)``.
    """
    with io.open(file, 'r', encoding='utf-8') as f:
        texts = f.read()

    top_words = analyse.textrank(texts, topK=400, withWeight=True)
    return {word: weight for word, weight in top_words}


# 4. Build the word cloud and save it
def generate_word_cloud(dict):
    """Render a word cloud from a word -> weight mapping and save it.

    Uses ``./background.jpg`` both as the shape mask and as the colour
    source. Writes ``word_cloud.jpg`` (original colours) and
    ``./result2.png`` (recoloured from the background image).

    Args:
        dict: mapping of word -> weight; a larger weight gives a larger
            word. (The name shadows the builtin but is kept for callers.)
    """
    color_mask = imread('./background.jpg')
    cloud = WordCloud(
        # A font with CJK glyphs is required, otherwise Chinese text is
        # rendered as garbage; the font file name must not contain Chinese.
        font_path="./static/chinese.msyh.ttf",
        # Background colour (the library default is black).
        background_color='white',
        # Shape of the cloud; canvas width/height are ignored when a mask
        # is given.
        mask=color_mask,
        # Maximum number of words to draw.
        max_words=400,
        # Largest font size; if unset, the image height is used.
        max_font_size=150,
        # Share of words laid out horizontally (library default 0.9).
        prefer_horizontal=0.8,
    )
    cloud.generate_from_frequencies(frequencies=dict)
    cloud.to_file('word_cloud.jpg')

    # Hide the axes.
    plt.axis('off')
    # Recolour the words from the background image, then save the figure.
    image_colors = ImageColorGenerator(color_mask)
    plt.imshow(cloud.recolor(color_func=image_colors))
    plt.savefig('./result2.png')


if __name__ == '__main__':
    # Use a separate name so the stop_words() function is not overwritten.
    stopword_set = stop_words()
    remove_stopwords(stop_words=stopword_set)
    words_frequency = get_frequency_words('./weibo.txt')
    generate_word_cloud(words_frequency)
[注]:(1).wordcloud.generate_from_text(text=text)可以直接由文本生成词云,但它按空白和标点来切分单词,因此只适用于英文等以空格分词的文本;中文文本需先分词(如用jieba)并以空格连接后再传入。
(2).wordcloud.generate_from_frequencies(frequencies=dict)由词频字典生成词云,词频越大则显示该词size越大
[结果]: