A simple example: scrape page data, then generate a word cloud and a network graph from it.
Requires installing the networkx and wordcloud packages (the script also uses requests, jieba, and matplotlib).
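If they are not installed yet, something along these lines should pull in everything the script imports (assuming the standard PyPI package names):

pip install requests jieba wordcloud networkx matplotlib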
The code is as follows. Each of the three sections (scraping, word cloud, network graph) is wrapped in a triple-quoted string so it is commented out; remove the quotes around the section you want to run:
# @Author : whyCai
# @Time   : 2020/10/17 10:35
from time import sleep

import requests
import json
import jieba
import jieba.posseg as jp
import wordcloud
import networkx as nx
from matplotlib import pyplot as plt

"""
# Scrape the page data
def getCommText():
    '''
    Scrape the page data
    :return: the concatenated comment text
    '''
    text = ''
    url = 'https://xxxxxxx'
    headers = {'content-type': 'application/json'}

    for i in range(0, 300):
        data = {"pageIndex": i + 1, "xxxx": 1}
        r = requests.post(url, data=json.dumps(data), headers=headers)
        res = json.loads(r.text)
        # Read the relevant field from the response
        resContent = res['result']['items']
        lenComm = len(resContent)
        for j in range(0, lenComm):
            # text = text + resContent[j]['content'] + ' '
            print(resContent[j]['content'])
        sleep(0.2)
    # print(text)
    # return text
getCommText()
"""

"""
# Generate the word cloud

# Read the data
f = open('xxx.txt', encoding='utf-8')
text = f.read()
f.close()
txtlist = jieba.lcut(text)    # segment the Chinese text into words
txtlist = " ".join(txtlist)   # WordCloud expects space-separated tokens
w = wordcloud.WordCloud(width=1000, height=700, background_color='white', font_path='msyh.ttc')
w.generate(txtlist)
# Save the word cloud to an image file
w.to_file('output2-poem.png')
"""

"""
# Generate the network graph

# text is the same text = f.read() as in the word-cloud section above
words = jp.lcut(text)   # POS-tagged segmentation: each item has .word and .flag
G = nx.MultiDiGraph()
# Add nodes (one per part-of-speech tag)
for word in words:
    G.add_node(word.flag)
# Add edges between the tags of adjacent words
for i in range(len(words) - 1):
    G.add_edge(words[i].flag, words[i + 1].flag)
# Draw
nx.draw(G, alpha=0.8, with_labels=True, node_color='lightgreen', font_size=36, node_size=999, width=2)
# Show
plt.show()
"""
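As a side note, requests can serialize the JSON body itself (via the json= parameter) and parse the response (via r.json()), so the scraping loop can be written a little more compactly. A minimal sketch, keeping the placeholder URL and field names from the original:

import requests
from time import sleep

def get_comm_text():
    """Fetch comment text page by page; URL and field names are placeholders."""
    url = 'https://xxxxxxx'  # placeholder endpoint, as in the original
    text = ''
    for page in range(1, 301):
        # json= serializes the dict and sets the Content-Type header for us
        r = requests.post(url, json={"pageIndex": page, "xxxx": 1})
        items = r.json()['result']['items']
        text += ' '.join(item['content'] for item in items) + ' '
        sleep(0.2)  # small pause between pages, as in the original
    return text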
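For the word cloud, one common refinement is to build it from keyword weights instead of raw tokens, so high-frequency stopwords do not dominate the image. A hedged sketch using jieba's TF-IDF keyword extraction (the file names are the same placeholders as above; topK=100 and the output name are arbitrary choices for illustration):

import jieba.analyse
import wordcloud

with open('xxx.txt', encoding='utf-8') as f:  # placeholder file name
    text = f.read()

# extract_tags returns (word, weight) pairs when withWeight=True
tags = jieba.analyse.extract_tags(text, topK=100, withWeight=True)
w = wordcloud.WordCloud(width=1000, height=700, background_color='white',
                        font_path='msyh.ttc')  # msyh.ttc: a Chinese-capable font
w.generate_from_frequencies(dict(tags))
w.to_file('output2-keywords.png')  # output name chosen for illustration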
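Note that the network-graph section links part-of-speech flags (word.flag), so the resulting graph shows transitions between POS tags rather than between words. If a word-level graph is wanted instead, a sketch along these lines should work (the noun-only filter and the font setting are assumptions, not part of the original):

import networkx as nx
import jieba.posseg as jp
from matplotlib import pyplot as plt

# so Chinese node labels render (assumes this font is installed)
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

text = '...'  # the same text read from xxx.txt above
# keep only nouns to thin out the graph (an assumption for readability)
words = [w.word for w in jp.lcut(text) if w.flag.startswith('n')]
G = nx.MultiDiGraph()
G.add_nodes_from(words)
# connect each word to the one that follows it in the text
G.add_edges_from(zip(words, words[1:]))
nx.draw(G, with_labels=True, node_color='lightgreen', font_size=12, node_size=500)
plt.show()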
Reference blog posts:
Word cloud: https://www.cnblogs.com/wkfvawl/p/11585986.html
Network graph: https://blog.csdn.net/your_answer/article/details/79189660