和上一篇相比,差别不是很大
import xlrd  # read Excel (not used in the visible script; kept for compatibility)
import xlwt  # write Excel
import requests
import linecache
import wordcloud
import jieba
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Bilibili ranking page that is scraped.
RANKING_URL = 'https://www.bilibili.com/ranking/all/160/0/3'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
)
# Column titles written to the first row of the sheet.
HEADER_ROW = ['ID', 'name', 'av', 'play_num', 'comment_num']
OUTPUT_XLS = 'demo1.xls'
# A font with CJK glyphs is required, otherwise the cloud renders empty boxes.
# This is the usual Windows font location; point it at another font if needed.
FONT_PATH = "C:/Windows/Fonts/simfang.ttf"


def _fetch_rank_entries(url, headers, timeout=10):
    """Download the ranking page and return its ``div.info`` entry tags.

    Args:
        url: Address of the ranking page.
        headers: HTTP headers sent with the request (carries the User-Agent).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of ``<div class="info">`` tags found inside
        ``<ul class="rank-list">``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        RuntimeError: The page contains no ``<ul class="rank-list">``.
    """
    # The original built ``headers`` but never sent them; pass them here.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Normalise <br> variants and self-closing tags before parsing.
    html = (
        response.text
        .replace('<br>', ' ')
        .replace('<br/>', ' ')
        .replace('/>', '>')
    )
    soup = BeautifulSoup(html, "html.parser")

    rank_list = soup.find('ul', class_='rank-list')
    if rank_list is None:
        raise RuntimeError(
            "no <ul class='rank-list'> element found at {}".format(url)
        )
    return rank_list.find_all('div', class_='info')


def _parse_entry(entry):
    """Extract ``(title, av, first_stat, second_stat)`` from one entry tag.

    ``av`` is the last path segment of the entry's first link; the two stats
    are the text of the first two ``span.data-box`` tags, which end up in the
    ``play_num`` and ``comment_num`` columns.
    """
    link = entry.find('a')
    title = link.text
    av = link['href'].rsplit('/', 1)[-1]
    stats = entry.find_all('span', class_='data-box')
    return title, av, stats[0].text, stats[1].text


def _show_wordcloud(text):
    """Segment ``text`` with jieba and display it as a word cloud."""
    # wordcloud cannot split Chinese text on its own, so segment it with
    # jieba and hand over a space-separated string.
    cut_text = " ".join(jieba.cut(text))
    cloud = wordcloud.WordCloud(
        font_path=FONT_PATH,
        background_color="white",
        width=1000,
        height=880,
    ).generate(cut_text)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


def main():
    """Scrape the ranking, save it to ``demo1.xls`` and show a title word cloud."""
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1')

    # Header row.
    for col, title in enumerate(HEADER_ROW):
        sheet.write(0, col, title)

    entries = _fetch_rank_entries(RANKING_URL, {'User-Agent': USER_AGENT})

    titles = []
    # Row 0 holds the header, so data rows (and the ID column) start at 1.
    for row, entry in enumerate(entries, start=1):
        name, av, first_stat, second_stat = _parse_entry(entry)
        titles.append(str(name))
        for col, value in enumerate([row, name, av, first_stat, second_stat]):
            sheet.write(row, col, value)

    # One save after all rows are written produces the complete file.
    workbook.save(OUTPUT_XLS)

    # Titles are concatenated without a separator, as in the original script.
    _show_wordcloud("".join(titles))
    print("Done!")


if __name__ == "__main__":
    main()