爬取B站【原神】相关视频播放数前1000的数据并生成词云图

老规矩，先上代码

# -*- coding: utf-8 -*
import pandas as pd
import matplotlib.pyplot as plt
import requests
import sys
import time
from bs4 import BeautifulSoup
import jieba
import wordcloud

def _strip_blanks(text):
    """Return *text* with every space and newline character removed."""
    return text.replace(" ", "").replace("\n", "")


def mihoyo():
    """Scrape Bilibili search-result pages and save the rows to ``output.xls``.

    Requests pages 1..50 of the search URL below. For every video entry found
    on a page it collects the title, the upload-time text and the view-count
    text, then writes all rows with pandas to ``output.xls`` in the current
    working directory.

    Paging stops early if a page contains no result list, so whatever was
    collected up to that point is still written out.

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    # Search URL; the keyword is percent-encoded in the query string.
    target = 'https://search.bilibili.com/all?keyword=%E5%8E%9F%E7%A5%9E&order=click&duration=0&tids_1=0'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}

    result = []
    for page in range(1, 51):
        mid_target = target + "&page={}".format(page)
        # Actually send the browser User-Agent (it used to be built but never
        # passed), and bound the wait so a stalled connection cannot hang.
        req = requests.get(url=mid_target, headers=headers, timeout=10)
        req.raise_for_status()

        # Normalise <br> and self-closing tags before handing to the parser.
        html = req.text
        html = html.replace('<br>', ' ').replace('<br/>', ' ').replace('/>', '>')
        bf = BeautifulSoup(html, "html.parser")

        # Outer list that holds the video entries of this page.
        texts = bf.find('ul', class_='video-list clearfix')
        if texts is None:
            # Without the list there is nothing to parse; stop instead of
            # failing with AttributeError on None.
            print("page {}: no result list found, stopping".format(page))
            break

        for item in texts.find_all('li', class_='video-item matrix'):
            item_name = item.find('a')['title']  # video title
            item_refer_watch_num = _strip_blanks(
                item.find('span', class_='so-icon watch-num').text)
            item_refer_uptime = _strip_blanks(
                item.find('span', class_='so-icon time').text)
            result.append([item_name, item_refer_uptime, item_refer_watch_num])

    pd.DataFrame(result).to_excel("output.xls")

def info_of_b(month="2020-08", font_path="C:/Windows/Fonts/simfang.ttf"):
    """Draw a word cloud from the video titles stored in ``output.xls``.

    Reads sheet ``Sheet1`` of ``output.xls`` and treats each row positionally:
    index 1 is used as the title and index 2 as the upload time. This matches
    the layout ``mihoyo()`` writes, where pandas puts its index column first.
    Titles whose upload time starts with *month* are concatenated, a fixed set
    of filler words is removed, the text is segmented with jieba, and the
    resulting word cloud is shown with matplotlib.

    Args:
        month: Prefix the upload-time text must start with, e.g. ``"2020-08"``
            for one month or ``"2020"`` for a whole year.
        font_path: Font file passed to ``WordCloud`` for rendering the words.

    Raises:
        ValueError: if no row's upload time starts with *month*.
    """
    excel = pd.read_excel(r'output.xls', sheet_name='Sheet1')
    countries = excel.columns.tolist()  # header row
    values = excel.values.tolist()
    print(countries)
    if values:
        # Show one sample row; guarded so an empty sheet does not raise.
        print(values[0])

    titles = []
    for item in values:
        # str() protects against cells pandas read back as numbers or NaN,
        # which would otherwise break prefix matching and concatenation.
        if str(item[2]).startswith(month):
            title = str(item[1])
            titles.append(title)
            print(title)
    if not titles:
        raise ValueError(
            "no videos found for upload-time prefix {!r}".format(month))

    yun = "".join(titles)
    # Remove filler words and the search keyword itself so they do not
    # dominate the cloud. Order is kept as in the original replace chain.
    for word in ("你", "我", "的", "了", "吗", "个", "是", "吧", "这", "原神"):
        yun = yun.replace(word, "")

    cut_text = " ".join(jieba.cut(yun))
    wc = wordcloud.WordCloud(
        font_path=font_path,
        background_color="white",
        width=1000,
        height=880,
    ).generate(cut_text)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Script entry: build the word cloud from an existing output.xls
# (that file is produced by mihoyo(), which must have been run beforehand).
info_of_b()

先运行mihoyo()函数进行爬虫，将数据写入output.xls文件中

这里的网址是在B站搜索关键词（本例中为“原神”）后，从地址栏复制的url，可以看到关键词被URL编码（百分号编码）为

%E5%8E%9F%E7%A5%9E

搜索结果有50页，每页20个视频

这里爬取标题、发布时间和播放量，以发布日期排序

将其以上述格式写入xls文件中

爬取时要注意，最好先退出账号登录再进行，因为频繁的爬取会被认为是对服务器的攻击，IP会被封禁一段时间。

然后读取xls数据进行词云生成

这里按照月度生成方便分析

2020年以前

2020年1月

2020年3月

2020年4月

2020年6月

2020年7月

2020年8月

2020年9月

2020年10月

2020年11月

2020年12月

2021年1月

相关阅读:
hdu 2492 树状数组 Ping pong
HDU 1532 基础EK Drainage Ditches
EK算法模板
 Codeforces Round #538 (Div. 2) (A-E题解)
Codeforces Global Round 1 (A-E题解)
Educational Codeforces Round 59 (Rated for Div. 2) DE题解
 Codeforces Round #535 (Div. 3) 题解
 Codeforces Round #534 (Div. 2) D. Game with modulo(取余性质+二分)
POJ2253：Frogger（改造Dijkstra）
POJ1797：Heavy Transportation（改造Dijkstra）
原文地址：https://www.cnblogs.com/ljy1227476113/p/14363739.html