• Scraping the comments on "我们是冠军" (We Are the Champions)


    Number one on the Weibo hot-search list; a popularity count of more than 350 million on Bilibili, with the screen packed with bullet comments; over 6 million viewers on Tencent Video; even CCTV News posted on Weibo to congratulate EDG. Today we use Python to scrape the comments under the Bilibili video "我们是冠军" (We Are the Champions) and do some visualization with them: the proper way to join in the cheering.

    Comment-scraping code:

    import csv
    import pprint
    import random
    import time
    import requests
    import openpyxl as opx
    import json

    # Open the CSV file and write the header row
    f = open('我们是冠军.csv', mode='a', encoding='utf-8-sig', newline='')
    csvWriter = csv.DictWriter(f, fieldnames=[
        '评论人',
        '性别',
        '点赞数',
        '评论时间',
        '评论内容',
    ])
    csvWriter.writeheader()  # write the header
    startStampTime = int(time.time() * 1000)
    # # Alternative: write to an Excel workbook instead of a CSV file
    # wb = opx.Workbook()
    # ws = wb.create_sheet(index=0)
    #
    # # Write the header row first
    # ws.cell(row=1, column=1, value='评论人')
    # ws.cell(row=1, column=2, value='性别')
    # ws.cell(row=1, column=3, value='点赞数')
    # ws.cell(row=1, column=4, value='评论时间')
    # ws.cell(row=1, column=5, value='评论内容')

    headers = {
        "cookie": "_uuid=BE35640F-EB4E-F87D-53F2-7A8FD5D50E3330964infoc; buvid3=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; CURRENT_BLACKGAP=1; CURRENT_QUALITY=0; rpdid=|(u))ku~m)kJ0J'uYJuRRRYmk; CURRENT_FNVAL=976; video_page_version=v_old_home_17; blackside_state=1; LIVE_BUVID=AUTO1516364619569495; sid=bqyo86kv; innersign=1; PVID=2",
        "referer": "https://www.xxx.com/video/BV12R4y1E7kn?spm_id_from=333.999.0.0",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
    }
    # # Initial row counter for the Excel variant
    # count = 2
    for page in range(1, 100 + 1):
        print(f'==================== Crawling page {page} ====================')
        time.sleep(random.randint(2, 5))  # random sleep between requests
        nextStampTime = int(time.time() * 1000)
        # Request URL
        url = f'https://api.xxx.com/x/v2/reply/main?callback=jQuery172046940903221511165_{startStampTime}&jsonp=jsonp&next={page}&type=1&oid=336587753&mode=3&plat=1&_={nextStampTime}'
        # Request the data
        response = requests.get(url=url, headers=headers)
        # print(response.text)
        # Strip the JSONP wrapper: the 42-character 'jQuery..._<timestamp>(' prefix and the trailing ')'
        json_data = json.loads(response.text[42:-1])
        # Extract the replies we want
        data = json_data['data']['replies']
        print(f'Page {page} contains: ' + str(len(data)))
        for item in data:
            # pprint.pprint(item)
            name = item['member']['uname']
            sex = item['member']['sex']
            like = item['like']
            ctime = item.get('ctime')
            # print(ctime)
            commenttime = time.strftime('%Y-%m-%d %H:%M', time.localtime(ctime))
            content = item['content']['message']
            # print(name, sex, like, commenttime, content, sep=' | ')
            dit = {
                '评论人': name,
                '性别': sex,
                '点赞数': like,
                '评论时间': commenttime,
                '评论内容': content,
            }
            print(dit)
            csvWriter.writerow(dit)
    print('Data collection finished!')
    f.close()
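
    One brittle step above is json.loads(response.text[42:-1]), which hard-codes the length of the JSONP callback prefix. As an alternative, if the endpoint also returns plain JSON when the callback and jsonp parameters are omitted (an assumption here, since the API host is anonymized as api.xxx.com), the slicing can be dropped and response.json() used instead; a minimal sketch:

    import time
    import requests

    def fetch_page(page, headers):
        # Assumes the endpoint serves plain JSON when callback/jsonp are left out
        url = 'https://api.xxx.com/x/v2/reply/main'
        params = {
            'next': page,        # page cursor
            'type': 1,           # resource type
            'oid': 336587753,    # video id
            'mode': 3,           # sort mode
            'plat': 1,
            '_': int(time.time() * 1000),
        }
        response = requests.get(url, headers=headers, params=params)
        return response.json()['data']['replies']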

    Now let's build a word cloud:

    import re
    import jieba
    import matplotlib.pyplot as plt
    import pandas as pd
    from wordcloud import WordCloud

    # Read the scraped comments
    df = pd.read_csv('我们是冠军.csv')

    # Drop duplicate rows and missing values
    df_new = df.drop_duplicates()  # deduplicate
    df_new = df_new.dropna()       # drop rows with missing values
    # print(df_new)
    STOPWORDS = {"回复", "@", "赵薇", "不是", "现在", "什么", "知道", "我们", "他们",
                 "就是", "但是", "自己", "问题", "一个", "没有", "这个", "点赞",
                 "热词", "系列", "热词系列"}

    # Take the distinct comment texts for segmentation
    textList = df_new['评论内容'].value_counts().sort_values().index.tolist()
    # print(textList)
    # Join the list into one string
    strText = ' '.join(textList)
    # Strip English letters, digits and some punctuation with a regex
    newTxt = re.sub(r"[A-Za-z0-9!%\[\],。]", "", strText)
    # print(newTxt)
    # Segment the Chinese text with jieba
    words = jieba.lcut(newTxt)

    # Build the word cloud from the segmented words
    wordcloudword = WordCloud(
        background_color='white',
        width=1080,
        height=960,
        # font_path = "../文悦新青年.otf",
        font_path='C:/Windows/Fonts/simhei.ttf',  # a font with Chinese glyphs is required
        max_words=150,
        scale=10,  # rendering scale (sharpness)
        max_font_size=100,
        stopwords=STOPWORDS,
        # mask = img_array,  # a background image mask can be set here
        collocations=False).generate(' '.join(words))

    plt.imshow(wordcloudword)
    plt.axis('off')
    plt.show()
    wordcloudword.to_file('wc.png')
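
    The word cloud is only one way to look at the CSV. As one more quick visualization (not part of the original code), the sketch below plots the gender split of commenters and lists the ten most-liked comments; it assumes the '我们是冠军.csv' file written by the scraper above and a Chinese-capable font such as SimHei:

    import matplotlib.pyplot as plt
    import pandas as pd

    plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese tick labels render
    plt.rcParams['axes.unicode_minus'] = False

    df = pd.read_csv('我们是冠军.csv').drop_duplicates().dropna()

    # Gender distribution of commenters
    df['性别'].value_counts().plot(kind='bar', rot=0, title='Commenter gender distribution')
    plt.tight_layout()
    plt.show()

    # Top 10 most-liked comments
    top10 = df.sort_values('点赞数', ascending=False).head(10)
    print(top10[['评论人', '点赞数', '评论内容']])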
  • Original article: https://www.cnblogs.com/mafu/p/15546347.html