• 抓取b站视频的带有数字的评论


    import requests
    import re
    import os
    import sys
    import json
    
    aid_list=[]
    info_list =[]
    title_list = []
    
    def getAllAVList(mid, size, page):
        """Collect the aid and title of a member's uploads into the module lists.

        Requests pages 1..``page`` of the member's submitted-video listing and
        appends every entry's ``aid`` to ``aid_list`` and ``title`` to
        ``title_list`` (same position in both lists), then prints ``aid_list``.

        Args:
            mid: Member id placed in the ``mid`` query parameter.
            size: Value for the ``pagesize`` query parameter.
            page: Number of listing pages to request, starting at page 1.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within 10 seconds.
        """
        endpoint = 'http://space.bilibili.com/ajax/member/getSubmitVideos'
        for n in range(1, page + 1):
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(
                endpoint,
                params={'mid': mid, 'pagesize': size, 'page': n},
                timeout=10,
            )
            # Fail early on HTTP errors instead of trying to parse an error page.
            response.raise_for_status()
            # Expected shape, per a sample response:
            # {"status": true, "data": {"vlist": [{"aid": ..., "title": ...}, ...]}}
            for video in response.json()['data']['vlist']:
                aid_list.append(video['aid'])
                title_list.append(video['title'])
        print(aid_list)
    
    # A message made up only of CJK ideographs (U+4E00-U+9FA5), ASCII letters
    # and digits, containing at least one digit. The \u escapes are required:
    # without the backslashes the class does not cover Chinese characters.
    _DIGIT_COMMENT_RE = re.compile(
        r'^[\u4E00-\u9FA5A-Za-z0-9]*[0-9][\u4E00-\u9FA5A-Za-z0-9]*$'
    )


    def getAllCommentList(item):
        """Append the digit-bearing comments of one video to ``info_list``.

        A ``'begin <title>'`` marker is appended (and printed) first, where the
        title is looked up in ``title_list`` at the position ``item`` has in
        ``aid_list``. Every reply whose message matches ``_DIGIT_COMMENT_RE``
        is then appended as ``'<uname>: <message>'``.

        Args:
            item: The video's aid; it must already be present in ``aid_list``.

        Raises:
            ValueError: If ``item`` is not in ``aid_list``.
            requests.Timeout: If a request is not answered within 10 seconds.
        """
        header = 'begin %s' % title_list[aid_list.index(item)]
        info_list.append(header)
        print(header)

        # The first request is only used to read the total comment count.
        count_url = ('http://api.bilibili.com/x/reply?type=1&oid=' + str(item)
                     + '&pn=1&nohot=1&sort=0')
        count_payload = json.loads(requests.get(count_url, timeout=10).text)
        comments_num = count_payload['data']['page']['count']

        # Assumes 20 comments per page; the extra page covers the remainder.
        last_page = comments_num // 20 + 1
        for n in range(1, last_page + 1):
            url = ('https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=' + str(n)
                   + '&type=1&oid=' + str(item) + '&sort=1&nohot=1')
            payload = json.loads(requests.get(url, timeout=10).text)
            # NOTE(review): a page past the last comment appears to come back
            # with "replies": null; treat that (or missing data) as an empty
            # page rather than crashing on iteration. Confirm against the API.
            replies = (payload.get('data') or {}).get('replies') or []
            for reply in replies:
                message = reply['content']['message']
                if _DIGIT_COMMENT_RE.match(message):
                    info_list.append(reply['member']['uname'] + ': ' + message)
    
    def saveTxt(filename, filecontent):
        """Write every entry of ``filecontent`` on its own line to a text file.

        Args:
            filename: Base name or path of the output file; it is converted
                with ``str()`` and ``.txt`` is appended.
            filecontent: Iterable of strings; each one is written followed by
                a newline character.

        The file is opened in ``'w'`` mode with UTF-8 encoding, so an existing
        file of the same name is overwritten.
        """
        filename = str(filename) + '.txt'
        with open(filename, 'w', encoding='utf-8') as txt:
            # The line terminator is the '\n' escape; a literal line break
            # inside the quotes would be a syntax error.
            txt.writelines(content + '\n' for content in filecontent)
    
    if __name__ == '__main__':
        # Step 1: fill aid_list / title_list from the member's video listing
        # (50 pages, one video per page).
        getAllAVList(mid=2019740, size=1, page=50)
        # Step 2: gather matching comments video by video. info_list is not
        # cleared between videos, so everything lands in one output file.
        for aid in aid_list:
            getAllCommentList(aid)
        # Step 3: dump the accumulated lines to abc.txt.
        saveTxt('abc', info_list)
    

    程序的主要思路是借助b站的api进行数据的提取。首先,流程是视频信息接口→视频id→评论接口的评论数量→页数→访问评论字符串→通过正则表达式筛选出含有数字的评论,写在文件中。
    json.loads函数将字符串转成字典格式。
    如果字符串里含有 \uxxxx 形式的字符(Unicode 转义序列),想把它还原成对应的字符,可以使用 string.encode('utf-8').decode('unicode_escape')
    中文、字母和数字的正则表示法是 [\u4E00-\u9FA5A-Za-z0-9](注意 \u 前的反斜杠不能省略,否则无法匹配中文)

    参考链接:

    1

    爬虫如何抓取b站评论,弹幕等内容? - 肥肥杨的回答 - 知乎

    2

    python: 关于解决'\u'开头的字符串转中文的方法

  • 相关阅读:
    cantor 数表
    利用form的“acceptcharset”在不同编码的页面间提交表单
    <li>标签,在文字超出宽度时引起混乱的解决办法
    java中 Integer.getInteger(String str)的疑惑
    SQL语句集锦
    禁用鼠标右键
    ROW_NUMBER() OVER函数的基本用法
    listview
    decodeResource
    LinkedList
  • 原文地址:https://www.cnblogs.com/tellw/p/13158653.html
Copyright © 2020-2023  润新知