• Scraping the Bilibili comment section


    import requests
    import time
    import json
    # required libraries


    def get_html(url):
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        # pretend to be an ordinary browser visit
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text


    def get_content(url):
        comments = []
        html = get_html(url)
        try:
            s = json.loads(html)
        except json.JSONDecodeError:
            print("json load error")
            return comments

        num = len(s['data']['replies'])  # number of comment entries on this page
        i = 0
        while i < num:
            comment = s['data']['replies'][i]  # one comment entry
            InfoDict = {}  # dict holding the fields of one comment
            InfoDict['username'] = comment['member']['uname']
            InfoDict['uid'] = comment['member']['mid']
            InfoDict['content'] = comment['content']['message']
            InfoDict['sex'] = comment['member']['sex']
            comments.append(InfoDict)
            i += 1
        return comments
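    For reference, get_content relies on the JSON response having roughly the shape sketched below. This is reconstructed from the field accesses in the parser above, not from official API documentation; the values are placeholders, so treat the structure as illustrative only:

    # Illustrative response shape, inferred from the parser above
    sample = {
        "data": {
            "replies": [
                {
                    "member": {"uname": "...", "mid": 123456, "sex": "..."},
                    "content": {"message": "..."},
                },
            ],
        },
    }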
    
    def Out2File(comments):
        with open('bilibili_comments.txt', 'a+', encoding='utf-8') as f:
            for user in comments:
                try:
                    f.write('name:{}\t uid:{}\t sex:{}\t \n comment:{}\t \n'.format(
                        user['username'], user['uid'], user['sex'], user['content']))
                except OSError:
                    print("Out2File error")
            print('current page saved')


    e = 0
    page = 1
    while e == 0:
        url = ("https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next="
               + str(page) + "&type=1&oid=677870443&mode=3&plat=1&_=1641278727643")
        try:
            content = get_content(url)
            print("page:", page)
            Out2File(content)
            page = page + 1
            # To lower the risk of an IP ban, pause 5 seconds every 10 pages.
            if page % 10 == 0:
                time.sleep(5)
        except Exception:
            e = 1  # on the last page there are no replies, so indexing fails and the loop ends
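
    The loop above stops by treating any exception as the end of the comment list: on the last page data['replies'] comes back empty, indexing into it fails, and e flips to 1. The stop condition can instead be read from the payload itself. The sketch below is a minimal illustration, not taken from the original post; in particular, the cursor.is_end field name is an assumption that should be checked against a real response:

    import requests

    def fetch_page(oid, page):
        # Fetch one page of top-level replies for video `oid`;
        # same endpoint and parameters as the script above.
        url = ("https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={}"
               "&type=1&oid={}&mode=3&plat=1".format(page, oid))
        r = requests.get(url, timeout=30, headers={'user-agent': 'Mozilla/5.0'})
        r.raise_for_status()
        return r.json()

    page = 1
    while True:
        data = fetch_page(677870443, page)['data']
        if not data.get('replies'):                    # empty or missing => done
            break
        # ...save data['replies'] as in get_content/Out2File above...
        if (data.get('cursor') or {}).get('is_end'):   # assumed end-of-list flag
            break
        page += 1

    Stopping on the payload keeps genuine failures (timeouts, HTTP errors) from being silently swallowed as "end of comments".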

    Reference video: https://www.bilibili.com/video/BV1fu411d7Hy?from=search&seid=3483579157564497530&spm_id_from=333.337.0.0

  • Original article: https://www.cnblogs.com/520520520zl/p/15762472.html