• b站排行榜-爬虫


    import requests
    from lxml import etree
    import re
    import time
    import json
    import threading
    import urllib3
    urllib3.disable_warnings()
    
    
    # Entry page for the Bilibili ranking index; channel URLs are built from it.
    url = "https://www.bilibili.com/ranking/"
    
    headers = {
        # Desktop Chrome UA so the site serves the regular HTML page.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    
    
    # Data for all channels, keyed by channel name (populated in __main__).
    c_data = {}
    # Fetch the list of ranking channels (name + tid) from the ranking page.
    def channels_work():
        """Return the channel descriptors embedded as JSON in the page HTML."""
        page = requests.get(url=url, headers=headers).text
        # The channel list sits inline in a script tag between the
        # "channels" key and the "showTypes" key; cut it out with a regex.
        fragments = re.findall('"channels":(.*?),"showTypes":', page)
        # Decode the JSON array of channel objects (each carries name + tid).
        channel_list = json.loads(fragments[0])
        print("所有分类信息爬取ok")
        print(channel_list)
        return channel_list
    
    
    # Collect the homepage URL of every UP listed on one channel's ranking page.
    def channels_detail(c_url,c_name,all_up):
        """Append each ranked uploader's space URL to the shared all_up list."""
        page = requests.get(url=c_url, headers=headers).text
        doc = etree.HTML(page)
        # One <li> per ranked video; the detail anchor links to the UP's space.
        entries = doc.xpath("//ul[@class='rank-list']/li")
        all_up.extend(
            "https:" + entry.xpath(".//div[@class='detail']/a/@href")[0]
            for entry in entries
        )
        print(c_name+"排行榜数据爬取ok")
    
    
    def up_detail(url,proxies):
        """Scrape one UP's profile, relation stats, view totals, charge count
        and video list through the public bilibili APIs, then print the result.

        url:     the UP's space homepage, e.g. https://space.bilibili.com/<mid>
        proxies: requests-style proxy mapping used for every API call
        Returns the assembled `up` dict (thread-target callers ignore it).
        """
        up = {}
        up['video'] = []

        # The mid (member id) is the last path segment of the space URL.
        lis = url.split("/")
        up['up_mid'] = lis[-1]
        print("开始爬取mid为"+str(up['up_mid'])+"的UP主的信息")

        # Profile API: name, avatar, sex, signature, level, fans badge.
        url1 = 'https://api.bilibili.com/x/space/acc/info?mid='+str(up['up_mid'])
        res = requests.get(url=url1, headers=headers,proxies=proxies,verify=False).text
        up_d = json.loads(res)
        up['up_name'] = up_d["data"]["name"]
        up['up_face'] = up_d["data"]["face"]
        up['up_sex'] = up_d["data"]["sex"]
        up['up_sign'] = up_d["data"]["sign"]
        up['up_level'] = up_d["data"]["level"]
        up['up_fans_badge'] = up_d["data"]["fans_badge"]

        # Relation API: following / follower counts. Sample payload:
        # {"code": 0, "message": "0", "ttl": 1,"data": {"mid": 18775476, "following": 128, "whisper": 0, "black": 0, "follower": 519397}}
        url2 = 'https://api.bilibili.com/x/relation/stat?vmid=' + str(up['up_mid'])
        res = requests.get(url=url2, headers=headers,proxies=proxies,verify=False).text
        up_d = json.loads(res)
        up['up_following'] = up_d['data']['following']
        up['up_follower'] = up_d['data']['follower']

        # Upstat API: archive (video) views, article views, likes. Sample:
        # {"code": 0, "message": "0", "ttl": 1,"data": {"archive": {"view": 37989388}, "article": {"view": 560}, "likes": 1688691}}
        url3 = 'https://api.bilibili.com/x/space/upstat?mid=' + str(up['up_mid'])
        res = requests.get(url=url3, headers=headers,proxies=proxies,verify=False).text
        up_d = json.loads(res)
        up['up_archive'] = up_d['data']['archive']['view']
        up['up_likes'] = up_d['data']['likes']
        up['up_article'] = up_d['data']['article']['view']

        # Charge ("elec") API. Sample:
        # {"code":0,"data":{"display_num":0,"count":13,"total_count":994,"list":...
        url4 = 'https://elec.bilibili.com/api/query.rank.do?mid=' + str(up['up_mid'])
        res = requests.get(url=url4, headers=headers,proxies=proxies,verify=False).text
        up_d = json.loads(res)
        try:
            up['up_total_count'] = up_d['data']['total_count']
        except (KeyError, TypeError):
            # Narrowed from a bare `except:` — only a missing/None 'data'
            # payload (charging disabled) should fall back to 0; any other
            # error ought to surface instead of being silently swallowed.
            up['up_total_count'] = 0

        # Video search API: ask for the total count first, then page through
        # the full list in get_video().
        url5 = 'https://api.bilibili.com/x/space/arc/search?mid='+str(up['up_mid'])+'&ps=1&pn=1'
        res = requests.get(url=url5, headers=headers,proxies=proxies,verify=False).text
        up_d = json.loads(res)
        count = up_d['data']['page']['count']
        up = get_video(count,up)

        print(up)
        return up
    
    
    def get_video(count,up):
        """Fetch all of an UP's videos, 100 per page, into up['video'].

        count: total number of videos reported by the search API
        up:    dict being filled; must contain 'up_mid' and a 'video' list
        Returns the same `up` dict.
        """
        pn = 1
        while count > 0:
            url5 = 'https://api.bilibili.com/x/space/arc/search?mid='+str(up['up_mid'])+'&ps=100&pn='+str(pn)
            res = requests.get(url=url5, headers=headers).text
            up_d = json.loads(res)
            for video in up_d['data']['list']['vlist']:
                # BUG FIX: build a fresh dict for every video. The original
                # created one dict per page and appended the *same object*
                # repeatedly, so every entry aliased one dict that ended up
                # holding only the last video's fields.
                v = {
                    'title': video['title'],
                    'pic_url': video['pic'],
                    'comment': video['comment'],
                    'video_review': video['video_review'],
                    'created': video['created'],
                }
                up['video'].append(v)
            pn += 1
            count -= 100
        return up
    
    # Scrape a free-proxy list, find one working proxy, then launch a thread
    # that scrapes the UP whose homepage URL is `i` through that proxy.
    def ip_run(i,ts2):
        """Try proxies from xicidaili until one reaches bilibili, then start
        an up_detail worker thread for homepage `i` and record it in ts2."""
        url = 'https://www.xicidaili.com/nn/'
        ip_response = requests.get(url=url,headers=headers).text

        # BUG FIX: the original patterns used "d+" (the literal letter d) and
        # an unescaped ".", so they never matched any IP address or port.
        ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response, re.S)
        ports = re.findall(r"<td>(\d+)</td>", ip_response, re.S)

        for ip, port in zip(ips, ports):
            proxies = {
                "http": "http://" + ip + ":" + port,
                "https": "http://" + ip + ":" + port,
            }
            try:
                # Probe the proxy against bilibili with a short timeout; the
                # response body itself is irrelevant.
                requests.get('https://space.bilibili.com/337312411', proxies=proxies, timeout=3)
                print("ip能使用")
                # Proxy works — use it for this UP's detail scrape.
                print("开始爬取url为" + i + "的up主详细信息")
                t = threading.Thread(target=up_detail, args=(i,proxies))
                t.start()
                time.sleep(3)
                ts2.append(t)
                break
            except Exception:
                print("ip不能使用")
    
    if __name__ == '__main__':
        # Step 1: discover every ranking channel (name + tid).
        channels = channels_work()

        # Step 2: scrape each channel's ranking page on its own thread,
        # accumulating every listed UP's homepage URL into all_up.
        all_up = []
        channel_threads = []
        detail_threads = []
        for chan in channels:
            # Channel ranking URL: <base>/all/<tid>/1/3
            chan_url = url + "all/" + str(chan['tid']) + "/1/3"
            c_data[chan["name"]] = []
            worker = threading.Thread(target=channels_detail, args=(chan_url, chan['name'], all_up))
            worker.start()
            channel_threads.append(worker)

        # Wait for all channel pages before touching all_up.
        for worker in channel_threads:
            worker.join()

        # Step 3: for each UP homepage, find a usable proxy and scrape the
        # details on a worker thread (ip_run records the thread in the list).
        for home in all_up:
            ip_run(home, detail_threads)

        for worker in detail_threads:
            worker.join()

        print("爬取所有数据完成")
    
  • 相关阅读:
    swift--使用URLSession异步加载图片
    swift--浮点数转换成整数(四舍五入/直接截断)
    swift--环形进度条(UIActivityIndicatorView)的用法
    swift--Timer实现定时器功能,每个一段时间执行具体函数,可以重复,也可以只执行一次
    HTML节点树
    网页的结构
    网页的组成
    HTTP 请求过程
    HTTP 基础术语
    《投资最重要的事》
  • 原文地址:https://www.cnblogs.com/zx125/p/12848479.html
Copyright © 2020-2023  润新知