• python3爬虫-爬取B站排行榜信息


    import requests, re, time, os
    
    # Ranking boards handled by the crawler: URL slug -> display name used in
    # the output file name.
    category_dic = dict(
        all="全站榜",
        origin="原创榜",
        rookie="新人榜",
    )

    # Ranking period in days (sent as the `day` query parameter) -> display name.
    day_dic = {
        1: "日排行榜",
        3: "三日排行榜",
        7: "周排行榜",
        30: "月排行榜",
    }

    # Partition id (sent as the `rid` query parameter) -> display name, shared by
    # the "all" and "origin" boards.
    all_or_origin_dic = dict(
        [
            (0, "全站"),
            (1, "动画"),
            (168, "国创相关"),
            (3, "音乐"),
            (129, "舞蹈"),
            (4, "游戏"),
            (36, "科技"),
            (188, "数码"),
            (160, "生活"),
            (119, "鬼畜"),
            (155, "时尚"),
            (5, "娱乐"),
            (181, "影视"),
        ]
    )

    # Display name -> id tables for the bangumi and cinema boards. They are not
    # registered in BaseDict below, so the crawler does not use them.
    bangumi_dic = {"番剧": 1, "国产动画": 4}
    cinema_dic = {"记录篇": 177, "电影": 23, "电视剧": 11}

    # The rookie board has the same partitions as the table above except 168.
    rookie_dic = {
        rid: name for rid, name in all_or_origin_dic.items() if rid != 168
    }

    # Board slug -> partition table to iterate for that board.
    BaseDict = {
        "all": all_or_origin_dic,
        "origin": all_or_origin_dic,
        "rookie": rookie_dic,
    }

    # Board slug -> value sent as the `type` query parameter.
    dic = {
        slug: type_id
        for type_id, slug in enumerate(("all", "origin", "rookie"), start=1)
    }
    
    # Directory where the ranking text files are saved. A raw string is required:
    # in a normal literal "\b" is the backspace character (U+0008), not a
    # backslash followed by "b", which corrupts the path.
    # NOTE(review): "D:图片" has no separator after the drive letter, so Windows
    # resolves it relative to the current directory on D: -- confirm intent.
    base_path = r"D:图片\bilibili_ranking"
    
    
    def get_url():
        """Generate one ranking API request per board, partition and period.

        Iterates the boards in ``category_dic``, the partition ids registered for
        each board in ``BaseDict`` and the periods in ``day_dic``.

        Yields:
            A ``(url, [category, rid, day])`` tuple: the request URL and the three
            keys it was built from.
        """
        template = (
            "https://api.bilibili.com/x/web-interface/ranking"
            "?jsonp=jsonp&rid={}&day={}&type={}&arc_type=0&callback=__jp1"
        )
        supported = ("all", "origin", "rookie")
        for category in category_dic:
            # Only boards with a partition table and a type id are crawled.
            if category not in supported:
                continue
            for rid in BaseDict.get(category).keys():
                for day in day_dic:
                    request_url = template.format(rid, day, dic.get(category))
                    yield request_url, [category, rid, day]
    
    
    # Crawl every URL produced by get_url() and append the parsed ranking entries
    # to one text file per (board, partition, period) under base_path.
    s = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
        "Referer": "https://www.bilibili.com/ranking/all/0/0/3"
    }
    # Double quotes are stripped from the response before matching, so the pattern
    # looks for bare `key:value,` pairs. Groups: author, play, pts, title.
    # NOTE(review): a value that itself contains a comma (e.g. a title) is cut at
    # that comma -- confirm whether this is acceptable or parse the JSON instead.
    pattern = r'.*?author:(?P<author>.*?),.*?play:(?P<play>.*?),.*?pts:(?P<pts>.*?),.*?title:(?P<title>.*?),'
    record_re = re.compile(pattern)  # compiled once instead of on every request
    # open() does not create missing directories, so make sure the target exists.
    os.makedirs(base_path, exist_ok=True)
    url_list = get_url()
    for url in url_list:
        print("向{}发请求".format(url[0]))
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = s.get(url=url[0], headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as err:
            # Report the failure and move on to the next ranking.
            print("请求失败,跳过:{}".format(err))
            time.sleep(2)
            continue
        data = response.text.replace('"', "")
        result_list = record_re.findall(data)
        path = os.path.join(base_path, "{}-{}-{}".format(category_dic.get(url[1][0]),
                                                         rookie_dic.get(url[1][1]) or all_or_origin_dic.get(url[1][1]),
                                                         day_dic.get(url[1][2])))
        # The context manager closes the file even if a write raises.
        with open(path + ".txt", "a", encoding="utf-8") as f:
            print('正在写入....{}'.format(path + ".txt"))
            for index, res in enumerate(result_list):
                # res is (author, play, pts, title), in the order of the groups.
                f.write("排名:{}\n".format(index + 1))
                f.write("标题:{}\n".format(res[3]))
                f.write("作者:{}\n".format(res[0]))
                f.write("播放量:{}\n".format(res[1]))
                f.write("综合分数:{}\n".format(res[2]))
                f.write("-" * 90 + "\n")
        # Pause between requests to avoid hammering the API.
        time.sleep(2)
  • 相关阅读:
    [Beta阶段]第四次Scrum Meeting
    [Beta阶段]第三次Scrum Meeting
    [Beta阶段]第二次Scrum Meeting
    [Beta阶段]第一次Scrum Meeting
    [Alpha阶段]事后分析博客
    [Alpha阶段]无人转会申请
    Server MyEclipse Tomcat v7.0 was unable to start within 45 seconds. If the server requires more time
    关于单选框、下拉框、复选框的数据回显问题以及全选和全不选
    学习spring和spring mvc过程中遇到的一些问题
    springmvc常用注解之@Controller和@RequestMapping
  • 原文地址:https://www.cnblogs.com/zhuchunyu/p/10765863.html
Copyright © 2020-2023  润新知