• python 爬取bilibili 视频信息


    抓包时发现,子菜单请求数据时一般需要 rid,但也确实存在例外(如 游戏->游戏赛事 不使用 rid),对这类情况本文未做处理。rid 一般在主菜单的响应中;但有的菜单(如番剧)的 rid 在子菜单的 url 中。此外,返回的 data 中含有页数相关信息,可以据此确定爬取的页面数量。

      1 # -*- coding: utf-8 -*-
      2 # @author: Tele
      3 # @Time  : 2019/04/08 下午 1:01
      4 import requests
      5 import json
      6 import os
      7 import re
      8 import shutil
      9 from lxml import etree
     10 
     11 
     12 # 爬取每个菜单的前5页内容
     13 class BiliSplider:
     14     def __init__(self, save_dir, menu_list):
     15         self.target = menu_list
     16         self.url_temp = "https://www.bilibili.com/"
     17         self.headers = {
     18             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
     19          #  "Cookie": "LIVE_BUVID=AUTO6715546997211617; buvid3=07192BD6-2288-4BA5-9259-8E0BF6381C9347193infoc; stardustvideo=1; CURRENT_FNVAL=16; sid=l0fnfa5e; rpdid=bfAHHkDF:cq6flbmZ:Ohzhw:1Hdog8",
     20         }
     21         self.proxies = {
     22             "http": "http://61.190.102.50:15845"
     23         }
     24         self.father_dir = save_dir
     25 
     26     def get_menu_list(self):
     27         regex = re.compile("//")
     28         response = requests.get(self.url_temp, headers=self.headers)
     29         html_element = etree.HTML(response.content)
     30         nav_menu_list = html_element.xpath("//div[@id='primary_menu']/ul[@class='nav-menu']/li/a")
     31 
     32         menu_list = list()
     33         for item in nav_menu_list:
     34             menu = dict()
     35             title = item.xpath("./*/text()")
     36             menu["title"] = title[0] if len(title) > 0 else None
     37             href = item.xpath("./@href")
     38             menu["href"] = "https://" + regex.sub("", href[0]) if len(href) > 0 else None
     39 
     40             # 子菜单
     41             submenu_list = list()
     42             sub_nav_list = item.xpath("./../ul[@class='sub-nav']/li")
     43             if len(sub_nav_list) > 0:
     44                 for sub in sub_nav_list:
     45                     submenu = dict()
     46                     sub_title = sub.xpath("./a/span/text()")
     47                     submenu["title"] = sub_title[0] if len(sub_title) > 0 else None
     48                     sub_href = sub.xpath("./a/@href")
     49                     submenu["href"] = "https://" + regex.sub("", sub_href[0]) if len(sub_href) > 0 else None
     50                     submenu_list.append(submenu)
     51             menu["submenu_list"] = submenu_list if len(submenu_list) > 0 else None
     52             menu_list.append(menu)
     53         return menu_list
     54 
     55     # rid=tid
     56     def parse_index_url(self, url):
     57         result_list = list()
     58         # 正则匹配
     59         regex = re.compile("<script>window.__INITIAL_STATE__=(.*)</script>")
     60         response = requests.get(url, headers=self.headers)
     61         result = regex.findall(response.content.decode())
     62         temp = re.compile("(.*);(function").findall(result[0]) if len(result) > 0 else None
     63         sub_list = json.loads(temp[0])["config"]["sub"] if temp else list()
     64         if len(sub_list) > 0:
     65             for sub in sub_list:
     66                 # 一些子菜单没有rid,需要请求不同的url,暂不处理
     67                 if "tid" in sub:
     68                     if sub["tid"]:
     69                         sub_menu = dict()
     70                         sub_menu["rid"] = sub["tid"] if sub["tid"] else None
     71                         sub_menu["title"] = sub["name"] if sub["name"] else None
     72                         result_list.append(sub_menu)
     73                 else:
     74                     pass
     75 
     76         return result_list
     77 
     78     # 最新动态  region?callback
     79     # 数据 newlist?callback
     80     def parse_sub_url(self, item):
     81         self.headers["Referer"] = item["referer"]
     82         url_pattern = "https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20"
     83 
     84         # 每个菜单爬取前5页
     85         for i in range(1, 6):
     86             data = dict()
     87             url = url_pattern.format(item["rid"], i)
     88             print(url)
     89             try:
     90                 response = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10)
     91             except:
     92                 return
     93             if response.status_code == 200:
     94                 data["content"] = json.loads(response.content.decode())["data"]
     95                 data["title"] = item["title"]
     96                 data["index"] = i
     97                 data["menu"] = item["menu"]
     98                 # 保存数据
     99                 self.save_data(data)
    100             else:
    101                 print("请求超时")  # 一般是403,被封IP了
    102 
    103     def save_data(self, data):
    104         if len(data["content"]) == 0:
    105             return
    106         parent_path = self.father_dir + "/" + data["menu"] + "/" + data["title"]
    107         if not os.path.exists(parent_path):
    108             os.makedirs(parent_path)
    109         file_dir = parent_path + "/" + "" + str(data["index"]) + "页.txt"
    110 
    111         # 保存
    112         with open(file_dir, "w", encoding="utf-8") as file:
    113             file.write(json.dumps(data["content"], ensure_ascii=False, indent=2))
    114 
    115     def run(self):
    116         # 清除之前保存的数据
    117         if os.path.exists(self.father_dir):
    118             shutil.rmtree(self.father_dir)
    119 
    120         menu_list = self.get_menu_list()
    121         menu_info = list()
    122         # 获得目标菜单信息
    123         # 特殊列表,一些菜单的rid必须从子菜单的url中获得
    124         special_list = list()
    125         for menu in menu_list:
    126             for t in self.target:
    127                 if menu["title"] == t:
    128                     if menu["title"] == "番剧" or menu["title"] == "国创" or menu["title"] == "影视":
    129                         special_list.append(menu)
    130                     menu_info.append(menu)
    131                     break
    132 
    133         # 目标菜单的主页
    134         if len(menu_info) > 0:
    135             for info in menu_info:
    136                 menu_index_url = info["href"]
    137                 # 处理特殊列表
    138                 if info in special_list:
    139                     menu_index_url = info["submenu_list"][0]["href"]
    140                 # 获得rid
    141                 result_list = self.parse_index_url(menu_index_url)
    142                 print(result_list)
    143                 if len(result_list) > 0:
    144                     for item in result_list:
    145                         # 大菜单
    146                         item["menu"] = info["title"]
    147                         item["referer"] = menu_index_url
    148                         # 爬取子菜单
    149                         self.parse_sub_url(item)
    150 
    151 
    152 def main():
    153     target = ["动画", "番剧", "国创", "音乐", "舞蹈", "游戏", "科技", "数码", "生活", "鬼畜", "时尚", "广告", "娱乐", "影视"]
    154     splider = BiliSplider("f:/bili_splider", target)
    155     splider.run()
    156 
    157 
    158 if __name__ == '__main__':
    159     main()

     可以看到番剧少了新番时间表与番剧索引,因为这两个请求不遵循https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20的格式,类似的不再赘述

     

  • 相关阅读:
    华为帐号服务助力应用运营和用户转化
    【接入指南】华为帐号服务Authorization Code模式介绍与接入步骤详解
    华为游戏登录验签失败can not find publicKey of the cp
    [古文观止]《相州昼锦堂记》(宋 欧阳修)
    [源码分析] Dynomite 分布式存储引擎 之 DynoJedisClient(2)
    [源码分析] Dynomite 分布式存储引擎 之 DynoJedisClient(1)
    Amazon Dynamo系统架构
    [从源码学设计] Flume 之 memory channel
    [阿里DIEN] 深度兴趣进化网络源码分析 之 Keras版本
    [从源码学设计]蚂蚁金服SOFARegistry之延迟操作
  • 原文地址:https://www.cnblogs.com/tele-share/p/10673741.html
Copyright © 2020-2023  润新知