• python 下载bilibili视频


    说明:

    1.清晰度的选择要登录,暂时还没做,目前下载的视频清晰度都是默认的480P

    2.进度条仿linux的,参考了一些博客修改了下,侵删

    3.其他评论,弹幕之类的相关爬虫代码放在了https://github.com/teleJa/bilibili

    4.判断sys.argv那个地方是因为一些爬虫调用了该文件,如果感觉不方面,直接传递视频番号进去就可以了

    下载过程如图

    直接上代码:

      1 import requests
      2 import re
      3 import os
      4 import json
      5 import sys
      6 import math
      7 from lxml import etree
      8 
      9 
     10 class BLDSplider:
     11     regex_cid = re.compile(""cid":(.{8})")
     12 
     13     def __init__(self, aid):
     14         self.aid = aid
     15 
     16         self.origin_url = "https://www.bilibili.com/video/av{}?from=search&seid=9346373599622336536".format(aid)
     17         self.headers = {
     18             "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
     19         }
     20 
     21         self.url = "https://api.bilibili.com/x/player/playurl?avid={}&cid={}&qn=0&type=&otype=json"
     22 
     23     def check_dir(self, author_name):
     24         # 检查目录
     25         self.parent_path = "e:/bilibili/" + author_name + "/" + str(self.aid) + "/"
     26         if not os.path.exists(self.parent_path):
     27             os.makedirs(self.parent_path)
     28 
     29         self.video_name = self.parent_path + str(self.aid) + ".mp4"
     30 
     31     def parse_url(self, item):
     32         cid = item["cid"]
     33         print("aid:%s   cid:%s" % (str(self.aid), cid))
     34         title = item["title"]
     35         print("title:%s" % title)
     36 
     37         self.headers["Referer"] = self.origin_url
     38         # 视频
     39         response = requests.get(self.url.format(self.aid, cid), headers=self.headers)
     40         if response.status_code == 200:
     41             result = json.loads(response.content.decode())
     42             durl = result["data"]["durl"][0]
     43             video_url = durl["url"]
     44             print("video_url:%s" % video_url)
     45             # 视频大小
     46             size = durl["size"]
     47             print("size:%s,约%2.2fMB" % (size, size / (1024 * 1024)))
     48             video_response = requests.get(video_url, headers=self.headers, stream=True)
     49             if video_response.status_code == 200:
     50                 with open(self.video_name, "wb") as file:
     51                     buffer = 1024
     52                     count = 0
     53                     while True:
     54                         if count + buffer <= size:
     55                             file.write(video_response.raw.read(buffer))
     56                             count += buffer
     57                         else:
     58                             file.write(video_response.raw.read(size % buffer))
     59                             count += size % buffer
     60                         file_size = os.path.getsize(self.video_name)
     61                         # print("
    下载进度 %.2f %%" % (count * 100 / size), end="")
     62 
     63                         width = 50
     64                         percent = (count / size)
     65                         use_num = int(percent * width)
     66                         space_num = int(width - use_num)
     67                         percent = percent * 100
     68                         print('
    进度:[%s%s]    %d%%' % (use_num * '#', space_num * ' ', percent), file=sys.stdout,
     69                               flush=True, end="")
     70                         if size == count:
     71                             break
     72                 print("
    ")
     73 
     74     # 获取视频相关参数
     75     def get_video_info(self):
     76         response = requests.get(self.origin_url, headers=self.headers)
     77         item = dict()
     78         if response.status_code == 200:
     79             # author
     80             html_element = etree.HTML(response.content.decode())
     81             author = dict()
     82             author_name = html_element.xpath(
     83                 "/html/body/div[@id='app']/div[@class='v-wrap']/div[@class='r-con']/div[@id='v_upinfo']//a[@report-id='name']/text()")[
     84                 0]
     85             # 通常是微博,微信公众号等联系方式
     86             author_others = html_element.xpath(
     87                 "/html/body/div[@id='app']/div[@class='v-wrap']/div[@class='r-con']/div[@id='v_upinfo']//div[@class='desc']/@title")[
     88                 0]
     89             author["name"] = author_name
     90             author["others"] = author_others
     91             item["author"] = author
     92 
     93             # cid
     94             cid = BLDSplider.regex_cid.findall(response.content.decode())[0]
     95             item["cid"] = cid
     96             info_url = "https://api.bilibili.com/x/web-interface/view?aid={}&cid={}".format(self.aid, cid)
     97             info_response = requests.get(info_url, headers=self.headers)
     98             if info_response.status_code == 200:
     99                 data = json.loads(info_response.content.decode())["data"]
    100                 # 视频简介
    101                 desc = data["desc"]
    102                 item["desc"] = desc
    103 
    104                 # title
    105                 title = data["title"]
    106                 item["title"] = title
    107 
    108                 stat = data["stat"]
    109                 # 播放量
    110                 view = stat["view"]
    111                 item["view"] = view
    112 
    113                 # 弹幕
    114                 danmaku = stat["danmaku"]
    115                 item["danmaku"] = danmaku
    116 
    117                 # 评论
    118                 reply = stat["reply"]
    119                 item["reply"] = reply
    120 
    121                 # 硬币
    122                 coin = stat["coin"]
    123                 item["coin"] = coin
    124 
    125                 # 点赞
    126                 like = stat["like"]
    127                 item["like"] = like
    128 
    129                 # 收藏
    130                 favorite = stat["favorite"]
    131                 item["favorite"] = favorite
    132 
    133                 # 分享
    134                 share = stat["share"]
    135                 item["share"] = share
    136             self.check_dir(item["author"]["name"])
    137             # 视频参数
    138             with open(self.parent_path + "video_info.txt", "w") as file:
    139                 file.write(json.dumps(item, ensure_ascii=False, indent=2))
    140             return item
    141 
    142     def run(self):
    143         item = self.get_video_info()
    144         self.parse_url(item)
    145 
    146 
    147 def main():
    148     # 55036734
    149     aid = 55036734
    150     if len(sys.argv) >= 2:
    151         if sys.argv[1]:
    152             aid = sys.argv[1]
    153     splider = BLDSplider(aid)
    154     splider.run()
    155 
    156 
    157 if __name__ == '__main__':
    158     main()

     

  • 相关阅读:
    利用别名切换索引流程Elasticsearch 7.7
    关于误删除elasticSearch 索引,怎么能快速找回?
    总结traefik 在k8s 环境中的配置文件
    ES ElasticSearch 7.x 下动态扩大索引的shard数量
    Java框架Spring Boot & 服务治理框架Dubbo & 应用容器引擎Docker 实现微服务发布
    谈一下Docker与Kubernetes集群的日志和日志管理-转载
    Elasticsearch优化 & filebeat配置文件优化 & logstash格式配置 & grok实践
    Nginx错误日志(error_log)配置及信息详解
    赵总的运维体系专栏学习的总结
    APP或者前端通过识别用户代理详细信息和浏览器数据进行安全防御
  • 原文地址:https://www.cnblogs.com/tele-share/p/11208351.html
Copyright © 2020-2023  润新知