• 爬取虎牙小姐姐热舞 Python


    最近疫情在家无聊得一批。爬点虎牙小姐姐的视频养养眼(前面其实有爬过,只是那时候比较懒,用的顺序爬取。)还有就是,自己也在做视频剪辑,将视频爬下来做素材练练手。爬虫思路一般是:确定数据源,然后对数据源进行分析,最后撸码保存。声明:本文仅做学习交流使用,如有侵权,请联系我删除。

    第一种方式,还是用函数写:

     1 import os.path
     2 import re
     3 import concurrent.futures
     4 import requests
     5 
     6 def change_title(orginal_title):
     7     # 一个更改标题的函数
     8     pattern = re.compile(r'[\\\/\:\*\?\"\<\>\|]')
     9     new_title = re.sub(pattern, '-', orginal_title)
    10     return new_title
    11 
    12 def get_response(page_url):
    13     headers = {
    14         'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
    15     }
    16     # 提取html内容的
    17     response = requests.get(url=page_url, headers=headers)
    18     response.raise_for_status()
    19     response.encoding = response.apparent_encoding
    20     response.encoding = 'utf-8'
    21     return response
    22 
    23 def save_video(videoRealUrl, videoName, videoQuality):
    24     # 先创建文件夹
    25     filePath = './虎牙video/' # 在当前目录下创建一个目录
    26     if not os.path.exists(filePath):
    27         os.mkdir(filePath)
    28     # 保存视频的,保存的视频包括标题,
    29     videoContent = get_response(page_url=videoRealUrl).content
    30 
    31     try:
    32         with open(filePath + videoName + ' - ' + videoQuality + ' - ' + '.mp4', mode='wb') as f:
    33             f.write(videoContent)
    34             print(f'正在保存---------------------->{videoName},请稍等!')
    35     except:
    36         print('有异常,请检查!')
    37 
    38 def main(page_url):
    39     # 主函数
    40     # 开始解析列表页
    41     video_category_data = get_response(page_url=page_url).text
    42     videoIds = re.findall('//v.huya.com/play/(\d+).html', video_category_data)
    43     for videoId in videoIds:
    44         # 开始拼接视频播放的请求url
    45         videoRequestUrl = f'https://liveapi.huya.com/moment/getMomentContent?&videoId={videoId}&_=1649038571444'
    46         # 获取json数据
    47         video_json_data = get_response(page_url=videoRequestUrl).json()
    48         # 通过json数据解析想要的数据
    49         videoName = video_json_data['data']['moment']['title']
    50         videoName = change_title(videoName) # 以防万一,更改下标题
    51         videoRealUrl = video_json_data['data']['moment']['videoInfo']['definitions'][0]['url']
    52         videoQuality = video_json_data['data']['moment']['videoInfo']['definitions'][0]['defName']
    53         save_video(videoRealUrl, videoName, videoQuality)
    54 if __name__ == "__main__":
    55     app = concurrent.futures.ThreadPoolExecutor(max_workers=10)
    56     for page in range(1, 11):
    57         print(f'--------------------------------------开始采集第{page}页数据!--------------------------------------')
    58         url = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
    59         app.submit(main, url)
    60     app.shutdown()

    第二种方式是面向对象的方式:

     1 import re
     2 import os
     3 import requests
     4 import json
     5 
     6 def change_title(orginal_title):
     7     # 一个更改标题的函数
     8     pattern = re.compile(r'[\\\/\:\*\?\"\<\>\|]')
     9     new_title = re.sub(pattern, '-', orginal_title)
    10     return new_title
    11 
    12 def video_stored_path():
    13     # 一个创建文件夹的函数
    14     filePath = './虎牙video/'
    15     if not os.path.exists(filePath):
    16         os.mkdir(filePath)
    17     return filePath
    18 class huyaVideoSpider():
    19     headers = {
    20         'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
    21     }
    22 
    23     def __init__(self, url, headers=headers):
    24         self.url = url
    25         self.headers = headers
    26 
    27     def get_response(self, page_url):
    28         # 获取网页的返回信息
    29         response = requests.get(url=page_url)
    30         response.raise_for_status()
    31         response.encoding = response.apparent_encoding
    32         response.encoding = 'utf-8'
    33         return response
    34 
    35     def save_video(self, videoContentUrl, videoTitle, videoQuality):
    36         # 获取路径
    37         filePath = video_stored_path()
    38         # 保存视频的方法
    39         videoContent = self.get_response(videoContentUrl).content
    40 
    41         try:
    42             with open(filePath + videoTitle + ' - ' + videoQuality + '.mp4', mode='wb') as f:
    43                 f.write(videoContent)
    44                 print(f'正在保存----------------------->{videoTitle},请稍等!')
    45         except:
    46             print('有异常,请检查!')
    47 
    48     def parse_category_page(self, category_page_url):
    49         # 解析目录页
    50         video_category_data = self.get_response(page_url=self.url).text
    51         # 提取视频id
    52         video_ids = re.findall('//v.huya.com/play/(\d+).html', video_category_data)
    53         # 开始拆分id拼接视频请求地址
    54         for video_id in video_ids:
    55             video_request_url = f'https://liveapi.huya.com/moment/getMomentContent?&videoId={video_id}&_=1649038571444'
    56             # 开始解析json数据
    57             json_data = json.loads(self.get_response(page_url=video_request_url).text)
    58             videoTitle = json_data['data']['moment']['title']
    59             videoTitle = change_title(videoTitle)
    60             videoContentUrl = json_data['data']['moment']['videoInfo']['definitions'][0]['url']
    61             videoQuality = json_data['data']['moment']['videoInfo']['definitions'][0]['defName']
    62             self.save_video(videoContentUrl, videoTitle, videoQuality)
    63 
    64     def run(self):
    65         self.parse_category_page(self.url)
    66 
    67 if __name__ == "__main__":
    68     for page in range(1, 11):
    69         print(f'--------------------------------正在采集第{page}页视频,请稍等!--------------------------------')
    70         url = f'https://v.huya.com/g/all?set_id=51&order=hot&page={page}'
    71         app = huyaVideoSpider(url=url)
    72         app.run()
    73         break

    还有一种方式是根据输入的搜索关键词和指定页码下载视频。参考:爬取酷我音乐平台的付费音乐的第二段面向对象代码。

  • 相关阅读:
    Android 操作系统架构开篇
    《构建之法》读后感
    《梦断代码》读后感
    学习日报
    学习日报
    记账本开发4
    记账本开发3
    学习日报
    学习日报
    记账本开发2
  • 原文地址:https://www.cnblogs.com/mafu/p/16098598.html
Copyright © 2020-2023  润新知