爬取梨视频步骤:
-
1.爬取梨视频主页,获取主页所有的详情页链接
- url: https://www.pearvideo.com/- 1) 往url发送请求,获取主页的html文本 - 2) 解析并提取详情页相对路径video_1637593,与主页拼接即可 - 拼接后获取的是电影详情页url: - detail_url: https://www.pearvideo.com/ + video_1637593
-
2.往电影详情页发送请求,解析并提取真实视频url--> .mp4后缀的视频url
- 1) 往detail_url发送请求,获取detail_url的html文本
- 2) 解析并提取详情页中视频的真实url
案例一:
单线程同步爬取
import requests
import re
import uuid
# 1.发送请求
def get_html(url):
res = requests.get(url)
return res
# 2.解析数据
# 解析主页,获取视频详情页url
def parse_html(res):
# 获取所有视频的id
video_id_data = re.findall('<a href="video_(.*?)"',res.text, re.S)
"""
<a href="video_(.*?)" class="actwapslide-link">
"""
return video_id_data
# 3.请求 视频详情页,并解析出视频链接
def parse_detail(video_detail_url):
detail_html = requests.get(video_detail_url)
video_url = re.findall('srcUrl="(.*?)"',detail_html.text, re.S)[0]
"""
srcUrl="https://video.pearvideo.com/mp4/adshort/20191231/cont-1637727-14751751_adpkg-ad_hd.mp4"
"""
return video_url
# 4.保存数据
def save_video(video_url):
print('开始保存视频')
res_video = requests.get(video_url)
with open(f'{str(uuid.uuid4())}.mp4','wb') as f:
for line in res_video.iter_content():
f.write(line)
print('结束保存视频')
if __name__ == '__main__':
url = 'https://www.pearvideo.com'
res = get_html(url)
# 1.对梨视频主页进行解析,提取所有视频详情页的绝对路径
video_id_data = parse_html(res)
for video_id in video_id_data:
video_detail_url = url + '/video_' + video_id
# 2.往电影详情页发送请求,并解析
video_url = parse_detail(video_detail_url)
# 3.保存视频
save_video(video_url)
案例二:
多线程异步爬取
import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor
# 创建线程池,最大连接数为50
pool = ThreadPoolExecutor(50)
# 1.发送请求
def get_html(url):
res = requests.get(url)
return res
# 2.解析数据
# 解析主页,获取视频详情页url
def parse_html(res):
# 获取所有视频的id
video_id_data = re.findall('<a href="video_(.*?)"',res.text, re.S)
"""
<a href="video_(.*?)" class="actwapslide-link">
"""
return video_id_data
# 3.请求 视频详情页,并解析出视频链接
def parse_detail(res): # res对象 --- 》 {'result': response}
detail_html = res.result()
# 通过回调得到的response参数是一个对象
video_url = re.findall('srcUrl="(.*?)"',detail_html.text, re.S)[0]
"""
srcUrl="https://video.pearvideo.com/mp4/adshort/20191231/cont-1637727-14751751_adpkg-ad_hd.mp4"
"""
# 异步提交任务爬取真实视频数据,并保存
pool.submit(save_video, video_url)
# 4.保存数据
def save_video(video_url):
print('开始保存视频')
res_video = requests.get(video_url)
with open(f'{str(uuid.uuid4())}.mp4','wb') as f:
for line in res_video.iter_content():
f.write(line)
print('结束保存视频')
if __name__ == '__main__':
import time
url = 'https://www.pearvideo.com'
res = get_html(url)
# 1.对梨视频主页进行解析,提取所有视频详情页的绝对路径
video_id_data = parse_html(res)
for video_id in video_id_data:
video_detail_url = url + '/video_' + video_id
time.sleep(0.1)
# 循环并发异步提交任务, add_done_callback将get_html任务的执行结果,回调给parse_detail函数
pool.submit(get_html, video_detail_url).add_done_callback(parse_detail)