需求:爬取梨视频中的视频数据
https://www.pearvideo.com/category_4
代码如下
import random
import re

import requests
from lxml import etree
from multiprocessing.dummy import Pool  # thread-backed Pool (same API as multiprocessing.Pool)

# NOTE: the original notebook cell had the Jupyter cell magic `%%time` at this
# point. A cell magic is only valid as the very first line of a notebook cell
# and is a syntax error in plain Python, so it is kept here as a comment.
# To time the crawl in a notebook, move it to the first line of the cell.
# %%time

# Seconds to wait for the server (connect / gap between bytes) before a request
# gives up. Without a timeout a single stalled connection blocks forever.
REQUEST_TIMEOUT = 30

# Crawl the video data from Pear Video (category listing page).
url = 'https://www.pearvideo.com/category_4'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text

# Parse the listing page: each <li> under the video list holds a link to one
# video's detail page.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')

# The direct video URL is embedded in the detail page's text as
# srcUrl="...". (To find it: open a detail page with the browser developer
# tools, Network tab, and look at the document response / the mp4 request.)
exp = 'srcUrl="(.*?)",'

video_url_list = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # List item without a detail link: skip it instead of failing the
        # whole crawl with an IndexError.
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    detail_text = requests.get(
        url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
    ).text

    matches = re.findall(exp, detail_text, re.S)
    if not matches:
        # Detail page without an embedded srcUrl: nothing to download.
        continue
    video_url_list.append(matches[0])

# Download and save the videos concurrently with a pool of 10 worker threads.
# `saveData` is defined elsewhere in this file and must already be defined
# when this code runs.
pool = Pool(10)
try:
    video_data_list = pool.map(
        lambda link: requests.get(
            url=link, headers=headers, timeout=REQUEST_TIMEOUT
        ).content,
        video_url_list,
    )
    pool.map(saveData, video_data_list)
finally:
    # Always release the worker threads, even if a download raised.
    pool.close()
    pool.join()
def saveData(data):
    """Write *data* to a new ``<random number>.mp4`` file in the current directory.

    The file name is a random integer in [1, 10000] plus ``.mp4``. The file
    is created in exclusive mode, so if the drawn name already exists (from
    an earlier call, an earlier run, or another thread) a new name is drawn
    instead of silently overwriting that file.

    Args:
        data: Bytes-like content to write to the file.

    Raises:
        FileExistsError: If no unused name was found after many attempts.
    """
    max_attempts = 1000
    for _ in range(max_attempts):
        name = str(random.randint(1, 10000)) + '.mp4'
        try:
            # 'x' = exclusive creation: open() fails if the file already
            # exists, so a name collision can never clobber another video.
            with open(name, 'xb') as fp:
                fp.write(data)
        except FileExistsError:
            continue  # name already taken; draw another one
        print(name + '下载成功')
        return
    raise FileExistsError(
        'could not find an unused .mp4 file name after %d attempts' % max_attempts
    )