一、参考文章
python爬虫爬取校花网视频,单线程爬取
爬虫----爬取校花网视频,包含多线程版本
上述两篇文章都是对校花网视频的爬取。由于时间相隔很久,校花网上的一些视频已经不存在了,因此上述文章中的代码在运行时会出现一些异常。本篇文章主要对上述文章中的代码进行了优化并添加了异常处理,在此做笔记记录,方便以后查阅,修改如下:
1、添加的异常处理如下红色部分代码
二、单线程版本
# -*- coding: utf-8 -*-
"""Single-threaded crawler for the videos listed on www.xiaohuar.com.

Flow: fetch each list page, extract the detail-page links from it, fetch
every detail page, pull the video URL out of it and write the video into
the ``video/`` directory under the current working directory.
"""
import hashlib
import os
import re
import time

import requests

# Headers sent with every request (browser User-Agent plus site Referer).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.xiaohuar.com'
}

# (connect, read) timeouts in seconds; without them a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = (10, 60)

# Directory the downloaded videos are written to.
path = os.getcwd() + '/video/'


def get_index(url):
    """Fetch one list page.

    Returns the response text on HTTP 200. Returns None for any other
    status code or when the request itself fails (the error is printed).
    """
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Extract the detail-page links from list-page HTML.

    ``res`` is the value returned by ``get_index``; None or an empty
    string yields an empty list instead of crashing ``re.findall``.
    """
    if not res:
        return []
    # re.S lets '.' match newlines, so the pattern may span several lines.
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)


def get_detail(urls):
    """Visit every detail page in ``urls`` and download the video it embeds.

    Relative links are resolved against the site root. A page that cannot
    be fetched, does not answer 200, or has no video element is skipped so
    that one bad page does not abort the remaining ones.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            result = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(repr(e))
            continue
        if result.status_code != 200:
            continue
        mp4_url_list = re.findall(r'id="media".*?src="(.*?)"', result.text, re.S)
        if mp4_url_list:
            save(mp4_url_list[0])


def save(url):
    """Download the video at ``url`` into ``path``.

    The file name is the MD5 of the URL plus the current timestamp, so
    repeated downloads never overwrite each other. Request failures and
    non-200 answers are reported and the function returns without writing.
    """
    try:
        # The video may no longer exist, so the download itself is guarded.
        video = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if video.status_code != 200:
        print(f'视频不存在了:{url}')
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = r'%s.mp4' % m.hexdigest()

    # The target directory is not guaranteed to exist; create it on demand
    # so that open() below does not fail with FileNotFoundError.
    os.makedirs(path, exist_ok=True)
    filepath = path + filename
    print(filepath)
    with open(filepath, 'wb') as f:
        f.write(video.content)


def main():
    """Crawl list pages 0-4 and download every video found on them."""
    for i in range(5):
        res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)  # fetch the list page
        res2 = parse_index(res1)  # extract every detail-page URL on it
        get_detail(res2)  # download the videos behind those URLs


if __name__ == '__main__':
    main()
三、多线程版本
# -*- coding: utf-8 -*-
"""Multi-threaded crawler for the videos listed on www.xiaohuar.com.

List pages are fetched concurrently on a thread pool; every detail page
found on them becomes its own pool task, which downloads the embedded
video into the ``video_mutil/`` directory under the current working
directory.
"""
import hashlib
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# Shared worker pool for all page fetches and downloads.
p = ThreadPoolExecutor(30)

# (connect, read) timeouts in seconds; without them a stalled connection
# would occupy a worker thread forever.
REQUEST_TIMEOUT = (10, 60)

# Directory the downloaded videos are written to; created at import time.
path = os.getcwd() + '/video_mutil/'
os.makedirs(path, exist_ok=True)


def get_index(url):
    """Fetch one list page.

    Returns the response text on HTTP 200. Returns None for any other
    status code or when the request itself fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Schedule the downloads for one finished list-page fetch.

    ``res`` is the completed future of a ``get_index`` call, so this
    function can still be used as an ``add_done_callback`` callback.
    Every detail URL found is submitted as its own ``get_detail`` task so
    the pool's workers are actually used in parallel.

    Returns the list of submitted futures (empty when the page could not
    be fetched), which lets the caller wait for the downloads to finish.
    """
    html = res.result()
    if not html:
        return []
    urls = re.findall(r'class="items".*?href="(.*?)"', html, re.S)
    return [p.submit(get_detail, [url]) for url in urls]


def get_detail(urls):
    """Visit every detail page in ``urls`` and download the video it embeds.

    Relative links are resolved against the site root. A page that cannot
    be fetched, does not answer 200, or has no video element is skipped so
    that one bad page does not abort the remaining ones.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        try:
            r1 = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(repr(e))
            continue
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            save(url_list[0])


def save(url):
    """Download the video at ``url`` into ``path``.

    The file name is the MD5 of the URL plus the current timestamp.
    Request failures and non-200 answers are reported and the function
    returns without writing anything.
    """
    try:
        # The video may no longer exist, so the download itself is guarded.
        r2 = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(repr(e))
        return

    if r2.status_code != 200:
        print(f'视频不存在了:{url}')
        return

    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    m.update(str(time.time()).encode('utf-8'))
    filename = '%s.mp4' % m.hexdigest()
    file_path = path + filename
    with open(file_path, 'wb') as f:
        f.write(r2.content)
        print('视频下载完成:%s' % file_path)


def main():
    """Crawl list pages 0-4 concurrently and wait for every download.

    The detail tasks are scheduled from this thread while it is still
    running, rather than from fire-and-forget callbacks: the executor
    refuses new work once interpreter shutdown has begun, so callbacks
    that fire after main() has returned could fail to schedule anything.
    """
    index_futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]

    detail_futures = []
    for future in as_completed(index_futures):
        detail_futures.extend(parse_index(future))

    # Block until every download task has finished and surface errors that
    # would otherwise stay hidden inside the futures.
    for future in as_completed(detail_futures):
        error = future.exception()
        if error is not None:
            print(repr(error))


if __name__ == '__main__':
    main()
四、资源下载
资源下载地址:Python爬取校花网视频-单线程和多线程版本
转载声明:本站文章无特别说明,皆为原创,版权所有,转载请注明:朝十晚八