• Crawling Pear Video (梨视频)

  The script below crawls Pear Video category listings for video ids, resolves each id to a direct mp4 link on a thread pool, uses Redis to skip links that were already downloaded, and saves the files concurrently.

    import re
    import redis
    import requests
    import time
    
    from setting import PAGE,CATEGORY_ID,START,MAIN_URL,DETAIL_URL
    
    from concurrent.futures import ThreadPoolExecutor
    
    from myredis import POOL
    
    
    class CrawlVideo:
        # Shared thread pool for detail-page requests and video downloads.
        pools = ThreadPoolExecutor(100)
    
        def __init__(self, page=PAGE):
            self.page = page  # videos per listing page
            self.video_info_dic_list = []  # filled by callbacks, drained by the download loop
            self.conn = redis.Redis(connection_pool=POOL)  # dedup store
    
        def async_download(self, video_dic):
            video_link = video_dic["video_link"]
            # Skip links that Redis has already recorded as downloaded.
            if self.conn.get(video_link):
                return
            # Use a short prefix of the title as the filename.
            video_name = video_dic["title"][:3]
            response = requests.get(video_link)
            if response.status_code == 200:
                with open("%s.mp4" % video_name, "wb") as f:
                    f.write(response.content)
                # Record the link so future runs do not download it again.
                self.conn.set(video_link, video_link)
    
    
        def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
            crawl_ids_list = self.crawl_videolist(category_id, start, num)
            print(len(crawl_ids_list))
            self.get_video_info(crawl_ids_list)
            # The detail-page callbacks fill video_info_dic_list asynchronously;
            # keep draining it until one download has been submitted per id.
            i = 0
            while i < len(crawl_ids_list):
                try:
                    video_dic = self.video_info_dic_list.pop()
                    self.pools.submit(self.async_download, video_dic)
                    i += 1
                except IndexError:
                    # The list is empty for the moment; wait for more callbacks.
                    time.sleep(0.2)
    
    
    
        def get_video_ids(self, category_id, start):
            main_url = MAIN_URL.format(category_id, start)
            try:
                response = requests.get(main_url)
                # Listing pages link each video as <a href="video_NNNNN">.
                video_id_list = re.findall(r'<a href="(video_\d+)"', response.text)
                return video_id_list
            except Exception:
                # Return an empty list so callers can extend() safely.
                return []
    
        # Crawl the list of video ids for the requested pages; detail requests are sent from this list.
        def crawl_videolist(self, category_id, start, num):
            crawl_ids_list = []
            page_num = self.get_page_num(num)
            for i in range(page_num):
                video_id_list = self.get_video_ids(category_id, start)
                crawl_ids_list.extend(video_id_list)
                start += self.page
            # The last page may overshoot; trim down to exactly num ids.
            return crawl_ids_list[:num]
    
    
        def get_detail(self, obj):
            # Callback: parse the finished detail-page response.
            response = obj.result()
            title_match = re.search('<title>(.*?)</title>', response.text)
            link_match = re.search('srcUrl="(.*?)"', response.text)
            if not title_match or not link_match:
                return  # page did not match the expected markup
            dic = {"title": title_match.group(1), "video_link": link_match.group(1)}
            self.video_info_dic_list.append(dic)
    
    
    
        def async_request(self, url, video_addr):
            # Worker: fetch the detail page for one video id.
            response = requests.get(url.format(video_addr))
            return response
    
        def get_video_info(self, video_id_list):
            url = DETAIL_URL
            try:
                for video_addr in video_id_list:
                    # Fetch each detail page on the pool; parse it in the callback.
                    obj = self.pools.submit(self.async_request, url, video_addr)
                    obj.add_done_callback(self.get_detail)
            except Exception as e:
                print(e)
    
        def get_page_num(self, num):
            # Ceiling division: how many listing pages are needed to cover num videos.
            if num % self.page == 0:
                page_num = num // self.page
            else:
                page_num = num // self.page + 1
            return page_num
    
    
    
    if __name__ == "__main__":
        crawl = CrawlVideo()
        crawl.download_video(start=1, num=2)
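
  The post never shows the `setting` and `myredis` modules the script imports. A minimal sketch of what they could contain — the concrete values and URL templates here are assumptions inferred from the `format()` calls and regexes above, not taken from the source:

    # setting.py -- hypothetical values; not shown in the original post.
    PAGE = 12          # videos per listing page (assumed)
    CATEGORY_ID = 1    # default category id (assumed)
    START = 0          # default paging offset (assumed)
    # MAIN_URL must accept a category id and a start offset; DETAIL_URL must
    # accept a "video_NNNNN" id, to match how the crawler format()s them.
    MAIN_URL = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId={}&start={}"
    DETAIL_URL = "https://www.pearvideo.com/{}"
    
    # myredis.py -- one connection pool shared by every Redis client.
    import redis
    POOL = redis.ConnectionPool(host="127.0.0.1", port=6379, max_connections=100)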
    
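  The two regexes encode assumptions about Pear Video's markup: listing pages link each video as `<a href="video_NNNNN">`, and the detail page embeds the mp4 address in a `srcUrl="..."` assignment. A quick check against hypothetical page fragments shaped that way:

    import re
    
    # Hypothetical fragments shaped like the markup the crawler expects.
    listing_html = '<li><a href="video_1234567" class="vervideo-lilink">demo</a></li>'
    detail_html = 'var contId="1234567",srcUrl="https://video.pearvideo.com/mp4/demo.mp4";'
    
    print(re.findall(r'<a href="(video_\d+)"', listing_html))  # ['video_1234567']
    print(re.search(r'srcUrl="(.*?)"', detail_html).group(1))  # https://video.pearvideo.com/mp4/demo.mp4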
    
  • Original post: https://www.cnblogs.com/robert-zhou/p/10685764.html