• 梨视频,进程池、线程池爬取


    进程池(注意:下面使用的 multiprocessing.dummy.Pool 与进程池接口相同,但底层实际是用线程实现的)

    import requests, re, time
    from multiprocessing.dummy import Pool
    import random
    
    # Candidate proxies; one is picked at random for every detail-page request.
    # NOTE(review): each entry only maps the 'http' scheme, while requests
    # selects a proxy by the scheme of the target URL, so these entries are not
    # applied to the https:// pages fetched below. The 'host: port' values also
    # contain a space and no scheme. They are left unchanged so the script keeps
    # connecting directly; add working 'https' entries to really use a proxy.
    IpPool = [{'http': '183.147.230.104: 8118'}, {'http': '60.217.64.237: 31923'},
              {'http': '221.193.50.166: 8118'}]

    # Listing endpoint whose response contains the links to the detail pages.
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'

    # Seconds to wait on a server before giving up; without a timeout a stalled
    # connection would block the script forever.
    REQUEST_TIMEOUT = 10

    # Reference timestamp; get_video() prints the time elapsed since this point.
    start = time.time()

    # Filled with one {'v_name': ..., 'v_url': ...} dict per video found.
    video_list = []

    ret = requests.get(url, timeout=REQUEST_TIMEOUT)
    reg = r'<a href="(.*?)" class="vervideo-lilink actplay">'
    video_urls = re.findall(reg, ret.text)
    print(video_urls)

    # Pattern that captures the direct file address embedded in a detail page.
    mp4_reg = r'srcUrl="(.*?)",vdoUrl=srcUrl'

    # A dedicated loop variable keeps the listing address in `url` intact.
    for detail_path in video_urls:
        proxy = random.choice(IpPool)
        try:
            ret_detail = requests.get('https://www.pearvideo.com/' + detail_path,
                                      proxies=proxy, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable detail page should not abort the whole crawl.
            print(f'跳过{detail_path}: {error!r}')
            continue
        print(proxy)

        found = re.search(mp4_reg, ret_detail.text)
        if found is None:
            # The page carries no srcUrl entry; indexing the match list here
            # used to raise IndexError and stop the script.
            print(f'跳过{detail_path}: 未找到视频地址')
            continue
        mp4_url = found.group(1)

        # The last path segment of the file address doubles as the file name.
        video_name = mp4_url.rsplit('/', 1)[-1]

        video_list.append({
            'v_name': video_name,
            'v_url': mp4_url,
        })

    print(video_list)
    def get_video(dic, chunk_size=64 * 1024, timeout=30):
        """Download one video and write it to the path given by its name.

        Args:
            dic: Mapping with 'v_url' (address of the video file) and
                'v_name' (path the file is written to).
            chunk_size: Number of bytes requested from the response per write.
            timeout: Seconds handed to requests as the request timeout.

        Returns:
            None. Progress messages are printed, followed by the number of
            seconds elapsed since the module-level ``start`` timestamp. When
            the server answers with an error status nothing is written.
        """
        url = dic['v_url']
        name = dic['v_name']
        print(f'开始下载{name}')
        # stream=True lets the body be consumed piece by piece instead of being
        # loaded into memory first; the with-block releases the connection.
        with requests.get(url=url, stream=True, timeout=timeout) as video_data:
            print(url)
            if not video_data.ok:
                # Do not store an error page under the video's file name.
                print(f'{name}下载失败: HTTP {video_data.status_code}')
                return
            with open(name, 'wb') as f:
                # iter_content() yields 1-byte chunks by default, which made
                # the original loop issue one write per byte.
                for chunk in video_data.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                print(f'{name}下载完成')

        end = time.time()
        ctime = end - start
        print(ctime)
    
    # Pool comes from multiprocessing.dummy, so the 12 workers are threads.
    pools = Pool(12)
    try:
        # map() blocks until every download has returned and re-raises the
        # first exception raised by a worker.
        pools.map(get_video, video_list)
    finally:
        # Release the workers even when a download raised.
        pools.close()
        pools.join()
    

    线程池

    import requests
    import re
    import random
    from concurrent.futures import ThreadPoolExecutor
    
    import time
    # Reference timestamp; get_video() prints the time elapsed since this point.
    start = time.time()

    # Shared executor with 12 worker threads, used by main() for the downloads.
    pool = ThreadPoolExecutor(12)

    # Candidate proxies; one is picked at random for every detail-page request.
    # NOTE(review): each entry only maps the 'http' scheme, while requests
    # selects a proxy by the scheme of the target URL, so these entries are not
    # applied to the https:// pages fetched below. The 'host: port' values also
    # contain a space and no scheme. They are left unchanged so the script keeps
    # connecting directly; add working 'https' entries to really use a proxy.
    IpPool = [{'http': '183.147.230.104: 8118'}, {'http': '60.217.64.237: 31923'},
              {'http': '221.193.50.166: 8118'}]

    # Listing endpoint whose response contains the links to the detail pages.
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'

    # Seconds to wait on a server before giving up; without a timeout a stalled
    # connection would block the script forever.
    REQUEST_TIMEOUT = 10

    # Filled with one {'v_name': ..., 'v_url': ...} dict per video found.
    video_list = []

    ret = requests.get(url, timeout=REQUEST_TIMEOUT)
    reg = r'<a href="(.*?)" class="vervideo-lilink actplay">'
    video_urls = re.findall(reg, ret.text)
    print(video_urls)

    # Pattern that captures the direct file address embedded in a detail page.
    mp4_reg = r'srcUrl="(.*?)",vdoUrl=srcUrl'

    # A dedicated loop variable keeps the listing address in `url` intact.
    for detail_path in video_urls:
        proxy = random.choice(IpPool)
        try:
            ret_detail = requests.get('https://www.pearvideo.com/' + detail_path,
                                      proxies=proxy, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable detail page should not abort the whole crawl.
            print(f'跳过{detail_path}: {error!r}')
            continue
        print(proxy)

        found = re.search(mp4_reg, ret_detail.text)
        if found is None:
            # The page carries no srcUrl entry; indexing the match list here
            # used to raise IndexError and stop the script.
            print(f'跳过{detail_path}: 未找到视频地址')
            continue
        mp4_url = found.group(1)

        # The last path segment of the file address doubles as the file name.
        video_name = mp4_url.rsplit('/', 1)[-1]

        video_list.append({
            'v_name': video_name,
            'v_url': mp4_url,
        })
    
    def get_video(dic, chunk_size=64 * 1024, timeout=30):
        """Download one video and write it to the path given by its name.

        Args:
            dic: Mapping with 'v_url' (address of the video file) and
                'v_name' (path the file is written to).
            chunk_size: Number of bytes requested from the response per write.
            timeout: Seconds handed to requests as the request timeout.

        Returns:
            None. Progress messages are printed, followed by the number of
            seconds elapsed since the module-level ``start`` timestamp. When
            the server answers with an error status nothing is written.
        """
        url = dic['v_url']
        name = dic['v_name']
        print(f'开始下载{name}')
        # stream=True lets the body be consumed piece by piece instead of being
        # loaded into memory first; the with-block releases the connection.
        with requests.get(url=url, stream=True, timeout=timeout) as video_data:
            print(url)
            if not video_data.ok:
                # Do not store an error page under the video's file name.
                print(f'{name}下载失败: HTTP {video_data.status_code}')
                return
            with open(name, 'wb') as f:
                # iter_content() yields 1-byte chunks by default, which made
                # the original loop issue one write per byte.
                for chunk in video_data.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                print(f'{name}下载完成')
                end = time.time()
                ctime = end - start
                print(ctime)
    
    print(video_list)


    def _report_failure(future):
        """Print the exception raised by a finished download task, if any."""
        if future.cancelled():
            return
        error = future.exception()
        if error is not None:
            print(f'下载失败: {error!r}')


    def main():
        """Queue one get_video() task per entry of video_list on the pool.

        Returns immediately after submitting; the tasks run on the worker
        threads of the module-level ``pool``.
        """
        for video in video_list:
            # An exception raised inside a worker is stored on the future and
            # stays invisible unless somebody asks for it, so attach a callback
            # that reports it.
            pool.submit(get_video, video).add_done_callback(_report_failure)


    if __name__ == '__main__':
        try:
            main()
        finally:
            # Wait for the queued downloads and release the worker threads,
            # even if submitting the tasks failed part-way.
            pool.shutdown(wait=True)
    
  • 相关阅读:
    JS实现延迟载入图片
    三星指纹识别新专利:手势打开不同应用
    与计算机之间的另一种沟通方式 ——“手势识别”
    手写数字识别系统之图像分割
    机器学习实战八大分类器识别树叶带源码
    构建CTC语音识别解码网络
    MFC CListCtrl 条目取消选中
    C++ 将输入的字符串中英文大写字母改成对应小写字母,并且过滤掉非英文字母字符
    C++遍历SQLite数据库下的所有表名 .
    MFC 操作注册表 Open QueryValue等
  • 原文地址:https://www.cnblogs.com/kai-/p/12658461.html
Copyright © 2020-2023  润新知