• 梨视频,进程池、线程池爬取


    进程池(注意:下面使用的 multiprocessing.dummy.Pool 实际上是基于线程的池,接口与进程池相同)

    import requests, re, time
    from multiprocessing.dummy import Pool
    import random
    
    # Candidate proxy mappings; one is picked at random per detail-page request.
    # NOTE(review): every mapping only has an 'http' key while all requests
    # below target https URLs, and the addresses contain a space after the
    # colon. requests selects proxies by URL scheme, so these are presumably
    # never applied -- confirm before relying on them.
    IpPool = [{'http': '183.147.230.104: 8118'}, {'http': '60.217.64.237: 31923'},
              {'http': '221.193.50.166: 8118'}]


    # Listing page whose HTML is searched for links to the video detail pages.
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'

    # Reference point for the elapsed-time figure printed by get_video().
    start = time.time()

    # One {'v_name': ..., 'v_url': ...} dict per video that could be resolved.
    video_list=[]

    ret = requests.get(url)
    reg = '<a href="(.*?)" class="vervideo-lilink actplay">'
    video_urls = re.findall(reg, ret.text)
    print(video_urls)
    # A dedicated loop variable keeps the module-level `url` and `reg` intact.
    for page_path in video_urls:
        proxy = random.choice(IpPool)
        ret_detail = requests.get('https://www.pearvideo.com/' + page_path, proxies=proxy)
        print(proxy)
        src_match = re.search('srcUrl="(.*?)",vdoUrl=srcUrl', ret_detail.text)
        if src_match is None:
            # The page did not contain the expected srcUrl assignment; skip it
            # rather than abort the whole run with an IndexError.
            print(f'未找到视频地址,跳过: {page_path}')
            continue
        mp4_url = src_match.group(1)  # type:str

        # The file name is the last path segment of the media URL.
        video_name = mp4_url.rsplit('/', 1)[-1]

        video_list.append({
            'v_name': video_name,
            'v_url': mp4_url,
        })

    print(video_list)
    def get_video(dic, chunk_size=64 * 1024):
        """Download one video to the current directory and print the elapsed time.

        Args:
            dic: Mapping with 'v_url' (media URL) and 'v_name' (target file name).
            chunk_size: Number of bytes requested per read from the response
                stream. The previous implementation used the iter_content()
                default of 1, i.e. one write per byte.

        Returns:
            None. Progress, failure and timing information are printed.
        """
        url = dic['v_url']
        name = dic['v_name']
        print(f'开始下载{name}')
        # stream=True avoids loading the whole video into memory; the with-block
        # releases the connection once the body has been consumed.
        with requests.get(url=url, stream=True) as video_data:
            print(url)
            if video_data.status_code != 200:
                # Do not store an error page under the video's file name.
                print(f'{name}下载失败: HTTP {video_data.status_code}')
                return
            with open(name, 'wb') as f:
                for chunk in video_data.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        print(f'{name}下载完成')

        # Elapsed time is measured from the module-level `start` timestamp.
        print(time.time() - start)
    
    # multiprocessing.dummy.Pool exposes the Pool API on top of threads, so the
    # downloads run in 12 worker threads rather than separate processes.
    pools = Pool(12)
    try:
        # map() blocks until every download finished and re-raises a worker's
        # exception, hence the cleanup lives in `finally`.
        pools.map(get_video, video_list)
    finally:
        pools.close()
        pools.join()
    

    线程池

    import requests
    import re
    import random
    from concurrent.futures import ThreadPoolExecutor
    
    import time
    # Reference point for the elapsed-time figure printed by get_video().
    start = time.time()

    # Shared executor with 12 worker threads, used by main() for the downloads.
    pool = ThreadPoolExecutor(12)

    # Candidate proxy mappings; one is picked at random per detail-page request.
    # NOTE(review): every mapping only has an 'http' key while all requests
    # below target https URLs, and the addresses contain a space after the
    # colon. requests selects proxies by URL scheme, so these are presumably
    # never applied -- confirm before relying on them.
    IpPool = [{'http': '183.147.230.104: 8118'}, {'http': '60.217.64.237: 31923'},
              {'http': '221.193.50.166: 8118'}]

    # Listing page whose HTML is searched for links to the video detail pages.
    url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=9&start=0'


    # One {'v_name': ..., 'v_url': ...} dict per video that could be resolved.
    video_list=[]

    ret = requests.get(url)
    reg = '<a href="(.*?)" class="vervideo-lilink actplay">'
    video_urls = re.findall(reg, ret.text)
    print(video_urls)
    # A dedicated loop variable keeps the module-level `url` and `reg` intact.
    for page_path in video_urls:
        proxy = random.choice(IpPool)
        ret_detail = requests.get('https://www.pearvideo.com/' + page_path, proxies=proxy)
        print(proxy)
        src_match = re.search('srcUrl="(.*?)",vdoUrl=srcUrl', ret_detail.text)
        if src_match is None:
            # The page did not contain the expected srcUrl assignment; skip it
            # rather than abort the whole run with an IndexError.
            print(f'未找到视频地址,跳过: {page_path}')
            continue
        mp4_url = src_match.group(1)  # type:str

        # The file name is the last path segment of the media URL.
        video_name = mp4_url.rsplit('/', 1)[-1]

        video_list.append({
            'v_name': video_name,
            'v_url': mp4_url,
        })
    
    def get_video(dic, chunk_size=64 * 1024):
        """Download one video to the current directory and print the elapsed time.

        Args:
            dic: Mapping with 'v_url' (media URL) and 'v_name' (target file name).
            chunk_size: Number of bytes requested per read from the response
                stream. The previous implementation used the iter_content()
                default of 1, i.e. one write per byte.

        Returns:
            None. Progress, failure and timing information are printed.
        """
        url = dic['v_url']
        name = dic['v_name']
        print(f'开始下载{name}')
        # stream=True avoids loading the whole video into memory; the with-block
        # releases the connection once the body has been consumed.
        with requests.get(url=url, stream=True) as video_data:
            print(url)
            if video_data.status_code != 200:
                # Do not store an error page under the video's file name.
                print(f'{name}下载失败: HTTP {video_data.status_code}')
                return
            with open(name, 'wb') as f:
                for chunk in video_data.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                print(f'{name}下载完成')
                # Elapsed time is measured from the module-level `start` timestamp.
                print(time.time() - start)
    
    print(video_list)
    def main():
        """Submit one download task per collected video and report failures.

        Every entry of the module-level `video_list` is handed to `get_video`
        on the shared `pool`. The futures are kept so that an exception raised
        inside a worker is printed instead of being silently discarded.
        """
        submitted = [(item['v_name'], pool.submit(get_video, item)) for item in video_list]
        for name, future in submitted:
            # exception() waits for the task to finish and returns None on success.
            error = future.exception()
            if error is not None:
                print(f'{name}下载失败: {error!r}')
    
    
    # Script entry point: queue the downloads, then wait for the workers.
    if __name__ == '__main__':
        main()
        # shutdown() waits for all pending tasks by default (wait=True).
        pool.shutdown()
    
  • 相关阅读:
    html css div img垂直居中
    jquery 多选框 checkbox 获取选中的框
    css 滚动条样式
    css 翻牌 翻转 3d翻转 特效
    css强制不换行 多出的字省略号
    jquery获取元素坐标获取鼠标坐标
    鸡汤 咯咯
    <bean> 中配置详解 </bean>
    正则表达式的囧
    我的天$删除注册表$安装mysql最后一步不能启动服务的解决办法
  • 原文地址:https://www.cnblogs.com/kai-/p/12658461.html
Copyright © 2020-2023  润新知