• 异步线程池爬取 校花网视频


    import re
    import requests
    
    # --- Module-level scratch code (runs at import time) --------------------
    # Fetches the video index page and each detail page synchronously.
    # `result` is recomputed every iteration and never used afterwards —
    # this looks like a left-over experiment predating the functions below.
    response = requests.get("http://www.xiaohuar.com/v/")



    # Capture href values from <div class="items"> anchors (re.S spans newlines).
    url_s = re.findall('<div class="items">.*?href="(.*?)"',response.text,re.S)
    for url in url_s:
        res = requests.get(url)
        # Extract the <video id="media"> src from each detail page (unused).
        result = re.findall('<video id="media".*?src="(.*?)"',res.text,re.S)

        # print(result)
    
    def get_page(url):
        """GET *url* and return the response body as text.

        Returns None on any network error or non-200 status; the explicit
        ``return None`` replaces the original's implicit fall-through, and the
        broad ``except Exception`` is narrowed to network-level failures only
        (the only thing ``requests.get`` can raise here).
        """
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            # best-effort crawl: a page that fails to load is simply skipped
            pass
        return None
    
    
    
    def parse_data(text):
        """Yield every detail-page URL found in a listing page's HTML.

        text: HTML of an index page, or None when the fetch failed.
        Yields each non-empty href captured from a <div class="items"> anchor.
        """
        if not text:
            # get_page returns None on failure; re.findall(pattern, None)
            # would raise TypeError, so treat a failed fetch as "no links".
            return
        for url in re.findall('<div class="items">.*?href="(.*?)"', text, re.S):
            if url:
                yield url
    
    def parse_detail(text):
        """Return the first .mp4 URL from a detail page, or None.

        text: HTML of a detail page, or None when the fetch failed.
        Returns the src of <video id="media"> only when it ends in ".mp4".
        """
        # BUG FIX: the original wrote `except Exception(TypeError):` — that is
        # an exception *instance* in an except clause, which itself raises
        # TypeError the moment anything is caught. re.findall on a str cannot
        # raise here, so the try/except is dropped and None-text is guarded.
        if not text:
            return None
        movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
        if movie_url_list:
            movie_url = movie_url_list[0]
            if movie_url.endswith(".mp4"):
                return movie_url
        return None
    import uuid
    def download_movie(movie_url):
        """Download *movie_url* and write it to disk under a random UUID name.

        Skips falsy URLs — parse_detail returns None when no .mp4 is found —
        mirroring the guard the thread-pool variant of this function already
        has; previously requests.get(None) errored and was silently swallowed.
        """
        if not movie_url:
            return
        try:
            response = requests.get(movie_url)
            # NOTE(review): raw path reads "D:spider1movies\..." — the
            # separators after "D:" look missing; confirm the target directory.
            with open(r"D:spider1movies\%s.mp4" % uuid.uuid4(), "wb") as f:
                f.write(response.content)
        except Exception:
            # best-effort: a failed download is ignored
            pass
    
    
    
    
    
    
    
    if __name__ == '__main__':
        # Sequential crawl: listing page -> detail pages -> video downloads.
        base_url = "http://www.xiaohuar.com/list-3-{}.html"
        for page in range(1):
            # 1. fetch the listing page
            listing_html = get_page(base_url.format(page))
            # 2. extract the detail-page links from it
            for detail_url in parse_data(listing_html):
                # 3. fetch each detail page and pull out the video URL
                movie_url = parse_detail(get_page(detail_url))
                # 4. save the video to disk
                download_movie(movie_url)
    
    
    from concurrent.futures import ThreadPoolExecutor
    # Shared pool that drives the asynchronous crawl below: 50 worker threads.
    pool = ThreadPoolExecutor(50)



    # --- Duplicate module-level scratch code (runs at import time) ----------
    # Same synchronous experiment as near the top of the file; `result` is
    # never used. Left in place because it executes on import.
    response = requests.get("http://www.xiaohuar.com/v/")
    # print(response.text)


    url_s = re.findall('<div class="items">.*?href="(.*?)"',response.text,re.S)
    for url in url_s:
        # print(url)

        res = requests.get(url)
        result = re.findall('<video id="media".*?src="(.*?)"',res.text,re.S)

        # print(result)
    
    def get_page(url):
        """GET *url* and return its body, or None on error/non-200.

        Redefines the earlier get_page; this variant additionally logs the
        URL being fetched. The explicit ``return None`` replaces the implicit
        fall-through, and the broad ``except Exception`` is narrowed to
        network failures, the only thing ``requests.get`` raises here.
        """
        print(url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            # best-effort crawl: skip pages that fail to load
            pass
        return None
    
    
    
    def parse(res):
        """Done-callback for a listing-page fetch.

        res: a Future whose result is the page HTML (or None on failure).
        Extracts every detail-page link, absolutizes root-relative ones, and
        submits each detail fetch to the pool with parse_detail as callback.
        """
        html = res.result()
        if not html:
            return
        for link in re.findall('<div class="items">.*?href="(.*?)"', html, re.S):
            if not link:
                continue
            if link.startswith("/"):
                link = "http://www.xiaohuar.com" + link
            pool.submit(get_page, link).add_done_callback(parse_detail)
    
    def parse_detail(res):
        """Done-callback for a detail-page fetch: queue the .mp4 for download.

        res: a Future whose result is the detail-page HTML (or None).
        Submits download_movie to the pool when an .mp4 src is found.
        """
        # BUG FIX: the original wrote `except Exception(TypeError):` — an
        # exception *instance* in an except clause, which itself raises
        # TypeError when triggered. re.findall on a str cannot raise here,
        # so the try/except is removed entirely.
        text = res.result()
        if not text:
            return
        movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
        if movie_url_list:
            movie_url = movie_url_list[0]
            if movie_url.endswith(".mp4"):
                pool.submit(download_movie, movie_url)
    import uuid
    def download_movie(movie_url):
        """Fetch the video at *movie_url* and write it under a random name.

        A falsy URL (no .mp4 found upstream) is a no-op; any download error
        is silently ignored — this crawl is best-effort.
        """
        if not movie_url:
            return
        try:
            resp = requests.get(movie_url)
            # NOTE(review): raw path reads "D:spider1movies\..." — the
            # separators after "D:" look missing; confirm the target folder.
            target = r"D:spider1movies\%s.mp4" % uuid.uuid4()
            with open(target, "wb") as out:
                out.write(resp.content)
        except Exception:
            # best-effort: a failed download is skipped
            pass
    
    
    
    
    
    
    
    if __name__ == '__main__':
        # Asynchronous crawl: each listing-page fetch is submitted to the
        # pool; `parse` fans out detail fetches, which fan out downloads.
        base_url = "http://www.xiaohuar.com/list-3-{}.html"
        for page in range(2):
            # fetch the listing page; parse runs when the Future completes
            pool.submit(get_page, base_url.format(page)).add_done_callback(parse)
  • 相关阅读:
    Linux 常用命令之df du
    Liunx 命令之链接操作
    1、SpringBoot bean,list,map Json返回
    Echarts 柱状图配置详解
    layui table数据表格reload where参数保留问题
    Mac休眠后解决卡死转圈问题
    5、Storm集成Kafka
    使用OData技术遇到的问题及解决办法
    我的柔情你永远不懂...
    如果你知道时间的默认值...
  • 原文地址:https://www.cnblogs.com/tangda/p/10932916.html
Copyright © 2020-2023  润新知