• Asynchronous crawler components


    import time
    import requests
    from multiprocessing.dummy import Pool  # a thread pool with the multiprocessing API
    
    # Synchronous version: the requests run one after another
    # urls = [
    #     "http://127.0.0.1:5000/jay1",
    #     "http://127.0.0.1:5000/jay2",
    #     "http://127.0.0.1:5000/jay3",
    # ]
    #
    # def get_request(url):
    #     page_text = requests.get(url=url).text
    #     print(len(page_text))
    #
    # if __name__ == '__main__':
    #     start_time = time.time()
    #     for url in urls:
    #         get_request(url)
    #     print("all_time:", time.time() - start_time)
    
    # Asynchronous effect based on a thread pool
    urls = [
        "http://127.0.0.1:5000/jay1",
        "http://127.0.0.1:5000/jay2",
        "http://127.0.0.1:5000/jay3",
    ]
    
    def get_request(url):
        page_text = requests.get(url=url).text
        return len(page_text)
    
    if __name__ == '__main__':
        start_time = time.time()
        pool = Pool(3)  # start a pool of three threads
    
        page_len_list = pool.map(get_request, urls)
        # Argument 1: the callback function
        # Argument 2: an iterable, e.g. a list
        # Effect: each element of the list is passed to the callback as its
        # argument in turn, and the calls run concurrently across the threads
        # map returns all of the callback's return values, in input order
        pool.close()
        pool.join()
        print(page_len_list)
        print("all_time:", time.time() - start_time)
    import asyncio
    import time
    
    import aiohttp
    from lxml import etree
    
    # A "special" (coroutine) function: it must not contain code from modules
    # that do not support async, so the requests module cannot be used here
    async def get_request(url):
        # use aiohttp for the network request
        # instantiate a session object called sess
        async with aiohttp.ClientSession() as sess:
            # sess.get(url, headers=..., params=..., proxy=...)
            # sess.post(url, headers=..., data=..., proxy=...)
            # the proxy parameter is used differently from requests: it takes
            # a single string, proxy="http://ip:port"; the remaining
            # parameters behave as they do in requests
            async with sess.get(url=url) as response:
                # text() returns the response body as a string
                # read() returns the response body as bytes
                page_text = await response.text()
                return page_text
    
    # Callback for a task object.
    # Note: the callback must take exactly one parameter, the task object it
    # is bound to. In a multi-task async crawler, data parsing and
    # persistence belong in the task's callback.
    def parse(task):
        page_text = task.result()  # return value of the bound coroutine
        tree = etree.HTML(page_text)
        data_text = tree.xpath('//a[@id="feng"]/text()')[0]
        print(data_text)
    
    if __name__ == '__main__':
        start_time = time.time()
        urls = [
            "http://127.0.0.1:5000/jay1",
            "http://127.0.0.1:5000/jay2",
            "http://127.0.0.1:5000/jay3",
        ]
        # the task list
        tasks = []
    
        for url in urls:
            # create a coroutine object
            c = get_request(url)
            # wrap it in a task object
            task = asyncio.ensure_future(c)
            # bind the callback
            task.add_done_callback(parse)
    
            tasks.append(task)
    
        # create an event loop
        loop = asyncio.get_event_loop()
        # register the tasks with the event loop and run the loop
        # until every task has completed
        loop.run_until_complete(asyncio.wait(tasks))
        print("all_time:", time.time() - start_time)
  • Original post: https://www.cnblogs.com/hude/p/12913415.html