• Using asyncio in web crawlers


    # -*- coding: utf-8 -*-
    # 协程基础.py (coroutine basics)

    import asyncio
    import time

    async def request(url):
        print("Requesting:", url)
        # r = await asyncio.sleep(3)
        time.sleep(3)  # blocking sleep; the async alternative is the commented line above
        print("Download finished:", url)

    c = request("www.baidu.com")  # calling the async function returns a coroutine object
    # 1. Instantiate the event loop
    loop = asyncio.get_event_loop()
    # 2. Create a task object that wraps the coroutine object
    task = loop.create_task(c)
    # 3. Hand the task object to the event loop and run it
    loop.run_until_complete(task)
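For reference, a minimal sketch of the same demo on Python 3.7+, where asyncio.run() creates and closes the event loop for you and the blocking time.sleep(3) is replaced by a non-blocking await asyncio.sleep(3); this is an illustrative variant, not part of the original script:

    import asyncio

    async def request(url):
        print("Requesting:", url)
        await asyncio.sleep(3)  # non-blocking wait; control goes back to the event loop
        print("Download finished:", url)

    # asyncio.run creates an event loop, runs the coroutine to completion, then closes the loop
    asyncio.run(request("www.baidu.com"))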
    # -*- coding: utf-8 -*-
    # 给任务对象绑定回调.py (bind a callback to the task object)
    
    import asyncio
    import time
    
    async def request(url):
        print("Requesting:", url)
        # r = await asyncio.sleep(3)
        time.sleep(3)
        print("Download finished:", url)
        return 123

    c = request("www.baidu.com")  # calling the async function returns a coroutine object

    # The callback's argument is the task object; in a crawler, the callback is where parsing happens
    def parse(task):
        print("This is the callback function")
        print("The printed result is the coroutine's return value:", task.result())
    
    
    # 1. Instantiate the event loop
    loop = asyncio.get_event_loop()
    # 2. Create a task object that wraps the coroutine object
    task = loop.create_task(c)
    # Bind a callback function to the task object
    task.add_done_callback(parse)

    # 3. Hand the task object to the event loop and run it
    loop.run_until_complete(task)
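If all you need is the return value, the callback is optional: run_until_complete() itself returns the task's result. A short usage sketch, assuming the same request coroutine defined above:

    loop = asyncio.get_event_loop()
    task = loop.create_task(request("www.baidu.com"))
    result = loop.run_until_complete(task)  # returns the coroutine's return value, 123 here
    print(result)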
    # -*- coding: utf-8 -*-
    # 多任务异步协程.py (multi-task asynchronous coroutines)
    
    import asyncio
    import time
    
    urls = ['www.baidu.com', 'www.sogou.com', 'www.sina.com']
    start = time.time()
    
    async def request(url):
        print("Requesting:", url)
        # time.sleep(3)  # would have to be replaced with async-aware code
        await asyncio.sleep(3)  # asyncio.sleep returns a coroutine object that is awaited
        print("Download finished:", url)
    
    
    loop = asyncio.get_event_loop()
    # Task list that holds multiple task objects
    tasks = []
    for url in urls:
        c = request(url)  # coroutine object
        task = loop.create_task(c)
        tasks.append(task)
    
    loop.run_until_complete(asyncio.wait(tasks))
    print('Total time:', time.time() - start)
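On Python 3.7+ the same fan-out is usually written with asyncio.gather inside a single top-level coroutine; a minimal sketch, assuming the same urls list and request coroutine as above (main is just an illustrative name):

    async def main():
        # gather schedules all coroutines concurrently and waits for all of them to finish
        await asyncio.gather(*(request(url) for url in urls))

    asyncio.run(main())
    print('Total time:', time.time() - start)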
    # -*- coding: utf-8 -*-
    # 多任务异步协程在爬虫中应用.py (multi-task async coroutines applied to a crawler)
    
    
    import asyncio
    import time
    import requests
    import aiohttp  # unlike requests, aiohttp supports asynchronous requests
    
    # Single thread + multi-task asynchronous coroutines
    # start = time.time()
    # urls = [
    #     'http://127.0.0.1:5000/bobo',
    #     'http://127.0.0.1:5000/jay',
    #     'http://127.0.0.1:5000/tom',
    # ]
    #
    # async def get_pageText(url):
    #     print("Downloading", url)
    #     page_text = requests.get(url).text     # requests is not async-aware; it blocks the event loop, so the tasks run one after another
    #     print("Download finished", url)
    #     # return the page text to the callback function
    #     return page_text
    #
    #
    # loop = asyncio.get_event_loop()
    # tasks = []
    # for url in urls:
    #     c = get_pageText(url)
    #     task = loop.create_task(c)
    #     tasks.append(task)
    # loop.run_until_complete(asyncio.wait(tasks))
    #
    # print('Total time:', time.time() - start)
    
    
    start = time.time()
    urls = [
        'http://127.0.0.1:5000/bobo',  # page responds after 2 seconds
        'http://127.0.0.1:5000/jay',   # page responds after 2 seconds
        'http://127.0.0.1:5000/tom',   # page responds after 2 seconds
    ]
    
    # When going through a proxy:
    # async with await s.get(url=url, headers=headers, proxy="http://ip:port") as response:
    async def get_pageText(url):
        # Open a client session s
        async with aiohttp.ClientSession() as s:
            # await belongs wherever network I/O happens (sending the request, reading the response)
            # Send the request; extra parameters such as headers go in the call, just like with requests
            async with await s.get(url=url) as response:
                # Read the response body
                page_text = await response.text()
                # print(page_text)
                # pass page_text to the callback function for parsing
                return page_text
    
    
    from lxml import etree
    def parse(task):
        # Get the result of the coroutine that the task ran
        page_text = task.result()

        # # Build an etree parsing object
        # tree = etree.HTML(page_text)
        # page_data = tree.xpath('//*[@id="page"]/a[1]/span[1]/i/@class')[0]

        print(page_text, "start parsing this page")
    
    
    loop = asyncio.get_event_loop()
    tasks = []
    for url in urls:
        c = get_pageText(url)
        task = loop.create_task(c)
        # Bind a callback function to each task object
        task.add_done_callback(parse)
        tasks.append(task)
    loop.run_until_complete(asyncio.wait(tasks))
    
    print('Total time:', time.time() - start)
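For comparison, a sketch of the same crawler in the newer style: one shared ClientSession, results collected with asyncio.gather instead of callbacks, and parsing done after all requests finish. The function names here (get_page_text, main) are illustrative, and the URLs are the same local test endpoints assumed above:

    import asyncio
    import time
    import aiohttp

    urls = [
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/tom',
    ]

    async def get_page_text(session, url):
        # Reuse one session for every request instead of opening one per URL
        async with session.get(url) as response:
            return await response.text()

    async def main():
        async with aiohttp.ClientSession() as session:
            pages = await asyncio.gather(*(get_page_text(session, url) for url in urls))
        # Parse once everything has been downloaded
        for page_text in pages:
            print(page_text, "start parsing this page")

    start = time.time()
    asyncio.run(main())
    print('Total time:', time.time() - start)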
  • Original article: https://www.cnblogs.com/kenD/p/12269620.html