• aiohttp Notes


    Introduction

    aiohttp requires Python 3.5.3 or later. It can serve not only as a client-side crawler but also as a server; built on asyncio and coroutines, it is very efficient.
    Official documentation
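
    For orientation, here is a minimal client fetch (a sketch against the aiohttp 3.x client API; the URL is only an example):

    import asyncio
    from aiohttp import ClientSession


    async def main():
        # One session can serve many requests; here it fetches a single page
        async with ClientSession() as session:
            async with session.get('https://www.sina.com.cn') as response:
                body = await response.read()
                print(response.status, len(body))


    asyncio.get_event_loop().run_until_complete(main())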

    Crawling templates

    One batch, crawled in one go

    import asyncio
    import logging
    import time
    from aiohttp import ClientSession, ClientTimeout
    
    
    logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s in %(filename)s.%(funcName)s: %(message)s')
    
    # Default request headers
    HEADERS = {
        'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36',
    }
    
    # Default timeout (seconds)
    TIMEOUT = 15
    
    
    class AioCrawl:
    
        def __init__(self):
            self.logger = logging.getLogger(__name__)
    
        async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
            """采集纤程"""
    
            method = 'POST' if method.upper() == 'POST' else 'GET'
            headers = headers if headers else HEADERS
            timeout = ClientTimeout(total=timeout)
            cookies = cookies if cookies else None
            data = data if data and isinstance(data, dict) else {}
    
            async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
                try:
                    if method == 'GET':
                        async with session.get(url) as response:
                            return await response.read()
                    else:
                        async with session.post(url, data=data) as response:
                            return await response.read()
                except Exception as e:
                    raise e
    
        def prepare_fetch(self, urls):
            """准备future_list"""
            return [asyncio.ensure_future(self.fetch(url)) for url in urls]
    
        def crawl_batch_urls(self, urls):
            """执行采集"""
            future_list = self.prepare_fetch(urls)
    
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asyncio.wait(future_list))
    
            self.logger.info('Finished a batch: {}'.format(len(urls)))
            return future_list
    
    
    if __name__ == '__main__':
        a = AioCrawl()
        # takes roughly 2-4 seconds
        t0 = time.time()
        future_list = a.crawl_batch_urls(['https://www.sina.com.cn' for _ in range(5)])
        print(time.time() - t0)
    
        for future in future_list:
            if future.exception():
                print(future.exception())
            else:
                print(len(future.result()))
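
    For large batches you will usually want to cap concurrency. The sketch below shows one way, assuming a single shared session per batch and an asyncio.Semaphore guard; the limit of 10 and the names bounded_fetch/crawl are illustrative, not part of the template above:

    import asyncio
    from aiohttp import ClientSession

    sem = asyncio.Semaphore(10)  # at most 10 requests in flight (arbitrary cap)


    async def bounded_fetch(session, url):
        # The semaphore suspends this coroutine while 10 fetches are already running
        async with sem:
            async with session.get(url) as response:
                return await response.read()


    async def crawl(urls):
        # Reuse one session for the whole batch instead of one per request
        async with ClientSession() as session:
            tasks = [bounded_fetch(session, url) for url in urls]
            return await asyncio.gather(*tasks, return_exceptions=True)


    results = asyncio.get_event_loop().run_until_complete(
        crawl(['https://www.sina.com.cn' for _ in range(20)]))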
    

    Dynamically adding tasks

    import asyncio
    import time
    from threading import Thread
    
    from aiohttp import ClientSession, ClientTimeout
    
    
    # Default request headers
    HEADERS = {
        'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.100 Safari/537.36',
    }
    
    # Default timeout (seconds)
    TIMEOUT = 15
    
    
    def start_loop(loop):
        """驱动事件循环"""
        asyncio.set_event_loop(loop)
        loop.run_forever()
    
    
    async def fetch(url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None):
        """采集纤程"""
        print(url)
        method = 'POST' if method.upper() == 'POST' else 'GET'
        headers = headers if headers else HEADERS
        timeout = ClientTimeout(total=timeout)
        cookies = cookies if cookies else None
        data = data if data and isinstance(data, dict) else {}
        async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
            try:
                if method == 'GET':
                    async with session.get(url) as response:
                        content = await response.read()
                        return response.status, content
                else:
                    async with session.post(url, data=data) as response:
                        content = await response.read()
                        return response.status, content
            except Exception as e:
                raise e
    
    
    def callback(future):
        """回调函数"""
        try:
            print(future.result())
        except Exception as e:
            print(e)
        print(type(future))
        print(future)
    
    
    if __name__ == '__main__':
        # Start the event loop in a background thread
        loop = asyncio.new_event_loop()
        t = Thread(target=start_loop, args=(loop,))
        t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
        t.start()
    
        f = asyncio.run_coroutine_threadsafe(fetch('https://www.sina.com.cn'), loop)
        f.add_done_callback(callback)  # attach a callback to the future
    
        time.sleep(5)  # give the daemon loop thread time to finish, otherwise the result is never seen
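
    Note that asyncio.run_coroutine_threadsafe returns a concurrent.futures.Future, so instead of sleeping, the submitting thread can also block on the result directly. A minimal sketch, reusing fetch and loop from the script above:

    # Instead of time.sleep(5): block until the coroutine finishes or times out
    f = asyncio.run_coroutine_threadsafe(fetch('https://www.sina.com.cn'), loop)
    try:
        status, content = f.result(timeout=30)  # blocks the calling thread
        print(status, len(content))
    except Exception as e:
        print(e)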
    
    

    Dynamically adding tasks, wrapped in a class

    import asyncio
    import logging
    import time
    from threading import Thread
    from aiohttp import ClientSession, ClientTimeout, TCPConnector
    
    
    # Default request headers
    HEADERS = {
        'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    
    
    # Default timeout (seconds)
    TIMEOUT = 15
    
    
    def start_loop(loop):
        asyncio.set_event_loop(loop)
        loop.run_forever()
    
    
    class AioCrawl:
    
        def __init__(self):
            self.logger = logging.getLogger(__name__)
    
            # Start the event loop in a background thread
            self.event_loop = asyncio.new_event_loop()
            self.t = Thread(target=start_loop, args=(self.event_loop,))
            self.t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
            self.t.start()
    
            self.concurrent = 0  # number of in-flight requests
    
        async def fetch(self, url, method='GET', headers=None, timeout=TIMEOUT, cookies=None, data=None, proxy=None):
            """采集纤程
            :param url: str
            :param method: 'GET' or 'POST'
            :param headers: dict()
            :param timeout: int
            :param cookies:
            :param data: dict()
            :param proxy: str
            :return: (status, content)
            """
    
            method = 'POST' if method.upper() == 'POST' else 'GET'
            headers = headers if headers else HEADERS
            timeout = ClientTimeout(total=timeout)
            cookies = cookies if cookies else None
            data = data if data and isinstance(data, dict) else {}
    
            tcp_connector = TCPConnector(ssl=False)  # disable certificate verification (verify_ssl= is deprecated in aiohttp 3.x)
            async with ClientSession(headers=headers, timeout=timeout, cookies=cookies, connector=tcp_connector) as session:
                try:
                    if method == 'GET':
                        async with session.get(url, proxy=proxy) as response:
                            content = await response.read()
                            return response.status, content
                    else:
                        async with session.post(url, data=data, proxy=proxy) as response:
                            content = await response.read()
                            return response.status, content
                except Exception as e:
                    raise e
    
        def callback(self, future):
            """回调函数
            1.处理并转换成Result对象
            2.写数据库
            """
            msg = str(future.exception()) if future.exception() else 'success'
            code = 1 if msg == 'success' else 0
            status = future.result()[0] if code == 1 else None
            data = future.result()[1] if code == 1 else b''  # empty bytes
    
            data_len = len(data) if data else 0
            if code == 0 or (status is not None and status != 200):  # log minor failures
                self.logger.warning('<url="{}", code={}, msg="{}", status={}, data(len):{}>'.format(
                    future.url, code, msg, status, data_len))
    
            self.concurrent -= 1  # decrement the in-flight counter
    
            print(len(data))
    
        def add_tasks(self, tasks):
            """Add tasks
            :param tasks: list of URL strings
            """
            for task in tasks:
                # asyncio.run_coroutine_threadsafe takes a coroutine object and an event loop
                future = asyncio.run_coroutine_threadsafe(self.fetch(task), self.event_loop)
                future.url = task  # stash the URL on the future so callback() can log it
                future.add_done_callback(self.callback)  # attach a callback to the future
                self.concurrent += 1  # increment the in-flight counter
    
    
    if __name__ == '__main__':
        a = AioCrawl()
    
        for _ in range(5):
            a.add_tasks(['https://www.sina.com.cn' for _ in range(2)])  # simulate dynamically added tasks
            time.sleep(1)
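
    The self.concurrent counter also makes it possible to wait for outstanding work before the main thread exits. A hypothetical wait_idle helper (my own addition, not part of the class above) could poll it; since the counter is touched from two threads, a real system should guard it with a threading.Lock or hand results through a queue:

    def wait_idle(crawler, poll=0.1):
        # Busy-wait until every submitted fetch has run its callback
        while crawler.concurrent > 0:
            time.sleep(poll)


    a = AioCrawl()
    a.add_tasks(['https://www.sina.com.cn' for _ in range(5)])
    wait_idle(a)  # returns once self.concurrent drops back to 0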
    
    
  • Original source: https://www.cnblogs.com/haoabcd2010/p/10615364.html