• aiohttp使用队列


    获取百度的搜索结果,然后将结果中的百度长链接解析为真实的URL

    import time
    import aiofiles
    import aiohttp
    import asyncio
    from lxml import etree
    from asyncio import Queue
    from itertools import product
    import async_timeout
    
    # Number of concurrent worker coroutines (asyncio tasks, not OS threads).
    MAX_THREADS = 50
    
    
    class BaiduSpider:
        """Crawl Baidu search result pages for a list of keywords and
        resolve each result's redirect link to its real target URL,
        appending ``title,url`` lines to ``links.txt``.
        """

        def __init__(self):
            # Plain browser UA so Baidu serves the normal HTML result page.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                              "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
            self.q = Queue()    # search-result-page URLs waiting to be fetched
            self.q2 = Queue()   # [title, redirect-link] pairs waiting to be resolved
            # Real URLs already written to links.txt.  The original checked
            # `url not in 'links.txt'` — a substring test against the literal
            # file NAME, which never deduplicated anything.
            self.seen = set()

        def url_generator(self):
            """Yield a Baidu search URL for every keyword in keyword.txt
            crossed with result pages 0-4."""
            with open('keyword.txt', 'r', encoding='utf8') as f:
                for key in product(f, range(0, 5)):
                    yield f"https://www.baidu.com/s?wd={key[0].strip()}&pn={key[1]}"

        async def fetch(self, session, url):
            """GET *url* and return the body text, or None on timeout/error."""
            try:
                # async_timeout must be entered with `async with` (the sync
                # `with` form was removed in async_timeout 4.x).
                async with async_timeout.timeout(1):
                    async with session.get(url, headers=self.headers) as resp:
                        if resp.status in (200, 201):
                            return await resp.text()
            except (asyncio.TimeoutError, aiohttp.ClientError):
                # Best-effort crawler: silently skip pages that fail.
                return None

        async def work(self, session):
            """Worker coroutine: drain the URL queue, fetching and parsing
            each search-result page."""
            while not self.q.empty():
                url = await self.q.get()
                html = await self.fetch(session, url)
                await self.parser(session, html)
                self.q.task_done()

        async def parser(self, session, html):
            """Extract (title, redirect-link) pairs from one result page and
            queue them for resolution."""
            if not html:
                return
            tree = etree.HTML(html)
            for node in tree.xpath('//h3[@class="t"]/a'):
                title = node.xpath('string(.)')
                link = node.xpath('@href')[0]
                # Blank the TITLE when it is missing, keeping the link.
                # (The original `[title, link if title else '']` blanked
                # the link instead, due to conditional-expression scope.)
                self.q2.put_nowait([title if title else '', link])
            await self.work2(session)

        async def work2(self, session):
            """Follow each queued redirect link and append new real URLs
            to links.txt."""
            while not self.q2.empty():
                title, link = await self.q2.get()
                try:
                    async with async_timeout.timeout(1):
                        async with session.get(link, headers=self.headers) as resp2:
                            real_url = str(resp2.url)
                            print(resp2.url, title)
                            if real_url not in self.seen:
                                self.seen.add(real_url)
                                async with aiofiles.open('links.txt', 'a', encoding='utf-8') as fd:
                                    # The original had a literal newline inside
                                    # the f-string — a syntax error; use \n.
                                    await fd.write(f"{title},{real_url}\n")
                except (asyncio.TimeoutError, aiohttp.ClientError):
                    pass  # best effort: ignore unreachable redirect targets

        async def download(self):
            """Fill the URL queue, then run MAX_THREADS workers that share
            one client session."""
            # `verify_ssl=` is deprecated in aiohttp; `ssl=False` disables
            # certificate verification (prevents SSL errors on bad certs).
            conn = aiohttp.TCPConnector(ssl=False)
            for url in self.url_generator():
                self.q.put_nowait(url)
            async with aiohttp.ClientSession(connector=conn) as session:
                tasks = [asyncio.ensure_future(self.work(session)) for _ in range(MAX_THREADS)]
                await asyncio.wait(tasks)

        def run(self):
            """Entry point: run the whole crawl and report elapsed time."""
            start_time = time.time()
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.download())
            print(f'全程用时{time.time() - start_time}秒')
    
    
    if __name__ == '__main__':
        baidu = BaiduSpider()
        # run() performs the crawl for its side effects and returns None,
        # so binding its result (the original `items = baidu.run()`) was
        # misleading.
        baidu.run()
    
    
  • 相关阅读:
    基于JavaFXWJFXGameEngine游戏引擎介绍与进度
    进程线程与cpu绑定
    [Vim练级攻略] Vim基础操作
    hdu 2159 fate
    python的httplib注意事项
    SQL优化总结
    项目整体开发流程以及配置人员
    hdu 1010 解题报告 Tempter of the Bone
    在centos搭建git服务器时,不小心把/home/git目录删除了,我是怎么恢复的
    int 和bigint差别有多大?
  • 原文地址:https://www.cnblogs.com/c-x-a/p/10668977.html
Copyright © 2020-2023  润新知