• Implementing a high-concurrency crawler with Tornado
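
  The script below combines Tornado's AsyncHTTPClient with a
  tornado.queues.Queue to build a small producer-consumer crawler: worker
  coroutines pull URLs off the shared queue, fetch each page, extract its
  links with pyquery, and push the same-site ones back onto the queue, while
  a seen_set ensures no URL is fetched twice.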


    from pyquery import PyQuery as pq
    from tornado import ioloop, gen, httpclient, queues
    from urllib.parse import urljoin
    
    base_url = "http://www.baidu.com"
    concurrency = 8
    
    
    async def get_url_links(url):
        """Fetch a page and return its links, resolved to absolute URLs."""
        response = await httpclient.AsyncHTTPClient().fetch(url)
        html = response.body.decode("utf-8")
        p = pq(html)
        links = []
        for tag_a in p("a").items():
            href = tag_a.attr("href")
            # skip anchors without an href (urljoin would silently turn
            # them into base_url itself)
            if href:
                links.append(urljoin(base_url, href))
        return links
    
    
    async def main():
        seen_set = set()
        q = queues.Queue()
    
        async def fetch_url(current_url):
            if current_url in seen_set:
                return
    
            print(f"获取:{current_url}")
            seen_set.add(current_url)
    
            next_urls = await get_url_links(current_url)
            for next_url in next_urls:
                # follow only links that stay on the crawled site
                if next_url.startswith(base_url):
                    await q.put(next_url)
    
        async def worker():
            async for url in q:
                if url is None:  # shutdown sentinel from main()
                    return
                try:
                    await fetch_url(url)
                except Exception as e:
                    print(f"exception: {e}")
                finally:
                    # every put() increments the queue's counter; mark each
                    # item done so that q.join() can eventually unblock
                    q.task_done()
    
        # seed the queue with the start URL
        await q.put(base_url)
    
        # start the consumers: one worker coroutine per unit of concurrency
        workers = gen.multi([worker() for _ in range(concurrency)])
    
        # blocks until every queued URL has been marked task_done()
        await q.join()
    
        # send one shutdown sentinel per worker so each loop exits
        for _ in range(concurrency):
            await q.put(None)
    
        # wait for all worker coroutines to finish
        await workers
    
    
    if __name__ == '__main__':
        ioloop.IOLoop.current().run_sync(main)
    """
    fetching: http://www.baidu.com
    fetching: http://www.baidu.com/gaoji/preferences.html
    fetching: http://www.baidu.com/
    fetching: http://www.baidu.com/more/
    fetching: http://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=
    fetching: http://www.baidu.com/cache/sethelp/help.html
    fetching: http://www.baidu.com/duty/
    fetching: http://www.baidu.com/search/jiqiao.html
    fetching: http://www.baidu.com#iec
    fetching: http://www.baidu.com#circle
    fetching: http://www.baidu.com#aoyouc
    fetching: http://www.baidu.com#sougouc
    fetching: http://www.baidu.com#qqtc
    fetching: http://www.baidu.com#ttc
    fetching: http://www.baidu.com#ffc
    fetching: http://www.baidu.com#chromec
    fetching: http://www.baidu.com#jishu360c
    fetching: http://www.baidu.com#world_jishuc
    fetching: http://www.baidu.com#operac
    fetching: http://www.baidu.com#worldc
    fetching: http://www.baidu.com#safaric
    fetching: http://www.baidu.com#greenc
    fetching: http://www.baidu.com#krc
    fetching: http://www.baidu.com#bdbrowserc
    fetching: http://www.baidu.com/duty/index.html
    fetching: http://www.baidu.com/copyright.html
    fetching: http://www.baidu.com/mianze-shengming.html
    fetching: http://www.baidu.com/right.html
    fetching: http://www.baidu.com/yinsiquan-policy.html
    fetching: http://www.baidu.com/yinsiquan-sub.html
    fetching: http://www.baidu.com/baozhang.html
    fetching: http://www.baidu.com/index
    fetching: http://www.baidu.com/search?keywords=%E7%99%BE%E5%BA%A6%E5%B8%90%E5%8F%B7%E8%A2%AB%E5%B0%81%E7%A6%81
    fetching: http://www.baidu.com/search?keywords=%E5%A6%82%E4%BD%95%E4%B8%BE%E6%8A%A5%E7%BD%91%E7%AB%99
    fetching: http://www.baidu.com/search?keywords=%E6%8A%95%E8%AF%89%E4%BE%B5%E6%9D%83%E4%BF%A1%E6%81%AF
    fetching: http://www.baidu.com/search?keywords=%E7%99%BE%E5%BA%A6%E5%B8%90%E5%8F%B7%E8%A2%AB%E7%9B%97
    fetching: http://www.baidu.com/search?keywords=%E5%B8%90%E5%8F%B7%E7%94%B3%E8%AF%89%E6%9C%AA%E9%80%9A%E8%BF%87
    fetching: http://www.baidu.com/search?keywords=%E8%B4%B4%E5%90%A7%E8%B4%B4%E5%AD%90%E8%A2%AB%E5%88%A0
    fetching: http://www.baidu.com/search?keywords=%E5%88%A0%E9%99%A4%2F%E6%9B%B4%E6%96%B0%E5%BF%AB%E7%85%A7
    fetching: http://www.baidu.com/zhifu
    fetching: http://www.baidu.com/jubao
    fetching: http://www.baidu.com/statement
    fetching: http://www.baidu.com/personalinformation
    fetching: http://www.baidu.com/more/index.html
    fetching: http://www.baidu.com/search/jubao.html
    """
    
  • Original article: https://www.cnblogs.com/traditional/p/11326710.html