• Python multithreading, thread pool, and coroutine crawlers


    Multithreaded producer-consumer crawler: crawler threads take URLs off url_queue and push the raw HTML onto html_queue, while parser threads drain html_queue and write the parsed results to a file.

    import queue
    
    import requests
    from bs4 import BeautifulSoup
    import threading
    import time
    import random
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")
        return [(link["href"], link.get_text()) for link in links]
    
    
    def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
        while True:
            url = url_queue.get()
            html = craw(url)
            html_queue.put(html)
            print(threading.current_thread().name, url)
            time.sleep(random.randint(1,2))
    
    
    def do_parse(html_queue: queue.Queue, f_out):
        while True:
            html = html_queue.get()
            results = parse(html)
            for result in results:
                f_out.write(str(result) + "\n")
            print(threading.current_thread().name, html_queue.qsize())
            time.sleep(1)
    
    
    if __name__ == '__main__':
        url_queue = queue.Queue()
        html_queue = queue.Queue()
        # Note: "#p{}" is a URL fragment, which requests never sends to the server,
        # so every one of these requests actually returns the cnblogs front page
        for url in ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]:
            url_queue.put(url)
    
        for idx in range(3):
            t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw-{idx}")
            t.start()
    
        # utf-8 so Chinese titles write cleanly; the parser threads never exit,
        # so the handle stays open for the life of this demo process
        file = open("02.data.txt", "w", encoding="utf-8")
        for idx in range(2):
            d = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse-{idx}")
            d.start()
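
    Both do_craw and do_parse loop forever, so the script above never exits on its own. Below is a minimal shutdown sketch, assuming None is used as a poison-pill sentinel; the sentinel mechanism and the worker function are illustrative additions, not part of the original crawler:

    import queue
    import threading

    def worker(in_q: queue.Queue, out_q: queue.Queue):
        while True:
            item = in_q.get()
            if item is None:  # sentinel: no more work, exit the loop
                break
            out_q.put(item.upper())  # stand-in for craw()/parse()

    in_q, out_q = queue.Queue(), queue.Queue()
    for s in ["a", "b", "c"]:
        in_q.put(s)

    threads = [threading.Thread(target=worker, args=(in_q, out_q)) for _ in range(2)]
    for t in threads:
        t.start()
    for _ in threads:
        in_q.put(None)  # one sentinel per worker so every thread gets one
    for t in threads:
        t.join()
    print(list(out_q.queue))  # all processed items, order not guaranteed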
    

    Thread pool crawler: ThreadPoolExecutor.map fetches every URL, then submit/as_completed parses each page as soon as its future completes.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import requests
    from bs4 import BeautifulSoup
    
    spider_url = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")
        return [(link["href"], link.get_text()) for link in links]
    
    
    # craw
    with ThreadPoolExecutor() as pool:
        htmls = pool.map(craw, spider_url)
        htmls = list(zip(spider_url, htmls))
        for k, v in htmls:
            print(k, len(v))
    
    
    with ThreadPoolExecutor() as pool:
        futures = {}
        for url, html in htmls:
            future = pool.submit(parse, html)
            futures[future] = url
    
        # Iterating futures.items() would wait for results in submission order;
        # as_completed yields each future as soon as it finishes:
        # for k, v in futures.items():
        #     print(v, k.result())
        for future in as_completed(futures):
            print(futures[future], future.result())
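
    A Future's result() re-raises any exception thrown inside the worker, so one failed request would crash the loop above. A minimal sketch of guarding against that, using a hypothetical may_fail worker in place of craw(); the try/except pattern is an addition to the original:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def may_fail(url):
        # stand-in for craw(): raise for one URL to show error propagation
        if "bad" in url:
            raise ValueError("fetch failed: " + url)
        return len(url)

    with ThreadPoolExecutor(max_workers=4) as pool:
        futs = {pool.submit(may_fail, u): u for u in ["https://ok", "https://bad"]}
        for fut in as_completed(futs):
            try:
                print(futs[fut], fut.result())  # result() re-raises worker exceptions
            except ValueError as e:
                print("skipped:", e)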
    

    Coroutine crawler: aiohttp issues all requests on a single-threaded event loop, with an asyncio.Semaphore capping concurrency at 10.

    import asyncio
    import aiohttp
    
    spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"]*50
    
    # A semaphore caps how many requests run concurrently
    semaphore = asyncio.Semaphore(10)
    
    
    async def async_craw(url):
        async with semaphore:
            print("craw url:", url)
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    result = await resp.text()
                    print(url, len(result))
    
    
    # get_event_loop() works here on Python 3.7-3.9; see the asyncio.run sketch below
    loop = asyncio.get_event_loop()
    tasks = [
        loop.create_task(async_craw(item)) for item in spider_url
    ]
    loop.run_until_complete(asyncio.wait(tasks))
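
    On Python 3.10+ asyncio.get_event_loop() outside a running coroutine is deprecated. A minimal modern variant, assuming aiohttp is installed and reusing one ClientSession for all requests; the shared session is an addition to the original:

    import asyncio
    import aiohttp

    async def fetch(session, sem, url):
        async with sem:  # cap the number of in-flight requests
            async with session.get(url) as resp:
                body = await resp.text()
                print(url, len(body))

    async def main(urls):
        sem = asyncio.Semaphore(10)  # created while the loop is running
        async with aiohttp.ClientSession() as session:  # one session, many requests
            await asyncio.gather(*(fetch(session, sem, u) for u in urls))

    asyncio.run(main(["https://www.cnblogs.com/"] * 5))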
    
    