• aiohttp crawler template, written as a class


    import asyncio
    import aiohttp
    import async_timeout
    from lxml import html
    from timeit import default_timer as timer

    from db import DBData


    class Crawler:
        def __init__(self, **kwargs):
            self.domains = kwargs["domains"]
            self.max_depth = kwargs["max_depth"]
            self.max_retries = 3
            self.max_workers = 10
            self.Q = asyncio.Queue()        # URLs waiting to be fetched
            self.db_Q = asyncio.Queue()     # URLs waiting to be written to the DB
            self.cache = set()              # URLs already visited
            self.count = 0
            self.loop = asyncio.get_event_loop()
            self.db_data = DBData()

            # Clear any URLs left over from a previous crawl
            self.db_data.clear_crawler()

        async def get(self, url, timeout):
            # Recent async_timeout releases only support the async context
            # manager form, hence "async with" rather than plain "with"
            async with async_timeout.timeout(timeout):
                async with self.session.get(url) as response:
                    return await response.text()

        async def extract_urls(self, url, timeout=10):
            tree = html.fromstring(await self.get(url, timeout))
            # To search only within self.domains, add this filter to the
            # comprehension: if any(domain in p for domain in self.domains)
            return {p for p in tree.xpath("//a/@href")}

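        # A drop-in variant (hypothetical, not part of the original post):
        # extract_urls above returns hrefs exactly as written in the page,
        # so relative links such as "/news/x.html" are queued unresolved
        # and fail on the next fetch. urllib.parse.urljoin would fix that:
        #
        #     from urllib.parse import urljoin
        #
        #     async def extract_urls(self, url, timeout=10):
        #         tree = html.fromstring(await self.get(url, timeout))
        #         return {urljoin(url, p) for p in tree.xpath("//a/@href")}
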
        async def worker(self):
            while True:
                url, depth, retries = await self.Q.get()
                if url in self.cache:
                    self.db_Q.put_nowait(url)
                    self.Q.task_done()
                    continue
                try:
                    new_urls = await self.extract_urls(url)
                except Exception as e:
                    if retries <= self.max_retries:
                        # Re-queue the failed URL with its retry count bumped
                        self.Q.put_nowait((url, depth, retries + 1))
                    else:
                        print("Error in %s: %s" % (url, repr(e)))
                else:
                    self.cache.add(url)
                    self.count += 1
                    self.db_Q.put_nowait(url)
                    print("Depth: %s Retry: %s Visited: %s" % (depth, retries, url))
                    if depth + 1 <= self.max_depth:
                        # Newly discovered URLs start with a fresh retry count
                        for x in new_urls:
                            self.Q.put_nowait((x, depth + 1, 0))
                self.Q.task_done()

        async def run(self):
            # The loop= argument is deprecated in aiohttp 3.x; the session
            # picks up the running event loop on its own
            async with aiohttp.ClientSession() as session:
                self.session = session
                workers = [self.worker() for _ in range(self.max_workers)]
                workers += [self.write_to_db() for _ in range(self.max_workers)]
                tasks = [self.loop.create_task(x) for x in workers]
                # Grace period kept from the original post; Q.join() below
                # already blocks until every queued URL is processed, so
                # this sleep could likely be dropped
                await asyncio.sleep(5)
                await self.Q.join()
                await self.db_Q.join()
                for task in tasks:
                    task.cancel()

        def start(self):
            for domain in self.domains:
                print("Crawling %s start..." % domain)

                self.Q.put_nowait((domain, 0, 0))
                start_time = timer()
                self.loop.run_until_complete(self.run())
                runtime = timer() - start_time

                print("Crawling %s end. Exec time: %s. Requests: %s" % (
                    domain, runtime, self.count))
            # Close the loop only after every domain has been crawled;
            # closing it inside the loop would break the next iteration's
            # run_until_complete call
            self.loop.close()

        async def write_to_db(self):
            while True:
                address = await self.db_Q.get()
                if await self.db_data.check_url(address) is None:
                    self.db_data.add_url(address)
                    print("Write to DB: %s" % address)
                self.db_Q.task_done()


    if __name__ == "__main__":
        options = {
            "domains": ["https://www.yahoo.com/news/"],
            "max_depth": 1
        }
        c = Crawler(**options)
        c.start()
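
The db module imported at the top is not shown in the post. Below is a minimal sketch of what a matching DBData helper might look like, assuming a sqlite3 backing store; the method names clear_crawler, check_url, and add_url come from the calls above (note check_url is awaited, so it must be a coroutine), while everything else here is an assumption.

    import sqlite3


    class DBData:
        """Hypothetical sqlite3-backed store matching the calls in Crawler."""

        def __init__(self, path="crawler.db"):
            self.conn = sqlite3.connect(path)
            self.conn.execute(
                "CREATE TABLE IF NOT EXISTS urls (address TEXT PRIMARY KEY)")
            self.conn.commit()

        def clear_crawler(self):
            # Drop all URLs stored by a previous crawl
            self.conn.execute("DELETE FROM urls")
            self.conn.commit()

        async def check_url(self, address):
            # Coroutine because Crawler awaits it; returns None for new URLs
            return self.conn.execute(
                "SELECT address FROM urls WHERE address = ?",
                (address,)).fetchone()

        def add_url(self, address):
            self.conn.execute(
                "INSERT OR IGNORE INTO urls (address) VALUES (?)", (address,))
            self.conn.commit()

A real implementation would likely use an async driver such as aiosqlite so the event loop is never blocked, but this sketch mirrors the mixed sync/async calls the crawler actually makes.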
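With loop.close() moved outside the for loop in start (see above), one Crawler instance can be pointed at several start pages in a single run. A small usage sketch; the second URL is purely illustrative:

    options = {
        "domains": [
            "https://www.yahoo.com/news/",
            "https://www.yahoo.com/finance/",  # hypothetical second start page
        ],
        "max_depth": 1
    }
    Crawler(**options).start()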
• Original article: https://www.cnblogs.com/zhongshuiping/p/10172497.html