import asyncio
from timeit import default_timer as timer
from urllib.parse import urljoin

import aiohttp
import async_timeout
from lxml import html

from db import DBData


class Crawler:
    def __init__(self, **kwargs):
        self.domains = kwargs["domains"]
        self.max_depth = kwargs["max_depth"]
        self.max_retries = 3
        self.max_workers = 10
        self.Q = asyncio.Queue()      # URLs waiting to be crawled
        self.db_Q = asyncio.Queue()   # URLs waiting to be written to the DB
        self.cache = set()            # URLs already visited
        self.count = 0                # number of successful requests
        self.loop = asyncio.get_event_loop()
        self.db_data = DBData()

        # Clear any data left over from a previous crawl
        self.db_data.clear_crawler()

    async def get(self, url, timeout):
        # self.session is created in run() before any worker starts
        async with async_timeout.timeout(timeout):
            async with self.session.get(url) as response:
                return await response.text()

    async def extract_urls(self, url, timeout=10):
        tree = html.fromstring(await self.get(url, timeout))
        # Resolve relative links and keep only those inside the configured domains
        return {urljoin(url, p) for p in tree.xpath("//a/@href")
                if any(domain in urljoin(url, p) for domain in self.domains)}

    async def worker(self):
        # Consume URLs from the queue, fetch them and enqueue newly found links
        while True:
            url, depth, retries = await self.Q.get()
            if url in self.cache:
                self.db_Q.put_nowait(url)
                self.Q.task_done()
                continue
            try:
                new_urls = await self.extract_urls(url)
            except Exception as e:
                if retries <= self.max_retries:
                    # Re-queue the URL for another attempt
                    self.Q.put_nowait((url, depth, retries + 1))
                else:
                    print("Error in %s: %s" % (url, repr(e)))
            else:
                self.cache.add(url)
                self.count += 1
                self.db_Q.put_nowait(url)
                print("Depth: %s Retry: %s Visited: %s" % (depth, retries, url))
                if depth + 1 <= self.max_depth:
                    for x in new_urls:
                        # Child URLs start with a fresh retry count
                        self.Q.put_nowait((x, depth + 1, 0))
            self.Q.task_done()

    async def run(self):
        async with aiohttp.ClientSession() as session:
            self.session = session
            workers = [self.worker() for _ in range(self.max_workers)]
            workers += [self.write_to_db() for _ in range(self.max_workers)]
            tasks = [self.loop.create_task(x) for x in workers]
            await self.Q.join()
            await self.db_Q.join()
            for task in tasks:
                task.cancel()
            # Let the cancelled tasks finish before the session is closed
            await asyncio.gather(*tasks, return_exceptions=True)

    def start(self):
        for domain in self.domains:
            print("Crawling %s start..." % domain)
            self.Q.put_nowait((domain, 0, 0))

        start_time = timer()
        self.loop.run_until_complete(self.run())
        self.loop.close()
        runtime = timer() - start_time

        print("Crawling %s end. Exec time: %s. Requests: %s" % (
            ", ".join(self.domains), runtime, self.count))

    async def write_to_db(self):
        # Persist visited URLs that are not yet stored in the database
        while True:
            address = await self.db_Q.get()
            if await self.db_data.check_url(address) is None:
                self.db_data.add_url(address)
                print("Write to DB: %s" % address)
            self.db_Q.task_done()


if __name__ == "__main__":
    options = {
        "domains": ["https://www.yahoo.com/news/"],
        "max_depth": 1
    }
    c = Crawler(**options)
    c.start()