Basic usage

from twisted.web.client import getPage, defer
from twisted.internet import reactor

# Basic usage
def all_done(contents):
    # Runs once every crawler has finished; stops the event loop
    reactor.stop()

def callback(contents):
    # Runs automatically as each crawler's result arrives
    print(contents)

deferred_list = list()
url_list = ['http://www.bing.com', 'http://www.baidu.com']
for url in url_list:
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()
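Note that getPage is deprecated in newer Twisted releases. A minimal sketch of the same fan-out pattern using Agent and readBody (both also in twisted.web.client), with the same placeholder URLs:

from twisted.web.client import Agent, readBody
from twisted.internet import reactor, defer

def all_done(arg):
    reactor.stop()

def on_body(body):
    print(body)

agent = Agent(reactor)
deferred_list = []
for url in [b'http://www.bing.com', b'http://www.baidu.com']:
    d = agent.request(b'GET', url)  # fires with a Response object
    d.addCallback(readBody)         # fires with the body bytes
    d.addCallback(on_body)
    deferred_list.append(d)

defer.DeferredList(deferred_list).addBoth(all_done)
reactor.run()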
Decorator-based approach 1

from twisted.web.client import getPage, defer
from twisted.internet import reactor

# Decorator-based approach 1
def all_done(arg):
    reactor.stop()

def onedone(response):
    print(response)

# Three ingredients: the decorator, a Deferred object, and yield
@defer.inlineCallbacks
def task(url):
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(onedone)
    yield deferred

deferred_list = list()
url_list = ['http://www.bing.com', 'http://www.baidu.com']
for url in url_list:
    deferred = task(url)
    deferred_list.append(deferred)

dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()
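The three ingredients cooperate like this: yield hands the Deferred to inlineCallbacks, which suspends the generator until that Deferred fires, leaving the reactor free to run other work. A minimal sketch that demonstrates the suspension with twisted.internet.task.deferLater standing in for a real download (the task names and delays are made up):

from twisted.internet import reactor, defer
from twisted.internet.task import deferLater

@defer.inlineCallbacks
def timed_task(name, delay):
    # yield suspends this generator; the reactor keeps serving other tasks
    yield deferLater(reactor, delay, lambda: None)
    print(name, 'resumed after', delay, 'seconds')

d1 = timed_task('a', 1)
d2 = timed_task('b', 2)
defer.DeferredList([d1, d2]).addBoth(lambda _: reactor.stop())
reactor.run()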
Decorator-based approach 2

from twisted.web.client import getPage, defer
from twisted.internet import reactor

# Decorator-based approach 2: the yields make the downloads sequential
def all_done(arg):
    reactor.stop()

def onedone(response):
    print(response)

@defer.inlineCallbacks
def task():
    deferred_1 = getPage(bytes('http://www.baidu.com', encoding='utf8'))
    deferred_1.addCallback(onedone)
    yield deferred_1

    deferred_2 = getPage(bytes('http://www.bing.com', encoding='utf8'))
    deferred_2.addCallback(onedone)
    yield deferred_2

ret = task()
ret.addBoth(all_done)
reactor.run()
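Because the second getPage is not issued until the first Deferred has fired, the two downloads run one after the other. Inside an inlineCallbacks generator, yield also evaluates to the value the Deferred fired with, so the separate onedone callback can be dropped. A minimal sketch of the same two sequential fetches:

from twisted.web.client import getPage, defer
from twisted.internet import reactor

@defer.inlineCallbacks
def task():
    # yield returns the fired value, here the response body bytes
    body_1 = yield getPage(bytes('http://www.baidu.com', encoding='utf8'))
    print(body_1)
    body_2 = yield getPage(bytes('http://www.bing.com', encoding='utf8'))
    print(body_2)

task().addBoth(lambda _: reactor.stop())
reactor.run()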
Decorator-based approach 3

from twisted.web.client import getPage, defer
from twisted.internet import reactor

# Decorator-based approach 3: a loop that never ends
def all_done(arg):
    reactor.stop()

def onedone(response):
    print(response)

@defer.inlineCallbacks
def task():
    deferred_1 = getPage(bytes('http://www.bing.com', encoding='utf8'))
    deferred_1.addCallback(onedone)
    yield deferred_1

    stop_deferred = defer.Deferred()  # a task that can never complete
    # stop_deferred.callback(None)    # a hand-made Deferred; firing it is how the wait would end
    yield stop_deferred

ret = task()
ret.addBoth(all_done)
reactor.run()  # run() is the event loop
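The commented-out line hints at the shutdown mechanism: whoever holds a reference to stop_deferred can fire it and let task() finish. A minimal sketch that keeps the reference at module level and fires it from a timer (the five-second delay is arbitrary):

from twisted.web.client import getPage, defer
from twisted.internet import reactor

stop_deferred = defer.Deferred()

@defer.inlineCallbacks
def task():
    body = yield getPage(bytes('http://www.bing.com', encoding='utf8'))
    print(body)
    yield stop_deferred  # suspended here until someone fires stop_deferred

reactor.callLater(5, stop_deferred.callback, None)  # fire it after 5 seconds
task().addBoth(lambda _: reactor.stop())
reactor.run()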
Decorator-based approach 4

from twisted.web.client import defer, getPage
from twisted.internet import reactor

# Decorator-based: stop the event loop once every download is done
running_list = list()
stop_deferred = None

def all_done(arg):
    reactor.stop()

def onedone(response, url):
    print(response)
    running_list.remove(url)

def check_empty(response):
    if not running_list:
        stop_deferred.callback(None)

@defer.inlineCallbacks
def task(url):
    # Create stop_deferred before the download can fire, so that
    # check_empty has something to call back when the list empties
    global stop_deferred
    stop_deferred = defer.Deferred()

    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(onedone, url)
    deferred.addCallback(check_empty)
    yield deferred
    yield stop_deferred

running_list.append('http://www.baidu.com')
ret = task('http://www.baidu.com')
ret.addBoth(all_done)
reactor.run()
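The same bookkeeping generalizes to several concurrent downloads if the stop Deferred is created once up front and fired only when running_list drains. A minimal sketch under those assumptions:

from twisted.web.client import getPage, defer
from twisted.internet import reactor

running_list = []
stop_deferred = defer.Deferred()  # created once, shared by every task

def onedone(response, url):
    print(url, len(response), 'bytes')
    running_list.remove(url)
    if not running_list:  # last download finished: release the waiter
        stop_deferred.callback(None)

@defer.inlineCallbacks
def task(url):
    running_list.append(url)
    deferred = getPage(bytes(url, encoding='utf8'))
    deferred.addCallback(onedone, url)
    yield deferred

@defer.inlineCallbacks
def main():
    for url in ['http://www.baidu.com', 'http://www.bing.com']:
        task(url)  # start each download without waiting on it
    yield stop_deferred

main().addBoth(lambda _: reactor.stop())
reactor.run()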
Decorator-based approach 5

from twisted.web.client import getPage, defer
from twisted.internet import reactor

class ExecutionEngine(object):

    def __init__(self):
        # Create stop_deferred up front so check_empty can fire it even if
        # the last download finishes before close_spider gets to run
        self.stop_deferred = defer.Deferred()
        self.running_list = list()

    def one_done(self, response, url):
        print(response)
        self.running_list.remove(url)

    def check_empty(self, response):
        if not self.running_list:
            self.stop_deferred.callback(None)

    @defer.inlineCallbacks
    def open_spider(self, url):
        deferred = getPage(bytes(url, encoding='utf8'))
        deferred.addCallback(self.one_done, url)
        deferred.addCallback(self.check_empty)
        yield deferred

    @defer.inlineCallbacks
    def close_spider(self, url):
        yield self.stop_deferred

@defer.inlineCallbacks
def task(url):
    engine = ExecutionEngine()
    engine.running_list.append(url)
    yield engine.open_spider(url)
    yield engine.close_spider(url)

def all_done(arg):
    reactor.stop()

if __name__ == "__main__":
    ret = task("http://www.bing.com")
    ret.addBoth(all_done)
    reactor.run()
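One engine can also supervise several downloads at once: register every URL in running_list, open a spider per URL without waiting, then block once in close_spider. A minimal usage sketch, assuming the ExecutionEngine class defined above:

from twisted.web.client import defer
from twisted.internet import reactor

@defer.inlineCallbacks
def multi_task(urls):
    engine = ExecutionEngine()       # the class from the snippet above
    engine.running_list.extend(urls)
    for url in urls:
        engine.open_spider(url)      # kick off each download; don't wait here
    yield engine.close_spider(None)  # resumes once running_list is empty

if __name__ == "__main__":
    ret = multi_task(['http://www.bing.com', 'http://www.baidu.com'])
    ret.addBoth(lambda _: reactor.stop())
    reactor.run()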
Miniscrapy: a first look at Scrapy's source code

from twisted.web.client import getPage, defer
from twisted.internet import reactor
import queue
import types


class Request(object):
    def __init__(self, url, callback=None):
        self.url = url
        self.callback = callback


class Response(object):
    def __init__(self, body, request):
        self.body = body
        self.request = request
        self.url = request.url

    @property
    def text(self):
        return self.body.decode('utf8')


class Scheduler(object):
    def __init__(self, engine):
        self.q = queue.Queue()
        self.engine = engine

    def enqueue_request(self, request):
        self.q.put(request)

    def next_request(self):
        try:
            req = self.q.get(block=False)
        except queue.Empty:
            req = None
        return req

    def size(self):
        return self.q.qsize()


class ExecutionEngine(object):
    def __init__(self):
        self._closewait = None
        self.running = True
        self.start_requests = None
        self.scheduler = Scheduler(self)
        self.inprogress = set()

    def check_empty(self, response):
        if not self.running:
            self._closewait.callback(None)

    def _next_request(self):
        # Drain the spider's start_requests generator into the scheduler
        while self.start_requests:
            try:
                request = next(self.start_requests)
            except StopIteration:
                self.start_requests = None
            else:
                self.scheduler.enqueue_request(request)

        # Issue downloads up to the maximum concurrency of 5
        while len(self.inprogress) < 5 and self.scheduler.size() > 0:
            request = self.scheduler.next_request()
            if not request:
                break
            self.inprogress.add(request)
            d = getPage(bytes(request.url, encoding='utf8'))
            d.addBoth(self._handle_downloader_output, request)
            d.addBoth(lambda x, req: self.inprogress.remove(req), request)
            d.addBoth(lambda x: self._next_request())

        if len(self.inprogress) == 0 and self.scheduler.size() == 0:
            self._closewait.callback(None)

    def _handle_downloader_output(self, body, request):
        # Wrap the body in a Response, run the request's callback, and
        # enqueue any Requests the callback yields back
        response = Response(body, request)
        func = request.callback or self.spider.parse
        gen = func(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                self.scheduler.enqueue_request(req)

    @defer.inlineCallbacks
    def start(self):
        self._closewait = defer.Deferred()
        yield self._closewait

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests):
        self.start_requests = start_requests
        self.spider = spider
        yield None  # a yield is needed to make this function a generator
        reactor.callLater(0, self._next_request)


class Crawler(object):
    def __init__(self, spider_cls):
        self.spider_cls = spider_cls
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self):
        self.engine = ExecutionEngine()
        self.spider = self.spider_cls()
        start_requests = iter(self.spider.start_requests())
        yield self.engine.open_spider(self.spider, start_requests)
        yield self.engine.start()


class CrawlerProcess(object):
    def __init__(self):
        self._active = set()
        self.crawlers = set()

    def crawl(self, spider_cls, *args, **kwargs):
        crawler = Crawler(spider_cls)
        self.crawlers.add(crawler)
        d = crawler.crawl(*args, **kwargs)
        self._active.add(d)
        return d

    def start(self):
        d = defer.DeferredList(self._active)
        d.addBoth(self._stop_reactor)
        reactor.run()

    def _stop_reactor(self, _=None):
        reactor.stop()


class Spider(object):
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)


class BaiduSpider(Spider):
    name = 'baidu'
    start_urls = ['http://www.baidu.com']

    def parse(self, response):
        print(response.text)


class BingSpider(Spider):
    name = 'bing'
    start_urls = ['http://www.bing.com']

    def parse(self, response):
        print(response.text)


if __name__ == "__main__":
    spider_cls_list = [BaiduSpider, BingSpider]
    crawler_process = CrawlerProcess()
    for spider_cls in spider_cls_list:
        crawler_process.crawl(spider_cls)
    crawler_process.start()
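Because _handle_downloader_output feeds any Requests a callback yields straight back into the scheduler, a spider can chain from one page to the next. A minimal sketch reusing the Spider, Request, and CrawlerProcess classes above; the follow-up URL and the parse_detail name are made up for illustration:

class ChainSpider(Spider):
    name = 'chain'
    start_urls = ['http://www.bing.com']  # placeholder start page

    def parse(self, response):
        print('index page:', response.url)
        # Yielding a Request puts it back on the scheduler's queue
        yield Request('http://www.baidu.com', callback=self.parse_detail)

    def parse_detail(self, response):
        # No yield here, so the crawl ends after this page
        print('detail page:', response.url)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(ChainSpider)
    process.start()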