• Scrapy Study Notes 23 - Distributed Crawlers


    Distributed crawling with scrapy-redis
    Problems a distributed crawler must solve
      Centralized management of the request queue
      Centralized deduplication
      Centralized storage of results
     
    Implementing a distributed crawler with scrapy-redis
      Open-source project on GitHub: https://github.com/rmax/scrapy-redis
     
    Install the required dependencies
    pip install redis
    
    pip install scrapy
    pip install scrapy-redis
    Usage
    Configure settings.py
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 300
    }
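    The three settings above hand scheduling, deduplication and item storage over to redis, which is what lets multiple crawler processes cooperate. A slightly fuller settings.py sketch is shown below; the redis connection values and SCHEDULER_PERSIST are standard scrapy-redis options, while the host/port shown are placeholders for your shared redis server:
    # settings.py - minimal scrapy-redis configuration sketch
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 300,
    }
    REDIS_HOST = 'localhost'      # the shared redis instance all workers point at
    REDIS_PORT = 6379
    # REDIS_URL = 'redis://:password@hostname:6379/0'   # alternative to host/port
    SCHEDULER_PERSIST = True      # keep the queue and dupefilter in redis between runs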
    Create a spider module in the project's spiders directory
    # Based on the basic Spider
    
    from scrapy_redis.spiders import RedisSpider
    
    class MySpider(RedisSpider):
        """Spider that reads urls from redis queue (myspider:start_urls)."""
        name = 'myspider_redis'
        redis_key = 'myspider:start_urls'
    
        def __init__(self, *args, **kwargs):
            # Dynamically define the allowed domains list.
            domain = kwargs.pop('domain', '')
            self.allowed_domains = filter(None, domain.split(','))
            super(MySpider, self).__init__(*args, **kwargs)
    
        def parse(self, response):
            return {
                'name': response.css('title::text').extract_first(),
                'url': response.url,
            }
    
    # Based on CrawlSpider
    
    from scrapy.spiders import Rule
    from scrapy.linkextractors import LinkExtractor
    
    from scrapy_redis.spiders import RedisCrawlSpider
    
    
    class MyCrawler(RedisCrawlSpider):
        """Spider that reads urls from redis queue (myspider:start_urls)."""
        name = 'mycrawler_redis'
        redis_key = 'mycrawler:start_urls'
    
        rules = (
            # follow all links
            Rule(LinkExtractor(), callback='parse_page', follow=True),
        )
    
        def __init__(self, *args, **kwargs):
            # Dynamically define the allowed domains list.
            domain = kwargs.pop('domain', '')
            self.allowed_domains = filter(None, domain.split(','))
            super(MyCrawler, self).__init__(*args, **kwargs)
    
        def parse_page(self, response):
            return {
                'name': response.css('title::text').extract_first(),
                'url': response.url,
            }
    Run the distributed spider
    scrapy runspider myspider.py
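    Both example spiders above take an optional domain argument; assuming Scrapy's standard -a spider-argument flag, it can be passed at launch (example.com is a placeholder):
    scrapy runspider myspider.py -a domain=example.com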
    Seed the redis start-URLs key
    redis-cli lpush myspider:start_urls http://google.com
    
    # scrapy-redis pops the first URL to crawl from the myspider:start_urls key
    Keys that scrapy-redis creates in redis by default
    myspider:requests
    
    myspider:dupefilter
    Running
      Alternatively, run it like a regular Scrapy project via a main entry file; a sketch of such a file follows below
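    A minimal sketch of that entry file, assuming a regular Scrapy project in which the spider above is registered as myspider_redis (adjust the name to your own spider):
    # main.py - launch the redis-backed spider from an IDE or a script
    import os
    import sys
    
    from scrapy.cmdline import execute
    
    # make sure the project package is importable when run directly
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    execute(['scrapy', 'crawl', 'myspider_redis'])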
     
     
    scrapy-redis source code walkthrough
    connection.py and defaults.py
      Build and configure the redis client object
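    A small sketch of how that client can be obtained programmatically; get_redis_from_settings and the REDIS_HOST/REDIS_PORT settings belong to scrapy-redis, while the values shown are placeholders:
    # probe the redis connection that scrapy-redis will use
    from scrapy.settings import Settings
    from scrapy_redis.connection import get_redis_from_settings
    
    settings = Settings({'REDIS_HOST': 'localhost', 'REDIS_PORT': 6379})
    server = get_redis_from_settings(settings)  # a redis client instance
    server.ping()                               # raises if redis is unreachable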
     
    dupefilter.py
    # The scrapy-redis dupefilter and Scrapy's own dupefilter derive from the same base class (BaseDupeFilter) and implement the same interface
    
    import logging
    import time
    
    from scrapy.dupefilters import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint
    
    from . import defaults
    from .connection import get_redis_from_settings
    
    
    logger = logging.getLogger(__name__)
    
    
    # TODO: Rename class to RedisDupeFilter.
    class RFPDupeFilter(BaseDupeFilter):
        """Redis-based request duplicates filter.
    
        This class can also be used with default Scrapy's scheduler.
    
        """
    
        logger = logger
    
        def __init__(self, server, key, debug=False):
            """Initialize the duplicates filter.
    
            Parameters
            ----------
            server : redis.StrictRedis
                The redis server instance.
            key : str
                Redis key Where to store fingerprints.
            debug : bool, optional
                Whether to log filtered requests.
    
            """
            self.server = server  # instantiated in connection.py; this is a redis client object
            self.key = key
            self.debug = debug
            self.logdupes = True
    
        @classmethod
        def from_settings(cls, settings):
            """Returns an instance from given settings.
    
            This uses by default the key ``dupefilter:<timestamp>``. When using the
            ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
            it needs to pass the spider name in the key.
    
            Parameters
            ----------
            settings : scrapy.settings.Settings
    
            Returns
            -------
            RFPDupeFilter
                A RFPDupeFilter instance.
    
    
            """
            server = get_redis_from_settings(settings)
            # XXX: This creates one-time key. needed to support to use this
            # class as standalone dupefilter with scrapy's default scheduler
            # if scrapy passes spider on open() method this wouldn't be needed
            # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
            key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)
    
        @classmethod
        def from_crawler(cls, crawler):
            """Returns instance from crawler.
    
            Parameters
            ----------
            crawler : scrapy.crawler.Crawler
    
            Returns
            -------
            RFPDupeFilter
                Instance of RFPDupeFilter.
    
            """
            return cls.from_settings(crawler.settings)
    
        def request_seen(self, request):
            """Returns True if request was already seen.
    
            Parameters
            ----------
            request : scrapy.http.Request
    
            Returns
            -------
            bool
    
            """
            fp = self.request_fingerprint(request)
            # This returns the number of values added, zero if already exists.
            added = self.server.sadd(self.key, fp)
            return added == 0
    
        def request_fingerprint(self, request):
            """Returns a fingerprint for a given request.
    
            Parameters
            ----------
            request : scrapy.http.Request
    
            Returns
            -------
            str
    
            """
            return request_fingerprint(request)
    
        @classmethod
        def from_spider(cls, spider):
            settings = spider.settings
            server = get_redis_from_settings(settings)
            dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
            key = dupefilter_key % {'spider': spider.name}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)
    
        def close(self, reason=''):
            """Delete data on close. Called by Scrapy's scheduler.
    
            Parameters
            ----------
            reason : str, optional
    
            """
            self.clear()
    
        def clear(self):
            """Clears fingerprints data."""
            self.server.delete(self.key)
    
        def log(self, request, spider):
            """Logs given request.
    
            Parameters
            ----------
            request : scrapy.http.Request
            spider : scrapy.spiders.Spider
    
            """
            if self.debug:
                msg = "Filtered duplicate request: %(request)s"
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            elif self.logdupes:
                msg = ("Filtered duplicate request %(request)s"
                    " - no more duplicates will be shown"
                    " (see DUPEFILTER_DEBUG to show all duplicates)")
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
                self.logdupes = False
    picklecompat.py
    # This just wraps Python's pickle module, a serializer that works on both py2 and py3. It is mainly used later by the scheduler's queue to store request objects in redis
    
    """A pickle wrapper module with protocol=-1 by default."""
    
    try:
        import cPickle as pickle  # PY2
    except ImportError:
        import pickle
    
    
    def loads(s):
        return pickle.loads(s)
    
    
    def dumps(obj):
        return pickle.dumps(obj, protocol=-1)
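    A quick round-trip sketch of the serializer; this is the same dumps/loads pair that queue.Base._encode_request and _decode_request call before pushing requests to redis (the dict below is just an illustration):
    from scrapy_redis import picklecompat
    
    obj = {'url': 'http://example.com', 'method': 'GET'}
    raw = picklecompat.dumps(obj)            # bytes, highest pickle protocol
    assert picklecompat.loads(raw) == obj    # lossless round trip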
    pipelines.py
    # Serializes items and pushes them asynchronously (via deferToThread) into a redis list
    
    # This pipeline is optional; you can write your own pipeline to keep the data locally instead (see the sketch after the source listing below)
    
    from scrapy.utils.misc import load_object
    from scrapy.utils.serialize import ScrapyJSONEncoder
    from twisted.internet.threads import deferToThread
    
    from . import connection, defaults
    
    
    default_serialize = ScrapyJSONEncoder().encode
    
    
    class RedisPipeline(object):
        """Pushes serialized item into a redis list/queue
    
        Settings
        --------
        REDIS_ITEMS_KEY : str
            Redis key where to store items.
        REDIS_ITEMS_SERIALIZER : str
            Object path to serializer function.
    
        """
    
        def __init__(self, server,
                    key=defaults.PIPELINE_KEY,
                    serialize_func=default_serialize):
            """Initialize pipeline.
    
            Parameters
            ----------
            server : StrictRedis
                Redis client instance.
            key : str
                Redis key where to store items.
            serialize_func : callable
                Items serializer function.
    
            """
            self.server = server
            self.key = key
            self.serialize = serialize_func
    
        @classmethod
        def from_settings(cls, settings):
            params = {
                'server': connection.from_settings(settings),
            }
            if settings.get('REDIS_ITEMS_KEY'):
                params['key'] = settings['REDIS_ITEMS_KEY']
            if settings.get('REDIS_ITEMS_SERIALIZER'):
                params['serialize_func'] = load_object(
                    settings['REDIS_ITEMS_SERIALIZER']
                )
    
            return cls(**params)
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls.from_settings(crawler.settings)
    
        def process_item(self, item, spider):
            return deferToThread(self._process_item, item, spider)
    
        def _process_item(self, item, spider):
            key = self.item_key(item, spider)
            data = self.serialize(item)
            self.server.rpush(key, data)
            return item
    
        def item_key(self, item, spider):
            """Returns redis key based on given spider.
    
            Override this function to use a different key depending on the item
            and/or spider.
    
            """
            return self.key % {'spider': spider.name}
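    A sketch of such a local alternative; JsonLinesPipeline is a hypothetical name, not part of scrapy-redis, and it appends each item to a local .jl file instead of pushing it to redis:
    import codecs
    import json
    
    
    class JsonLinesPipeline(object):
        """Writes each item to a local JSON-lines file instead of redis."""
    
        def open_spider(self, spider):
            self.file = codecs.open('items.jl', 'a', encoding='utf-8')
    
        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item
    
        def close_spider(self, spider):
            self.file.close()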
    queue.py
    # Three queue types are defined here: a FIFO queue, a priority queue, and a LIFO queue (stack); the settings sketch after the listing shows how to pick one
    
    from scrapy.utils.reqser import request_to_dict, request_from_dict
    
    from . import picklecompat
    
    
    class Base(object):
        """Per-spider base queue class"""
    
        def __init__(self, server, spider, key, serializer=None):
            """Initialize per-spider redis queue.
    
            Parameters
            ----------
            server : StrictRedis
                Redis client instance.
            spider : Spider
                Scrapy spider instance.
            key: str
                Redis key where to put and get messages.
            serializer : object
                Serializer object with ``loads`` and ``dumps`` methods.
    
            """
            if serializer is None:
                # Backward compatibility.
                # TODO: deprecate pickle.
                serializer = picklecompat
            if not hasattr(serializer, 'loads'):
                raise TypeError("serializer does not implement 'loads' function: %r"
                                % serializer)
            if not hasattr(serializer, 'dumps'):
                raise TypeError("serializer '%s' does not implement 'dumps' function: %r"
                                % serializer)
    
            self.server = server
            self.spider = spider
            self.key = key % {'spider': spider.name}
            self.serializer = serializer
    
        def _encode_request(self, request):
            """Encode a request object"""
            obj = request_to_dict(request, self.spider)
            return self.serializer.dumps(obj)
    
        def _decode_request(self, encoded_request):
            """Decode an request previously encoded"""
            obj = self.serializer.loads(encoded_request)
            return request_from_dict(obj, self.spider)
    
        def __len__(self):
            """Return the length of the queue"""
            raise NotImplementedError
    
        def push(self, request):
            """Push a request"""
            raise NotImplementedError
    
        def pop(self, timeout=0):
            """Pop a request"""
            raise NotImplementedError
    
        def clear(self):
            """Clear queue/stack"""
            self.server.delete(self.key)
    
    
    class FifoQueue(Base):
        """Per-spider FIFO queue"""
    
        def __len__(self):
            """Return the length of the queue"""
            return self.server.llen(self.key)
    
        def push(self, request):
            """Push a request"""
            self.server.lpush(self.key, self._encode_request(request))
    
        def pop(self, timeout=0):
            """Pop a request"""
            if timeout > 0:
                data = self.server.brpop(self.key, timeout)
                if isinstance(data, tuple):
                    data = data[1]
            else:
                data = self.server.rpop(self.key)
            if data:
                return self._decode_request(data)
    
    
    class PriorityQueue(Base):
        """Per-spider priority queue abstraction using redis' sorted set"""
    
        def __len__(self):
            """Return the length of the queue"""
            return self.server.zcard(self.key)
    
        def push(self, request):
            """Push a request"""
            data = self._encode_request(request)
            score = -request.priority
            # We don't use zadd method as the order of arguments change depending on
            # whether the class is Redis or StrictRedis, and the option of using
            # kwargs only accepts strings, not bytes.
            self.server.execute_command('ZADD', self.key, score, data)
    
        def pop(self, timeout=0):
            """
            Pop a request
            timeout not support in this queue class
            """
            # use atomic range/remove using multi/exec
            pipe = self.server.pipeline()
            pipe.multi()
            pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
            results, count = pipe.execute()
            if results:
                return self._decode_request(results[0])
    
    
    class LifoQueue(Base):
        """Per-spider LIFO queue."""
    
        def __len__(self):
            """Return the length of the stack"""
            return self.server.llen(self.key)
    
        def push(self, request):
            """Push a request"""
            self.server.lpush(self.key, self._encode_request(request))
    
        def pop(self, timeout=0):
            """Pop a request"""
            if timeout > 0:
                data = self.server.blpop(self.key, timeout)
                if isinstance(data, tuple):
                    data = data[1]
            else:
                data = self.server.lpop(self.key)
    
            if data:
                return self._decode_request(data)
    
    
    # TODO: Deprecate the use of these names.
    SpiderQueue = FifoQueue
    SpiderStack = LifoQueue
    SpiderPriorityQueue = PriorityQueue
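    A short settings sketch for choosing between the three queues; the class paths are the ones defined in this module, and PriorityQueue is the scrapy-redis default:
    # settings.py - pick the scheduler queue implementation
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'   # default, ordered by request.priority
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'     # breadth-first style crawl
    # SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'     # depth-first style crawl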
    scheduler.py
    # The two key methods are enqueue_request and next_request
    
    import importlib
    import six
    
    from scrapy.utils.misc import load_object
    
    from . import connection, defaults
    
    
    # TODO: add SCRAPY_JOB support.
    class Scheduler(object):
        """Redis-based scheduler
    
        Settings
        --------
        SCHEDULER_PERSIST : bool (default: False)
            Whether to persist or clear redis queue.
        SCHEDULER_FLUSH_ON_START : bool (default: False)
            Whether to flush redis queue on start.
        SCHEDULER_IDLE_BEFORE_CLOSE : int (default: 0)
            How many seconds to wait before closing if no message is received.
        SCHEDULER_QUEUE_KEY : str
            Scheduler redis key.
        SCHEDULER_QUEUE_CLASS : str
            Scheduler queue class.
        SCHEDULER_DUPEFILTER_KEY : str
            Scheduler dupefilter redis key.
        SCHEDULER_DUPEFILTER_CLASS : str
            Scheduler dupefilter class.
        SCHEDULER_SERIALIZER : str
            Scheduler serializer.
    
        """
    
        def __init__(self, server,
                    persist=False,
                    flush_on_start=False,
                    queue_key=defaults.SCHEDULER_QUEUE_KEY,
                    queue_cls=defaults.SCHEDULER_QUEUE_CLASS,
                    dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY,
                    dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS,
                    idle_before_close=0,
                    serializer=None):
            """Initialize scheduler.
    
            Parameters
            ----------
            server : Redis
                The redis server instance.
            persist : bool
                Whether to flush requests when closing. Default is False.
            flush_on_start : bool
                Whether to flush requests on start. Default is False.
            queue_key : str
                Requests queue key.
            queue_cls : str
                Importable path to the queue class.
            dupefilter_key : str
                Duplicates filter key.
            dupefilter_cls : str
                Importable path to the dupefilter class.
            idle_before_close : int
                Timeout before giving up.
    
            """
            if idle_before_close < 0:
                raise TypeError("idle_before_close cannot be negative")
    
            self.server = server
            self.persist = persist
            self.flush_on_start = flush_on_start
            self.queue_key = queue_key
            self.queue_cls = queue_cls
            self.dupefilter_cls = dupefilter_cls
            self.dupefilter_key = dupefilter_key
            self.idle_before_close = idle_before_close
            self.serializer = serializer
            self.stats = None
    
        def __len__(self):
            return len(self.queue)
    
        @classmethod
        def from_settings(cls, settings):
            kwargs = {
                'persist': settings.getbool('SCHEDULER_PERSIST'),
                'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'),
                'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'),
            }
    
            # If these values are missing, it means we want to use the defaults.
            optional = {
                # TODO: Use custom prefixes for this settings to note that are
                # specific to scrapy-redis.
                'queue_key': 'SCHEDULER_QUEUE_KEY',
                'queue_cls': 'SCHEDULER_QUEUE_CLASS',
                'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY',
                # We use the default setting name to keep compatibility.
                'dupefilter_cls': 'DUPEFILTER_CLASS',
                'serializer': 'SCHEDULER_SERIALIZER',
            }
            for name, setting_name in optional.items():
                val = settings.get(setting_name)
                if val:
                    kwargs[name] = val
    
            # Support serializer as a path to a module.
            if isinstance(kwargs.get('serializer'), six.string_types):
                kwargs['serializer'] = importlib.import_module(kwargs['serializer'])
    
            server = connection.from_settings(settings)
            # Ensure the connection is working.
            server.ping()
    
            return cls(server=server, **kwargs)
    
        @classmethod
        def from_crawler(cls, crawler):
            instance = cls.from_settings(crawler.settings)
            # FIXME: for now, stats are only supported from this constructor
            instance.stats = crawler.stats
            return instance
    
        def open(self, spider):
            self.spider = spider
    
            try:
                self.queue = load_object(self.queue_cls)(
                    server=self.server,
                    spider=spider,
                    key=self.queue_key % {'spider': spider.name},
                    serializer=self.serializer,
                )
            except TypeError as e:
                raise ValueError("Failed to instantiate queue class '%s': %s",
                                self.queue_cls, e)
    
            self.df = load_object(self.dupefilter_cls).from_spider(spider)
    
            if self.flush_on_start:
                self.flush()
            # notice if there are requests already in the queue to resume the crawl
            if len(self.queue):
                spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
    
        def close(self, reason):
            if not self.persist:
                self.flush()
    
        def flush(self):
            self.df.clear()
            self.queue.clear()
    
        def enqueue_request(self, request):
            if not request.dont_filter and self.df.request_seen(request):
                self.df.log(request, self.spider)
                return False
            if self.stats:
                self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
            self.queue.push(request)
            return True
    
        def next_request(self):
            block_pop_timeout = self.idle_before_close
            request = self.queue.pop(block_pop_timeout)
            if request and self.stats:
                self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
            return request
    
        def has_pending_requests(self):
            return len(self) > 0
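    The scheduler's behaviour is driven by the settings documented in its docstring above; a short sketch of the most common toggles (the values are only an example):
    # settings.py - scheduler behaviour
    SCHEDULER_PERSIST = True           # keep the queue and dupefilter in redis after the spider closes
    SCHEDULER_FLUSH_ON_START = False   # do not wipe the redis queue when the spider starts
    SCHEDULER_IDLE_BEFORE_CLOSE = 0    # passed to queue.pop() as the blocking timeout, in seconds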

    utils.py

    # Compatibility helpers for python2 and python3
    
    import six
    
    
    def bytes_to_str(s, encoding='utf-8'):
        """Returns a str if a bytes object is given."""
        if six.PY3 and isinstance(s, bytes):
            return s.decode(encoding)
        return s

    spiders.py

    # RedisSpider and RedisCrawlSpider combine RedisMixin with Spider and CrawlSpider respectively
    
    # RedisMixin overrides start_requests() and uses its own next_requests() method to pull start URLs from redis; the sketch after the listing shows how to customize the way those messages are decoded
    
    from scrapy import signals
    from scrapy.exceptions import DontCloseSpider
    from scrapy.spiders import Spider, CrawlSpider
    
    from . import connection, defaults
    from .utils import bytes_to_str
    
    
    class RedisMixin(object):
        """Mixin class to implement reading urls from a redis queue."""
        redis_key = None
        redis_batch_size = None
        redis_encoding = None
    
        # Redis client placeholder.
        server = None
    
        def start_requests(self):
            """Returns a batch of start requests from redis."""
            return self.next_requests()
    
        def setup_redis(self, crawler=None):
            """Setup redis connection and idle signal.
    
            This should be called after the spider has set its crawler object.
            """
            if self.server is not None:
                return
    
            if crawler is None:
                # We allow optional crawler argument to keep backwards
                # compatibility.
                # XXX: Raise a deprecation warning.
                crawler = getattr(self, 'crawler', None)
    
            if crawler is None:
                raise ValueError("crawler is required")
    
            settings = crawler.settings
    
            if self.redis_key is None:
                self.redis_key = settings.get(
                    'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
                )
    
            self.redis_key = self.redis_key % {'name': self.name}
    
            if not self.redis_key.strip():
                raise ValueError("redis_key must not be empty")
    
            if self.redis_batch_size is None:
                # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
                self.redis_batch_size = settings.getint(
                    'REDIS_START_URLS_BATCH_SIZE',
                    settings.getint('CONCURRENT_REQUESTS'),
                )
    
            try:
                self.redis_batch_size = int(self.redis_batch_size)
            except (TypeError, ValueError):
                raise ValueError("redis_batch_size must be an integer")
    
            if self.redis_encoding is None:
                self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
    
            self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                            "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                            self.__dict__)
    
            self.server = connection.from_settings(crawler.settings)
            # The idle signal is called when the spider has no requests left,
            # that's when we will schedule new requests from redis queue
            crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    
        def next_requests(self):
            """Returns a request to be scheduled or none."""
            use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
            fetch_one = self.server.spop if use_set else self.server.lpop
            # XXX: Do we need to use a timeout here?
            found = 0
            # TODO: Use redis pipeline execution.
            while found < self.redis_batch_size:
                data = fetch_one(self.redis_key)
                if not data:
                    # Queue empty.
                    break
                req = self.make_request_from_data(data)
                if req:
                    yield req
                    found += 1
                else:
                    self.logger.debug("Request not made from data: %r", data)
    
            if found:
                self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
    
        def make_request_from_data(self, data):
            """Returns a Request instance from data coming from Redis.
    
            By default, ``data`` is an encoded URL. You can override this method to
            provide your own message decoding.
    
            Parameters
            ----------
            data : bytes
                Message from redis.
    
            """
            url = bytes_to_str(data, self.redis_encoding)
            return self.make_requests_from_url(url)
    
        def schedule_next_requests(self):
            """Schedules a request if available"""
            # TODO: While there is capacity, schedule a batch of redis requests.
            for req in self.next_requests():
                self.crawler.engine.crawl(req, spider=self)
    
        def spider_idle(self):
            """Schedules a request if available, otherwise waits."""
            # XXX: Handle a sentinel to close the spider.
            self.schedule_next_requests()
            raise DontCloseSpider
    
    
    class RedisSpider(RedisMixin, Spider):
        """Spider that reads urls from redis queue when idle.
    
        Attributes
        ----------
        redis_key : str (default: REDIS_START_URLS_KEY)
            Redis key where to fetch start URLs from..
        redis_batch_size : int (default: CONCURRENT_REQUESTS)
            Number of messages to fetch from redis on each attempt.
        redis_encoding : str (default: REDIS_ENCODING)
            Encoding to use when decoding messages from redis queue.
    
        Settings
        --------
        REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
            Default Redis key where to fetch start URLs from..
        REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
            Default number of messages to fetch from redis on each attempt.
        REDIS_START_URLS_AS_SET : bool (default: False)
            Use SET operations to retrieve messages from the redis queue. If False,
            the messages are retrieve using the LPOP command.
        REDIS_ENCODING : str (default: "utf-8")
            Default encoding to use when decoding messages from redis queue.
    
        """
    
        @classmethod
        def from_crawler(self, crawler, *args, **kwargs):
            obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
            obj.setup_redis(crawler)
            return obj
    
    
    class RedisCrawlSpider(RedisMixin, CrawlSpider):
        """Spider that reads urls from redis queue when idle.
    
        Attributes
        ----------
        redis_key : str (default: REDIS_START_URLS_KEY)
            Redis key where to fetch start URLs from..
        redis_batch_size : int (default: CONCURRENT_REQUESTS)
            Number of messages to fetch from redis on each attempt.
        redis_encoding : str (default: REDIS_ENCODING)
            Encoding to use when decoding messages from redis queue.
    
        Settings
        --------
        REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
            Default Redis key where to fetch start URLs from..
        REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
            Default number of messages to fetch from redis on each attempt.
        REDIS_START_URLS_AS_SET : bool (default: True)
            Use SET operations to retrieve messages from the redis queue.
        REDIS_ENCODING : str (default: "utf-8")
            Default encoding to use when decoding messages from redis queue.
    
        """
    
        @classmethod
        def from_crawler(self, crawler, *args, **kwargs):
            obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
            obj.setup_redis(crawler)
            return obj
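    By default make_request_from_data treats every redis message as a bare URL. A common customization (a sketch assuming the producer pushes JSON messages; the spider name and payload layout are placeholders) is to override it:
    import json
    
    from scrapy import Request
    from scrapy_redis.spiders import RedisSpider
    
    
    class JsonMessageSpider(RedisSpider):
        """Reads JSON messages such as {"url": "...", "meta": {...}} from redis."""
        name = 'json_message_spider'
        redis_key = 'json_message_spider:start_urls'
    
        def make_request_from_data(self, data):
            # data is the raw bytes popped from redis
            payload = json.loads(data.decode(self.redis_encoding))
            return Request(payload['url'], meta=payload.get('meta', {}))
    
        def parse(self, response):
            yield {'url': response.url}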
    Integrating a Bloom filter into scrapy-redis
    Install the required modules
    pip install mmh3
    
    pip install redis
    Write a bloom-filter class in utils to implement deduplication
    # -*- coding: utf-8 -*-
    import mmh3
    import redis
    import math
    import time
    
    
    class PyBloomFilter(object):
        # 100 built-in random seeds
        SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
                344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
                465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
                481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
                63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]
    
        # capacity: the estimated number of values to deduplicate
        # error_rate: the acceptable false-positive rate
        # conn: the redis connection client
        # key: the prefix for the key names used in redis
        def __init__(self, capacity=1000000000, error_rate=0.00000001, conn=None, key='BloomFilter'):
            self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate))      # total number of bits needed
            self.k = math.ceil(math.log(2)*self.m/capacity)                             # optimal number of hash functions, k = (m/n)*ln(2)
            self.mem = math.ceil(self.m/8/1024/1024)                                    # memory needed, in MB
            self.blocknum = math.ceil(self.mem/512)                                     # number of 512MB bitmap blocks; the value's first character must be ASCII, so there are at most 256 blocks
            self.seeds = self.SEEDS[0:self.k]
            self.key = key
            self.N = 2**31-1
            self.redis = conn
            # print(self.mem)
            # print(self.k)
    
        def add(self, value):
            name = self.key + "_" + str(ord(value[0])%self.blocknum)
            hashs = self.get_hashs(value)
            for hash in hashs:
                self.redis.setbit(name, hash, 1)
    
        def is_exist(self, value):
            name = self.key + "_" + str(ord(value[0])%self.blocknum)
            hashs = self.get_hashs(value)
            exist = True
            for hash in hashs:
                exist = exist & self.redis.getbit(name, hash)
            return exist
    
        def get_hashs(self, value):
            hashs = list()
            for seed in self.seeds:
                hash = mmh3.hash(value, seed)
                if hash >= 0:
                    hashs.append(hash)
                else:
                    hashs.append(self.N - hash)
            return hashs
    
    
    pool = redis.ConnectionPool(host='192.168.1.1', port=6379, db=0)
    conn = redis.StrictRedis(connection_pool=pool)
    
    
    if __name__ == "__main__":
        bf = PyBloomFilter(conn=conn)
        bf.add('www.jobbole.com')
        bf.add('www.zhihu.com')
        print(bf.is_exist('www.zhihu.com'))
        print(bf.is_exist('www.lagou.com'))
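    To get a feel for the default parameters, a rough back-of-the-envelope check, assuming the standard formulas m = n*log2(e)*log2(1/p) and k = (m/n)*ln(2) used above:
    import math
    
    n, p = 1000000000, 0.00000001
    m = math.ceil(n * math.log2(math.e) * math.log2(1 / p))  # ~3.83e10 bits in total
    k = math.ceil(math.log(2) * m / n)                       # 27 hash functions per value
    mem = math.ceil(m / 8 / 1024 / 1024)                     # ~4571 MB of redis bitmaps
    blocks = math.ceil(mem / 512)                            # 9 bitmap keys of up to 512 MB each
    print(m, k, mem, blocks)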
    Modify the dupefilter source so that it uses our custom deduplication strategy
    import logging
    import time
    
    from scrapy.dupefilters import BaseDupeFilter
    from scrapy.utils.request import request_fingerprint
    
    from . import defaults
    from .connection import get_redis_from_settings
    from ScrapyRedisTest.utils.bloomfilter import conn, PyBloomFilter
    
    
    logger = logging.getLogger(__name__)
    
    
    # TODO: Rename class to RedisDupeFilter.
    class RFPDupeFilter(BaseDupeFilter):
        """Redis-based request duplicates filter.
    
        This class can also be used with default Scrapy's scheduler.
    
        """
    
        logger = logger
    
        def __init__(self, server, key, debug=False):
            """Initialize the duplicates filter.
    
            Parameters
            ----------
            server : redis.StrictRedis
                The redis server instance.
            key : str
                Redis key Where to store fingerprints.
            debug : bool, optional
                Whether to log filtered requests.
    
            """
            self.server = server
            self.key = key
            self.debug = debug
            self.logdupes = True
    
            self.bf = PyBloomFilter(conn=conn, key=key)
    
        @classmethod
        def from_settings(cls, settings):
            """Returns an instance from given settings.
    
            This uses by default the key ``dupefilter:<timestamp>``. When using the
            ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
            it needs to pass the spider name in the key.
    
            Parameters
            ----------
            settings : scrapy.settings.Settings
    
            Returns
            -------
            RFPDupeFilter
                A RFPDupeFilter instance.
    
    
            """
            server = get_redis_from_settings(settings)
            # XXX: This creates one-time key. needed to support to use this
            # class as standalone dupefilter with scrapy's default scheduler
            # if scrapy passes spider on open() method this wouldn't be needed
            # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
            key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)
    
        @classmethod
        def from_crawler(cls, crawler):
            """Returns instance from crawler.
    
            Parameters
            ----------
            crawler : scrapy.crawler.Crawler
    
            Returns
            -------
            RFPDupeFilter
                Instance of RFPDupeFilter.
    
            """
            return cls.from_settings(crawler.settings)
    
        def request_seen(self, request):
            """Returns True if request was already seen.
    
            Parameters
            ----------
            request : scrapy.http.Request
    
            Returns
            -------
            bool
    
            """
            fp = self.request_fingerprint(request)
    
            if self.bf.is_exist(fp):
                return True
            else:
                self.bf.add(fp)
                return False
            # This returns the number of values added, zero if already exists.
            # added = self.server.sadd(self.key, fp)
            # return added == 0
    
        def request_fingerprint(self, request):
            """Returns a fingerprint for a given request.
    
            Parameters
            ----------
            request : scrapy.http.Request
    
            Returns
            -------
            str
    
            """
            return request_fingerprint(request)
    
        @classmethod
        def from_spider(cls, spider):
            # Kept from the original RFPDupeFilter above: the scheduler shown earlier
            # instantiates the dupefilter via from_spider(), so this method is still needed.
            settings = spider.settings
            server = get_redis_from_settings(settings)
            dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
            key = dupefilter_key % {'spider': spider.name}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)
    
        def close(self, reason=''):
            """Delete data on close. Called by Scrapy's scheduler.
    
            Parameters
            ----------
            reason : str, optional
    
            """
            self.clear()
    
        def clear(self):
            """Clears fingerprints data."""
            self.server.delete(self.key)
    
        def log(self, request, spider):
            """Logs given request.
    
            Parameters
            ----------
            request : scrapy.http.Request
            spider : scrapy.spiders.Spider
    
            """
            if self.debug:
                msg = "Filtered duplicate request: %(request)s"
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            elif self.logdupes:
                msg = ("Filtered duplicate request %(request)s"
                    " - no more duplicates will be shown"
                    " (see DUPEFILTER_DEBUG to show all duplicates)")
                self.logger.debug(msg, {'request': request}, extra={'spider': spider})
                self.logdupes = False
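    Rather than patching the installed scrapy-redis package in place, the modified class can also live inside your own project; a sketch, assuming it is saved as ScrapyRedisTest/utils/bloom_dupefilter.py with the two relative imports changed to absolute ones (from scrapy_redis import defaults; from scrapy_redis.connection import get_redis_from_settings), is to point the dupefilter settings at it:
    # settings.py - use the bloom-filter-backed dupefilter
    # (the module path below is hypothetical; change it to wherever you saved the class)
    DUPEFILTER_CLASS = 'ScrapyRedisTest.utils.bloom_dupefilter.RFPDupeFilter'
    SCHEDULER_DUPEFILTER_CLASS = 'ScrapyRedisTest.utils.bloom_dupefilter.RFPDupeFilter'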
     
     
     
     