• scrapy-redis: making redis store more than just URLs


    First, let's look at the scrapy-redis source:

      class RedisMixin(object):
          """Mixin class to implement reading urls from a redis queue."""
          redis_key = None
          redis_batch_size = None
          redis_encoding = None

          # Redis client placeholder.
          server = None

          def start_requests(self):
              """Returns a batch of start requests from redis."""
              return self.next_requests()

          def setup_redis(self, crawler=None):
              """Setup redis connection and idle signal.

              This should be called after the spider has set its crawler object.
              """
              if self.server is not None:
                  return

              if crawler is None:
                  # We allow optional crawler argument to keep backwards
                  # compatibility.
                  # XXX: Raise a deprecation warning.
                  crawler = getattr(self, 'crawler', None)

              if crawler is None:
                  raise ValueError("crawler is required")

              settings = crawler.settings

              if self.redis_key is None:
                  self.redis_key = settings.get(
                      'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
                  )

              self.redis_key = self.redis_key % {'name': self.name}

              if not self.redis_key.strip():
                  raise ValueError("redis_key must not be empty")

              if self.redis_batch_size is None:
                  # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
                  self.redis_batch_size = settings.getint(
                      'REDIS_START_URLS_BATCH_SIZE',
                      settings.getint('CONCURRENT_REQUESTS'),
                  )

              try:
                  self.redis_batch_size = int(self.redis_batch_size)
              except (TypeError, ValueError):
                  raise ValueError("redis_batch_size must be an integer")

              if self.redis_encoding is None:
                  self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

              self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                               "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                               self.__dict__)

              self.server = connection.from_settings(crawler.settings)
              # The idle signal is called when the spider has no requests left,
              # that's when we will schedule new requests from redis queue
              crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

          def next_requests(self):
              """Returns a request to be scheduled or none."""
              use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
              fetch_one = self.server.spop if use_set else self.server.lpop
              # XXX: Do we need to use a timeout here?
              found = 0
              # TODO: Use redis pipeline execution.
              while found < self.redis_batch_size:
                  data = fetch_one(self.redis_key)
                  if not data:
                      # Queue empty.
                      break
                  req = self.make_request_from_data(data)
                  if req:
                      yield req
                      found += 1
                  else:
                      self.logger.debug("Request not made from data: %r", data)

              if found:
                  self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

          def make_request_from_data(self, data):
              """Returns a Request instance from data coming from Redis.

              By default, ``data`` is an encoded URL. You can override this method to
              provide your own message decoding.

              Parameters
              ----------
              data : bytes
                  Message from redis.

              """
              url = bytes_to_str(data, self.redis_encoding)
              return self.make_requests_from_url(url)

          def schedule_next_requests(self):
              """Schedules a request if available"""
              # TODO: While there is capacity, schedule a batch of redis requests.
              for req in self.next_requests():
                  self.crawler.engine.crawl(req, spider=self)

          def spider_idle(self):
              """Schedules a request if available, otherwise waits."""
              # XXX: Handle a sentinel to close the spider.
              self.schedule_next_requests()
              raise DontCloseSpider


      class RedisSpider(RedisMixin, Spider):
          """Spider that reads urls from redis queue when idle.

          Attributes
          ----------
          redis_key : str (default: REDIS_START_URLS_KEY)
              Redis key where to fetch start URLs from..
          redis_batch_size : int (default: CONCURRENT_REQUESTS)
              Number of messages to fetch from redis on each attempt.
          redis_encoding : str (default: REDIS_ENCODING)
              Encoding to use when decoding messages from redis queue.

          Settings
          --------
          REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
              Default Redis key where to fetch start URLs from..
          REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
              Default number of messages to fetch from redis on each attempt.
          REDIS_START_URLS_AS_SET : bool (default: False)
              Use SET operations to retrieve messages from the redis queue. If False,
              the messages are retrieve using the LPOP command.
          REDIS_ENCODING : str (default: "utf-8")
              Default encoding to use when decoding messages from redis queue.

          """

          @classmethod
          def from_crawler(self, crawler, *args, **kwargs):
              obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
              obj.setup_redis(crawler)
              return obj


      class RedisCrawlSpider(RedisMixin, CrawlSpider):
          """Spider that reads urls from redis queue when idle.

          Attributes
          ----------
          redis_key : str (default: REDIS_START_URLS_KEY)
              Redis key where to fetch start URLs from..
          redis_batch_size : int (default: CONCURRENT_REQUESTS)
              Number of messages to fetch from redis on each attempt.
          redis_encoding : str (default: REDIS_ENCODING)
              Encoding to use when decoding messages from redis queue.

          Settings
          --------
          REDIS_START_URLS_KEY : str (default: "<spider.name>:start_urls")
              Default Redis key where to fetch start URLs from..
          REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
              Default number of messages to fetch from redis on each attempt.
          REDIS_START_URLS_AS_SET : bool (default: True)
              Use SET operations to retrieve messages from the redis queue.
          REDIS_ENCODING : str (default: "utf-8")
              Default encoding to use when decoding messages from redis queue.

          """

          @classmethod
          def from_crawler(self, crawler, *args, **kwargs):
              obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
              obj.setup_redis(crawler)
              return obj
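
    For reference: with the defaults documented in the docstrings above (key "<spider.name>:start_urls", a redis list consumed with LPOP), feeding the spider only requires pushing plain URL strings. A minimal sketch using redis-py, with the spider name "myspider" assumed purely for illustration:

      import redis

      r = redis.StrictRedis(host="localhost", port=6379, db=0)
      # Default behaviour: each list element is a bare URL string, which
      # make_request_from_data decodes and hands to make_requests_from_url.
      r.lpush("myspider:start_urls", "https://example.com/page/1")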

    If you read through this carefully, you will notice that

    make_request_from_data(self, data)

    is the method that turns a message popped from redis into a Request instance; by default that message is just an encoded URL. So we override this method so that what gets passed through to

    self.make_requests_from_url

    is a JSON string instead. Inside that method we can then parse the string to get the request URL (or generate URLs from it). The code is as follows:
      import json

      from scrapy import Request
      from scrapy_redis.utils import bytes_to_str


      def make_request_from_data(self, data):
          """
          :param data: bytes, message from redis (here a JSON string rather than a bare URL)
          """
          company = bytes_to_str(data, self.redis_encoding)
          return self.make_requests_from_url(company)

      def make_requests_from_url(self, company):
          # Parse the JSON payload; json.loads is safer than eval() on data
          # coming from an external queue.
          data = json.loads(company)
          url = data["url"]
          headers = {
              "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
              "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
          }
          # Carry the whole payload along in meta so the parse callback can use it.
          return Request(url, self.parse, meta={"data": data}, dont_filter=True, headers=headers)
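
    With this override in place, whatever producer fills the queue pushes a JSON string instead of a bare URL. A minimal sketch, again using redis-py; the spider name and the payload fields other than "url" are illustrative assumptions:

      import json

      import redis

      r = redis.StrictRedis(host="localhost", port=6379, db=0)
      # make_requests_from_url above only requires the "url" field; everything
      # else in the payload travels along in meta["data"].
      payload = {"url": "https://example.com/company/42", "company_name": "Acme"}
      r.lpush("myspider:start_urls", json.dumps(payload))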

    One thing worth noting:

    You cannot construct the Request directly inside make_request_from_data (third-party Request classes are not supported there either); if you try, the method silently fails to run and no exception is raised.
    Overriding both make_request_from_data and make_requests_from_url together, as above, does work. A complete sketch follows.
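
    Putting it together, a minimal RedisSpider subclass might look like the sketch below; the spider name, payload fields, and parse logic are illustrative only:

      import json

      from scrapy import Request
      from scrapy_redis.spiders import RedisSpider
      from scrapy_redis.utils import bytes_to_str


      class CompanySpider(RedisSpider):
          name = "myspider"
          redis_key = "myspider:start_urls"  # falls back to "<name>:start_urls" if omitted

          def make_request_from_data(self, data):
              # Decode only; leave Request construction to make_requests_from_url.
              return self.make_requests_from_url(bytes_to_str(data, self.redis_encoding))

          def make_requests_from_url(self, company):
              data = json.loads(company)
              return Request(data["url"], self.parse, meta={"data": data}, dont_filter=True)

          def parse(self, response):
              # The original JSON payload travels with the response in meta.
              payload = response.meta["data"]
              self.logger.info("Crawled %s for payload %r", response.url, payload)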
• Original article: https://www.cnblogs.com/ltn26/p/10120444.html