• python


    1.  使用内置,并加以修改 ( 自定义 redis 存储的 keys )

    settings 配置
    # ############### scrapy redis连接 ####################
    
    REDIS_HOST = '140.143.227.206'                            # 主机名
    REDIS_PORT = 8888                                   # 端口
    REDIS_PARAMS  = {'password':'beta'}                                  # Redis连接参数             默认:REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,})
    REDIS_ENCODING = "utf-8"                            # redis编码类型             默认:'utf-8'
    
    # REDIS_URL = 'redis://user:pass@hostname:9001'       # 连接URL(优先于以上配置)
    DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
    
    # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'
    from scrapy_redis.dupefilter import RFPDupeFilter
    from scrapy_redis.connection import get_redis_from_settings
    from scrapy_redis import defaults
    
    class RedisDupeFilter(RFPDupeFilter):
        @classmethod
        def from_settings(cls, settings):
            """Returns an instance from given settings.
    
            This uses by default the key ``dupefilter:<timestamp>``. When using the
            ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
            it needs to pass the spider name in the key.
    
            Parameters
            ----------
            settings : scrapy.settings.Settings
    
            Returns
            -------
            RFPDupeFilter
                A RFPDupeFilter instance.
    
    
            """
            server = get_redis_from_settings(settings)
            # XXX: This creates one-time key. needed to support to use this
            # class as standalone dupefilter with scrapy's default scheduler
            # if scrapy passes spider on open() method this wouldn't be needed
            # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
            key = defaults.DUPEFILTER_KEY % {'timestamp': '这里写 keys'}
            debug = settings.getbool('DUPEFILTER_DEBUG')
            return cls(server, key=key, debug=debug)

    2. 自定义

    from scrapy.dupefilter import BaseDupeFilter
    import redis
    from scrapy.utils.request import request_fingerprint
    import scrapy_redis
    
    
    class DupFilter(BaseDupeFilter):
        def __init__(self):
            self.conn = redis.Redis(host='140.143.227.206',port=8888,password='beta')
    
        def request_seen(self, request):
            """
            检测当前请求是否已经被访问过
            :param request: 
            :return: True表示已经访问过;False表示未访问过
            """
            fid = request_fingerprint(request)
            result = self.conn.sadd('visited_urls', fid)
            if result == 1:
                return False
            return True
  • 相关阅读:
    debug
    whlie and for
    while and for 2
    用鸿蒙开发AI应用(七)触摸屏控制LED
    animation动画组件在鸿蒙中的应用&鸿蒙的定时函数和js的动画效果
    2020技术征文大赛获奖名单公示
    HarmonyOS三方件开发指南(8)——RoundedImage
    从微信小程序到鸿蒙js开发【02】——数据绑定&tabBar&swiper
    从微信小程序到鸿蒙js开发【01】——环境搭建&flex布局
    HarmonyOS三方件开发指南(7)——compress组件
  • 原文地址:https://www.cnblogs.com/chaoqi/p/10551676.html
Copyright © 2020-2023  润新知