源码
class HttpErrorMiddleware(object):
    """Spider middleware that filters out unsuccessful (non-2xx) HTTP responses.

    A response whose status is outside 200-299 is dropped by raising
    ``HttpError`` unless it is explicitly allowed via:
      * the ``handle_httpstatus_all`` request meta key (presence of the key,
        regardless of value, allows everything for that request),
      * the ``handle_httpstatus_list`` request meta key,
      * the global ``HTTPERROR_ALLOW_ALL`` setting,
      * a ``handle_httpstatus_list`` attribute on the spider, or
      * the global ``HTTPERROR_ALLOWED_CODES`` setting.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy factory hook: build the middleware from settings.
        return cls(crawler.settings)

    def __init__(self, settings):
        # Global configuration: allow every status code, or only a whitelist.
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        """Let allowed responses pass; raise HttpError for the rest.

        Returns None (pass-through) when the response is allowed; raises
        ``HttpError`` otherwise, which ``process_spider_exception`` later
        turns into stats + a log line.
        """
        if 200 <= response.status < 300:  # common case: successful response
            return
        meta = response.meta
        # Per-request override: the *presence* of the key is what matters,
        # not its value — meta={'handle_httpstatus_all': False} still allows.
        if 'handle_httpstatus_all' in meta:
            return
        if 'handle_httpstatus_list' in meta:
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            # Spider attribute takes precedence over the global setting.
            allowed_statuses = getattr(spider, 'handle_httpstatus_list',
                                       self.handle_httpstatus_list)
        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        """Swallow HttpError raised above: record stats, log, drop response.

        Returns an empty list (meaning "no output, exception handled") for
        ``HttpError``; returns None for any other exception so other
        middlewares / the framework can deal with it.
        """
        if not isinstance(exception, HttpError):
            return None
        stats = spider.crawler.stats
        stats.inc_value('httperror/response_ignored_count')
        stats.inc_value(
            'httperror/response_ignored_status_count/%s' % response.status
        )
        logger.info(
            "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
            {'response': response},
            extra={'spider': spider},
        )
        return []
通过源码的 `__init__` 函数可以看到，有两个可用的配置项：
HTTPERROR_ALLOW_ALL = True
HTTPERROR_ALLOWED_CODES = [301, 404]
第一个配置表示是否允许所有状态码：收到响应后，不管是什么状态码都交给爬虫处理
第二个是允许的列表
以上是全局配置，不推荐使用。
如果想在每个爬虫里面进行配置
可以在单独的爬虫里面设置
handle_httpstatus_list = [404, 302]（注意：从源码可以看到，handle_httpstatus_all 并不会从爬虫属性读取，它只能通过 Request.meta 传入，例如 meta={'handle_httpstatus_all': True}）
handle_httpstatus_list = [404,302]
除非你非常熟悉你的网站和 Scrapy，否则不建议使用这些配置，因为把错误的响应也返回给爬虫并没有什么用。