• scrapy的扩展组件(extensions)


    extensions.py文件


    # -*- coding: utf-8 -*-
    # This extension logs a message on the following events:
    #   - a spider is opened
    #   - a spider is closed
    #   - a specific number of items has been scraped
    import logging
    from collections import defaultdict
    from datetime import datetime

    from scrapy import signals
    from scrapy.exceptions import NotConfigured

    logger = logging.getLogger(__name__)


    class SpiderOpenCloseLogging(object):
        """Scrapy extension that logs spider open/close events, item
        throughput, and per-minute error-rate health of responses."""

        def __init__(self, item_count):
            # Log every `item_count` scraped/dropped items.
            self.item_count = item_count
            self.items_scraped = 0
            self.items_dropped = 0
            # Per-minute counters; defaultdict(int) starts every key at 0.
            self.stats = defaultdict(int)      # all responses seen this minute
            self.err_stats = defaultdict(int)  # error responses this minute
            print("=="*20, 'Extension object created 扩展对象被创建')

        @classmethod
        def from_crawler(cls, crawler):
            """Build the extension from the crawler, or raise NotConfigured
            when MYEXT_ENABLED is falsy so Scrapy skips it entirely."""
            if not crawler.settings.getbool('MYEXT_ENABLED'):
                raise NotConfigured

            # How often to log; defaults to every 1000 items unless
            # MYEXT_ITEMCOUNT overrides it in settings.
            item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)
            ext = cls(item_count)

            # Wire each handler to its signal. item_scraped fires only for
            # items that survived every pipeline (were not dropped);
            # item_dropped fires when a pipeline drops one.
            crawler.signals.connect(ext.spider_opened,
                                    signal=signals.spider_opened)
            crawler.signals.connect(ext.spider_closed,
                                    signal=signals.spider_closed)
            crawler.signals.connect(ext.item_scraped,
                                    signal=signals.item_scraped)
            crawler.signals.connect(ext.item_dropped,
                                    signal=signals.item_dropped)
            crawler.signals.connect(ext.response_received,
                                    signal=signals.response_received)
            return ext

        def spider_opened(self, spider):
            # spider.log(...) would also work here; print keeps the demo simple.
            print("opened spider %s" % spider.name)

        def spider_closed(self, spider):
            print("closed spider %s" % spider.name)

        def item_scraped(self, item, spider):
            """Count a successfully scraped item; report every item_count-th."""
            self.items_scraped += 1
            if self.items_scraped % self.item_count == 0:
                print("scraped %d items" % self.items_scraped)

        def item_dropped(self, item, spider, response, exception):
            """Count a dropped item; report every item_count-th."""
            self.items_dropped += 1
            if self.items_dropped % self.item_count == 0:
                print("dropped %d items" % self.items_dropped)

        def response_received(self, response, request, spider):
            """Track crawler health: per-minute totals vs. error statuses.

            When more than 20% of this minute's responses carry an error
            status, emit a warning (warning level sits between info and
            error; in production warnings typically trigger e-mail and
            errors trigger SMS alerts).
            """
            minute_key = datetime.now().strftime('%Y%m%d%H%M')
            self.stats[minute_key] += 1
            if response.status in [401, 403, 404, 500, 501, 502]:
                self.err_stats[minute_key] += 1
                error_ratio = self.err_stats[minute_key] / float(self.stats[minute_key])
                if error_ratio > 0.2:
                    logger.warning(f'received {self.stats[minute_key]} response and {self.err_stats[minute_key]} of them is not 200,{minute_key}')
    settings.py 中的相关配置
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    MYEXT_ENABLED = True  # enable the custom extension (checked in from_crawler)
    MYEXT_ITEMCOUNT = 10  # log once every 10 scraped/dropped items
    EXTENSIONS = {
       # 'scrapy.extensions.telnet.TelnetConsole': None,
       'qianmu.extensions.SpiderOpenCloseLogging': 1,
    }



  • 相关阅读:
    iOS开发拓展篇—音频处理(音乐播放器1)
    iOS开发拓展篇—CoreLocation地理编码
    iOS开发拓展篇—CoreLocation定位服务
    iOS开发拓展篇—CoreLocation简单介绍
    iOS开发拓展篇—封装音频文件播放工具类
    图标框架Font Awesome
    WordPress基础:设置后台语言
    使用帝国备份王备份还原网站数据
    WordPress主题开发:加载脚本和样式
    WordPress主题开发:get_term_by和get_term_link
  • 原文地址:https://www.cnblogs.com/kenD/p/12248037.html
Copyright © 2020-2023  润新知