• Writing a Scrapy middleware: an IP proxy pool


    The middleware file (middlewares.py)


    # -*- coding: utf-8 -*-

    # Define here the models for your spider middleware
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

    import random

    from scrapy import signals


    class TutorialDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def process_request(self, request, spider):
            # Called for each request that goes through the downloader middleware.
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None

        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)


    # A custom downloader middleware: IP proxy pool
    from collections import defaultdict

    from scrapy.exceptions import NotConfigured
    from twisted.internet.error import ConnectionRefusedError, TimeoutError


    class RandomProxyMiddleware(object):

        def __init__(self, settings):
            # Step 3: initialize configuration and variables.
            # Add a PROXIES list to settings and read the proxies in from there.
            self.proxies = settings.getlist("PROXIES")
            self.stats = defaultdict(int)  # per-proxy failure counter, defaults to 0
            self.max_failed = 3            # a proxy may fail at most 3 times

        @classmethod
        def from_crawler(cls, crawler):
            # Step 1: create the middleware object.
            # First read HTTPPROXY_ENABLED to check whether proxying is enabled.
            if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
                # Proxying is disabled, so this middleware is not needed.
                raise NotConfigured
            # auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")  # not used for now
            # Step 2: cls() ends up calling __init__(), which takes the settings,
            # so pass them along here.
            return cls(crawler.settings)

        def process_request(self, request, spider):
            # Step 4: assign a random proxy to every request.
            # The start URLs are downloaded without a proxy.
            if (self.proxies and not request.meta.get("proxy")
                    and request.url not in spider.start_urls):
                request.meta["proxy"] = random.choice(self.proxies)

        def process_response(self, request, response, spider):
            # Step 5: the download succeeded.
            cur_proxy = request.meta.get('proxy')
            # Check whether the remote site blocked us.
            if response.status > 400:
                # Add 1 to this proxy's failure count.
                self.stats[cur_proxy] += 1
                print("proxy {} returned an error status code for the {}th time".format(
                    cur_proxy, self.stats[cur_proxy]))
                # Once a proxy has accumulated enough failures ...
                if self.stats[cur_proxy] >= self.max_failed:
                    # ... assume it has been banned and drop it from the pool.
                    print("status code {}: proxy {} is probably banned".format(
                        response.status, cur_proxy))
                    self.remove_proxy(cur_proxy)
                    del request.meta['proxy']
                    # Hand the request back to the scheduler to be downloaded again.
                    return request
            # Return the response unchanged when the status code is normal.
            return response

        def process_exception(self, request, exception, spider):
            # Step 5: the download failed.
            cur_proxy = request.meta.get('proxy')  # proxy used by this request
            # If this request used a proxy and the network call raised an error,
            # assume the proxy itself is broken.
            if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
                print("got {} while using proxy {}".format(exception, cur_proxy))
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                # Re-download this request.
                return request

        def remove_proxy(self, proxy):
            if proxy in self.proxies:
                self.proxies.remove(proxy)
                print("removed {} from the proxy list".format(proxy))
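    To see the middleware in action, a minimal spider is enough: the request for each start URL goes out without a proxy, while every follow-up request gets a random proxy attached in process_request(). The spider below is only a hypothetical sketch (the spider name and the example.com URL are placeholders, not part of the original project):

    # spiders/demo.py -- hypothetical spider, only for exercising RandomProxyMiddleware
    import scrapy

    class DemoSpider(scrapy.Spider):
        name = "demo"
        start_urls = ["https://example.com/"]  # fetched without a proxy

        def parse(self, response):
            # Follow-up requests are not in start_urls, so process_request()
            # assigns each of them a random proxy from the PROXIES pool.
            for href in response.css("a::attr(href)").getall():
                yield response.follow(href, callback=self.parse)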
    The settings file (settings.py)
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       'tutorial.middlewares.RandomProxyMiddleware': 749,  # priority 749: runs just before the built-in HttpProxyMiddleware (750)
    }
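    RandomProxyMiddleware also expects two more settings: it raises NotConfigured and stays disabled unless HTTPPROXY_ENABLED is true, and it reads its pool via settings.getlist("PROXIES"). A minimal sketch of those entries (the proxy addresses below are placeholders, not real servers):

    # Keep the proxy machinery enabled; RandomProxyMiddleware disables itself otherwise
    HTTPPROXY_ENABLED = True

    # Proxy pool read by settings.getlist("PROXIES") -- placeholder addresses
    PROXIES = [
        'http://10.10.1.1:8080',
        'http://10.10.1.2:8080',
        'http://user:password@10.10.1.3:8080',
    ]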
  • Original post: https://www.cnblogs.com/kenD/p/12243717.html