• Python: Scrapy downloader middleware


    Key points

    Usage:
           Writing a Downloader Middleware is just like writing a pipeline: define a class, then enable it in settings (a minimal skeleton is sketched below).

        Default methods of a Downloader Middleware:
          process_request(self, request, spider):
            Called for every request as it passes through the downloader middleware.
        process_response(self, request, response, spider):
            Called when the downloader has finished the HTTP request and is handing the response back to the engine.
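
    A minimal sketch of such a class; the class name MyDownloaderMiddleware and the module path are illustrative assumptions, not part of the zjh project:

    # middlewares.py -- minimal downloader middleware skeleton (illustrative only)
    class MyDownloaderMiddleware:
        def process_request(self, request, spider):
            # Returning None (here, implicitly) lets the request continue to the downloader
            spider.logger.debug("request passing through: %s", request.url)

        def process_response(self, request, response, spider):
            # Must return the response (or a new Request) so processing continues
            return response

    # settings.py -- enable it; the number is its position in the middleware chain
    DOWNLOADER_MIDDLEWARES = {
        "myproject.middlewares.MyDownloaderMiddleware": 543,
    }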

    1. Official documentation

    https://docs.scrapy.org/

    2. settings.py: the USER_AGENTS pool

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for zjh project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'zjh'
    
    SPIDER_MODULES = ['zjh.spiders']
    NEWSPIDER_MODULE = 'zjh.spiders'
    
    LOG_LEVEL = "WARNING"
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'zjh (+http://www.yourdomain.com)'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'zjh.middlewares.ZjhSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    # Enable the downloader middlewares
    DOWNLOADER_MIDDLEWARES = {
        'zjh.middlewares.RandomUserAgentMiddleware': 543,
        'zjh.middlewares.CheckUserAgent': 544,
    }
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    #ITEM_PIPELINES = {
    #    'zjh.pipelines.ZjhPipeline': 300,
    #}
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
    # Pool of User-Agent strings used by the random User-Agent middleware
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
    ]
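
    One way to confirm the pool is actually used, shown here as an assumption rather than part of the original project, is a throwaway spider that requests https://httpbin.org/user-agent, which echoes back the User-Agent header it received:

    # spiders/ua_check.py -- hypothetical spider used only to verify the middleware
    import scrapy

    class UaCheckSpider(scrapy.Spider):
        name = "ua_check"
        start_urls = ["https://httpbin.org/user-agent"]

        def parse(self, response):
            # With LOG_LEVEL = "WARNING", warning-level output is still visible
            self.logger.warning(response.text)

    Running `scrapy crawl ua_check` a few times should print different strings from USER_AGENTS.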

    3. middlewares.py: picking a User-Agent from the pool

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    
    import random

    class RandomUserAgentMiddleware:
        """
        Downloader middlewares are typically used for anti-bot measures:
        rotating proxy IPs, custom USER_AGENTS, and so on.
        """
        def process_request(self, request, spider):
            # Pick a random User-Agent from the USER_AGENTS pool in settings.py
            ua = random.choice(spider.settings.get("USER_AGENTS"))
            request.headers["User-Agent"] = ua

    class CheckUserAgent:
        def process_response(self, request, response, spider):
            # Debug output: inspect the request object and the User-Agent actually sent
            print(dir(response.request))
            print(request.headers["User-Agent"])
            # The return is required: it hands the response back through the engine to the spider
            return response
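
    process_response does not have to return the same response: returning a Request instead makes the engine re-schedule that request, and nothing reaches the spider. A hedged sketch of that pattern, using a hypothetical RetryBadStatusMiddleware that is not part of the original project (it would also need its own entry in DOWNLOADER_MIDDLEWARES):

    class RetryBadStatusMiddleware:
        def process_response(self, request, response, spider):
            # Re-queue requests the site rejected; dont_filter=True bypasses the duplicate filter
            if response.status in (403, 429):
                return request.replace(dont_filter=True)
            return response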

    4. Further reference

      a) User-Agent rotation (implemented by RandomUserAgentMiddleware above)

      

      b) Proxy IPs (a brief sketch follows)
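
      The original post left this item as an external reference. As a sketch only: a downloader middleware can route a request through a proxy by setting request.meta['proxy'] in process_request; the PROXIES setting and the RandomProxyMiddleware class below are assumptions, not part of the zjh project.

      import random

      class RandomProxyMiddleware:
          def process_request(self, request, spider):
              # PROXIES would be a list in settings.py, e.g. ["http://1.2.3.4:8080", ...]
              proxies = spider.settings.get("PROXIES")
              if proxies:
                  # Scrapy's built-in HttpProxyMiddleware honours the 'proxy' key in meta
                  request.meta["proxy"] = random.choice(proxies)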

      

  • Original post: https://www.cnblogs.com/ywjfx/p/11088960.html