• Scrapy middleware


    1 Downloader Middleware

    Uses of downloader middleware
        1. Do the download yourself inside process_request instead of using Scrapy's downloader
        2. Post-process the request (see the sketch after the class skeleton below), for example:
            set request headers
            set cookies
            add a proxy
                Scrapy's built-in proxy component:
                    from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
                    from urllib.request import getproxies
    class DownMiddleware1(object):
        def process_request(self, request, spider):
            """
            Called, via every downloader middleware's process_request, when a request is about to be downloaded
            :param request:
            :param spider:
            :return:
                None: continue with the remaining middlewares and download as usual
                Response object: stop executing process_request and start executing process_response
                Request object: stop the middleware chain and hand the Request back to the scheduler
                raise IgnoreRequest: stop executing process_request and start executing process_exception
            """
            pass


        def process_response(self, request, response, spider):
            """
            Called with the response returned by the downloader, on its way back to the engine
            :param request:
            :param response:
            :param spider:
            :return:
                Response object: handed on to the other middlewares' process_response
                Request object: stop the middleware chain, the request is rescheduled for download
                raise IgnoreRequest: Request.errback is called
            """
            print('response1')
            return response

        def process_exception(self, request, exception, spider):
            """
            Called when the download handler or a process_request() (downloader middleware) raises an exception
            :param request:
            :param exception:
            :param spider:
            :return:
                None: keep passing the exception to the remaining middlewares
                Response object: stop the remaining process_exception methods
                Request object: stop the middleware chain, the request will be rescheduled for download
            """
            return None
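
    As a minimal sketch of point 2 above (request headers, cookies, proxy), a process_request could look like the following; the middleware name, header value, cookie and PROXY_URL are illustrative assumptions, not part of the original project:

    class HeaderCookieProxyMiddleware(object):
        # Hypothetical middleware; it only takes effect once enabled in DOWNLOADER_MIDDLEWARES
        PROXY_URL = "http://127.0.0.1:8888"  # assumed local proxy, replace with a real one

        def process_request(self, request, spider):
            # 1. set a request header
            request.headers.setdefault('User-Agent', 'Mozilla/5.0')
            # 2. set a cookie (request.cookies defaults to a plain dict)
            request.cookies.setdefault('sessionid', 'xxx')
            # 3. add a proxy via request.meta, the same key Scrapy's HttpProxyMiddleware uses
            request.meta['proxy'] = self.PROXY_URL
            # returning None lets the request continue through the remaining middlewares
            return None

    The proxy example below applies the same idea, but pulls the proxy from a local proxy pool and swaps it out when a download fails.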
    # 1. Create proxy_handle.py in the same directory as middlewares.py
    import requests
    
    def get_proxy():
        return requests.get("http://127.0.0.1:5010/get/").text
    
    def delete_proxy(proxy):
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
        
        
    
    # 2. middlewares.py
    from Amazon.proxy_handle import get_proxy,delete_proxy
    
    class DownMiddleware1(object):
        def process_request(self, request, spider):
            """
            请求需要被下载时,经过所有下载器中间件的process_request调用
            :param request:
            :param spider:
            :return:
                None,继续后续中间件去下载;
                Response对象,停止process_request的执行,开始执行process_response
                Request对象,停止中间件的执行,将Request重新调度器
                raise IgnoreRequest异常,停止process_request的执行,开始执行process_exception
            """
            proxy="http://" + get_proxy()
            request.meta['download_timeout']=20
            request.meta["proxy"] = proxy
            print('为%s 添加代理%s ' % (request.url, proxy),end='')
            print('元数据为',request.meta)
    
        def process_response(self, request, response, spider):
            """
            spider处理完成,返回时调用
            :param response:
            :param result:
            :param spider:
            :return:
                Response 对象:转交给其他中间件process_response
                Request 对象:停止中间件,request会被重新调度下载
                raise IgnoreRequest 异常:调用Request.errback
            """
            print('返回状态吗',response.status)
            return response
    
    
        def process_exception(self, request, exception, spider):
            """
            当下载处理器(download handler)或 process_request() (下载中间件)抛出异常
            :param response:
            :param exception:
            :param spider:
            :return:
                None:继续交给后续中间件处理异常;
                Response对象:停止后续process_exception方法
                Request对象:停止中间件,request将会被重新调用下载
            """
            print('代理%s,访问%s出现异常:%s' %(request.meta['proxy'],request.url,exception))
            import time
            time.sleep(5)
            delete_proxy(request.meta['proxy'].split("//")[-1])
            request.meta['proxy']='http://'+get_proxy()
    
            return request
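
    For the proxy middleware above to run, it also has to be enabled in settings.py; a sketch, assuming the class lives in Amazon/middlewares.py (543 is just Scrapy's default placeholder priority):

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
       'Amazon.middlewares.DownMiddleware1': 543,
    }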

    2 Spider Middleware

    1. Introduction to the spider middleware methods

    from scrapy import signals
    
    class SpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) # spider_opened is triggered when the current spider starts
            return s
    
        def spider_opened(self, spider):
            # spider.logger.info('I am spider 1 sent by egon: %s' % spider.name)
            print('I am spider 1 sent by egon: %s' % spider.name)
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            print('start_requests1')
            for r in start_requests:
                yield r
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.

            # Return value: should return None or raise an exception.
            # 1. None: continue executing the other middlewares' process_spider_input
            # 2. Raise an exception:
            #    the remaining process_spider_input methods are no longer executed,
            #    and the errback bound to the request is triggered;
            #    the errback's return value is passed backwards through the middlewares' process_spider_output;
            #    if no errback is found, the middlewares' process_spider_exception methods run backwards instead
    
            print("input1")
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            print('output1')
    
            # Yielding multiple times is equivalent to returning the iterable once.
            # If you are not fully comfortable with generators (a function containing yield returns a
            # generator object when called and does not run its body immediately), the generator form
            # can easily mislead you about the execution order of the middlewares.
            # for i in result:
            #     yield i
            return result
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            print('exception1')
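
    As a small illustration of the "must return an iterable of Request, dict or Item objects" rule, the sketch below is a spider middleware whose process_spider_output drops any dict item missing a hypothetical 'url' field and passes everything else through unchanged; the class name and the field are assumptions, not part of the original example:

    class DropIncompleteItemsMiddleware(object):
        # Hypothetical spider middleware that filters the spider's output
        def process_spider_output(self, response, result, spider):
            for obj in result:
                # drop dict items without a 'url' key; requests and other objects pass through
                if isinstance(obj, dict) and not obj.get('url'):
                    continue
                yield obj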

    2. When the spider starts and the initial requests are produced

    # Step 1:
    '''
    Uncomment in settings.py:
    SPIDER_MIDDLEWARES = {
       'Baidu.middlewares.SpiderMiddleware1': 200,
       'Baidu.middlewares.SpiderMiddleware2': 300,
       'Baidu.middlewares.SpiderMiddleware3': 400,
    }
    
    '''
    
    
    # Step 2: middlewares.py
    from scrapy import signals
    
    class SpiderMiddleware1(object):
        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) # spider_opened is triggered when the current spider starts
            return s
    
        def spider_opened(self, spider):
            print('I am spider 1 sent by egon: %s' % spider.name)
    
        def process_start_requests(self, start_requests, spider):
            # Must return only requests (not items).
            print('start_requests1')
            for r in start_requests:
                yield r
    
    
            
            
    class SpiderMiddleware2(object):
        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)  # spider_opened is triggered when the current spider starts
            return s
    
        def spider_opened(self, spider):
            print('I am spider 2 sent by egon: %s' % spider.name)
    
        def process_start_requests(self, start_requests, spider):
            print('start_requests2')
            for r in start_requests:
                yield r
    
    
    class SpiderMiddleware3(object):
        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)  # spider_opened is triggered when the current spider starts
            return s
    
        def spider_opened(self, spider):
            print('I am spider 3 sent by egon: %s' % spider.name)
    
        def process_start_requests(self, start_requests, spider):
            print('start_requests3')
            for r in start_requests:
                yield r
    
    
    # Step 3: analyse the run results
    # 1. Executed immediately when the spider starts:
    
    I am spider 1 sent by egon: baidu
    I am spider 2 sent by egon: baidu
    I am spider 3 sent by egon: baidu
    
    
    # 2. Then one initial request is produced and passes through spider middlewares 1, 2, 3 in turn:
    start_requests1
    start_requests2
    start_requests3

    3. When process_spider_input returns None

    # Step 1:
    '''
    Uncomment in settings.py:
    SPIDER_MIDDLEWARES = {
       'Baidu.middlewares.SpiderMiddleware1': 200,
       'Baidu.middlewares.SpiderMiddleware2': 300,
       'Baidu.middlewares.SpiderMiddleware3': 400,
    }
    
    '''
    
    # Step 2: middlewares.py
    from scrapy import signals
    
    class SpiderMiddleware1(object):
    
        def process_spider_input(self, response, spider):
            print("input1")
    
        def process_spider_output(self, response, result, spider):
            print('output1')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception1')
    
    
    class SpiderMiddleware2(object):
    
        def process_spider_input(self, response, spider):
            print("input2")
            return None
    
        def process_spider_output(self, response, result, spider):
            print('output2')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception2')
    
    
    class SpiderMiddleware3(object):
    
        def process_spider_input(self, response, spider):
            print("input3")
            return None
    
        def process_spider_output(self, response, result, spider):
            print('output3')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception3')
    
    
    # Step 3: analyse the run results
    
    # 1. When the response comes back, it passes through spider middlewares 1, 2, 3 in turn
    input1
    input2
    input3
    
    # 2. After the spider has processed it, the output passes through spider middlewares 3, 2, 1
    output3
    output2
    output1

    4. When process_spider_input raises an exception

    # Step 1:
    '''
    Uncomment in settings.py:
    SPIDER_MIDDLEWARES = {
       'Baidu.middlewares.SpiderMiddleware1': 200,
       'Baidu.middlewares.SpiderMiddleware2': 300,
       'Baidu.middlewares.SpiderMiddleware3': 400,
    }
    
    '''
    
    # Step 2: middlewares.py
    
    from scrapy import signals
    
    class SpiderMiddleware1(object):
    
        def process_spider_input(self, response, spider):
            print("input1")
    
        def process_spider_output(self, response, result, spider):
            print('output1')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception1')
    
    
    class SpiderMiddleware2(object):
    
        def process_spider_input(self, response, spider):
            print("input2")
            raise TypeError('input2 raised an exception')
    
        def process_spider_output(self, response, result, spider):
            print('output2')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception2')
    
    
    class SpiderMiddleware3(object):
    
        def process_spider_input(self, response, spider):
            print("input3")
            return None
    
        def process_spider_output(self, response, result, spider):
            print('output3')
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception3')
    
            
    
    # Run result
    input1
    input2
    exception3
    exception2
    exception1
    
    # Analysis:
    # 1. The response goes through middleware 1's process_spider_input, which returns None, so it continues to middleware 2's process_spider_input
    # 2. Middleware 2's process_spider_input raises an exception, so the remaining process_spider_input methods are skipped and the exception is handed to the errback bound to that request in the Spider
    # 3. No errback is found, so the response is handled neither by the Spider's normal callback nor by an errback; the Spider has effectively done nothing, and the process_spider_exception methods then run in reverse order
    # 4. If a process_spider_exception returns None, it declines to handle the exception and passes it on to the next process_spider_exception; if they all return None, the exception is finally raised by the Engine

    5. Specifying an errback

    # Step 1: spider.py
    import scrapy
    
    
    class BaiduSpider(scrapy.Spider):
        name = 'baidu'
        allowed_domains = ['www.baidu.com']
        start_urls = ['http://www.baidu.com/']
    
    
        def start_requests(self):
            yield scrapy.Request(url='http://www.baidu.com/',
                                 callback=self.parse,
                                 errback=self.parse_err,
                                 )
    
        def parse(self, response):
            pass
    
        def parse_err(self, res):
            # res is the failure object; the exception has already been handled here, so it is not
            # propagated any further, and the process_spider_output chain starts running
            return [1, 2, 3, 4, 5]  # useful data extracted from the failure, returned as an iterable, waiting to be picked up by process_spider_output
    
    
    
    # Step 2:
    '''
    Uncomment in settings.py:
    SPIDER_MIDDLEWARES = {
       'Baidu.middlewares.SpiderMiddleware1': 200,
       'Baidu.middlewares.SpiderMiddleware2': 300,
       'Baidu.middlewares.SpiderMiddleware3': 400,
    }
    
    '''
    
    # Step 3: middlewares.py
    
    from scrapy import signals
    
    class SpiderMiddleware1(object):
    
        def process_spider_input(self, response, spider):
            print("input1")
    
        def process_spider_output(self, response, result, spider):
            print('output1',list(result))
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception1')
    
    
    class SpiderMiddleware2(object):
    
        def process_spider_input(self, response, spider):
            print("input2")
            raise TypeError('input2 raised an exception')
    
        def process_spider_output(self, response, result, spider):
            print('output2',list(result))
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception2')
    
    
    class SpiderMiddleware3(object):
    
        def process_spider_input(self, response, spider):
            print("input3")
            return None
    
        def process_spider_output(self, response, result, spider):
            print('output3',list(result))
            return result
    
        def process_spider_exception(self, response, exception, spider):
            print('exception3')
    
    
    
    # Step 4: analyse the run results
    input1
    input2
    output3 [1, 2, 3, 4, 5]  # parse_err's return value is placed in the result pipeline and can only be consumed once; inside output3 you could build a new request from the failure information
    output2 []
    output1 []
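
    The comment on output3 hints at a common pattern: res in parse_err is a twisted Failure (its .value attribute holds the exception), and a downstream process_spider_output can turn an empty errback result into a retry instead of giving up. A rough sketch, assuming a hypothetical retry counter kept in request.meta; this is not part of the original example:

    class RetryOnInputErrorMiddleware(object):
        # Hypothetical spider middleware: if the errback produced no output,
        # reschedule the original request up to MAX_RETRIES times
        MAX_RETRIES = 3

        def process_spider_output(self, response, result, spider):
            retries = response.request.meta.get('input_retries', 0)
            produced_output = False
            for obj in result:
                produced_output = True
                yield obj
            if not produced_output and retries < self.MAX_RETRIES:
                request = response.request.copy()
                request.meta['input_retries'] = retries + 1
                request.dont_filter = True  # bypass the duplicate filter for the retry
                yield request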
  • Original post: https://www.cnblogs.com/lujiacheng-Python/p/10162645.html