• Scrapy Middleware


    I. Downloader Middleware

    1. Use cases

    Proxies

    USER_AGENT (just set it in the settings file)
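
    For proxies, a downloader middleware can set request.meta['proxy'] on each request. The sketch below is a minimal illustration only; the pool contents and the RandomProxyMiddleware name are made up:

    import random
    
    # Hypothetical proxy endpoints; replace with real ones
    PROXY_POOL = [
        'http://127.0.0.1:8888',
        'http://127.0.0.1:8889',
    ]
    
    
    class RandomProxyMiddleware(object):
    
        def process_request(self, request, spider):
            # Scrapy routes the download through the proxy
            # given in the 'proxy' meta key
            request.meta['proxy'] = random.choice(PROXY_POOL)
            return None

    USER_AGENT, by contrast, needs no middleware at all; one line in settings.py is enough:

    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'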

    2. Defining the classes

    a. process_request returns None

    Execution order:

    md1 request -> md2 request -> md2 response -> md1 response

    class DM1(object):
    
        def process_request(self, request, spider):
            print('M1 request', request)
            return None
    
        def process_response(self, request, response, spider):
            print('M1 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass
    
    
    class DM2(object):
    
        def process_request(self, request, spider):
            print('M2 request', request)
            return None
    
        def process_response(self, request, response, spider):
            print('M2 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass
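
    With both middlewares enabled as in section 3, and assuming a spider that fetches http://quotes.toscrape.com/page/1/, each request prints roughly:

    M1 request <GET http://quotes.toscrape.com/page/1/>
    M2 request <GET http://quotes.toscrape.com/page/1/>
    M2 response <200 http://quotes.toscrape.com/page/1/>
    M1 response <200 http://quotes.toscrape.com/page/1/>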

    b. process_request returns a Response

    Order: md1 request -> md2 response -> md1 response (DM2's process_request and the download itself are skipped; the returned Response goes straight into the process_response chain)

    from scrapy.http import Response
    
    
    class DM1(object):
    
        def process_request(self, request, spider):
            print('M1 request', request)
            return Response(url="www.test.com", status=200, headers=None, body=b'test')
    
        def process_response(self, request, response, spider):
            print('M1 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass
    
    
    class DM2(object):
    
        def process_request(self, request, spider):
            print('M2 request', request)
            return None
    
        def process_response(self, request, response, spider):
            print('M2 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass

    c. process_request returns a Request

    This stalls the crawl in a loop: the new request goes back to the scheduler -> through the downloader middleware again -> back to the scheduler, and so on, so nothing is ever downloaded.

    from scrapy.http import Request
    
    
    class DM1(object):
    
        def process_request(self, request, spider):
            print('M1 request', request)
    
            return Request("http://quotes.toscrape.com/page/2/")
    
        def process_response(self, request, response, spider):
            print('M1 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass
    
    
    class DM2(object):
    
        def process_request(self, request, spider):
            print('M2 request', request)
            return None
    
        def process_response(self, request, response, spider):
            print('M2 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass
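
    Because the replacement request re-enters process_request and is replaced again, the unconditional return in DM1 above never lets the crawl progress. Below is a minimal sketch of a guarded variant that redirects each request only once; GuardedDM1 and the 'redirected' meta flag are made up for this sketch:

    from scrapy.http import Request
    
    
    class GuardedDM1(object):
    
        def process_request(self, request, spider):
            # Replace only requests we have not already replaced;
            # 'redirected' is a hypothetical marker, not a Scrapy built-in
            if not request.meta.get('redirected'):
                return Request(
                    "http://quotes.toscrape.com/page/2/",
                    meta={'redirected': True},
                )
            return None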

    d. process_request raises an exception, which process_exception must handle. If every process_exception returns None, the exception keeps propagating and the request is dropped; returning a Response or a Request from process_exception resumes normal processing instead.

    from scrapy.exceptions import IgnoreRequest
    
    
    class DM1(object):
    
        def process_request(self, request, spider):
            print('M1 request', request)
            raise IgnoreRequest('an exception occurred')
    
        def process_response(self, request, response, spider):
            print('M1 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            # catches the IgnoreRequest raised in process_request above
            print(exception)
    
    
    class DM2(object):
    
        def process_request(self, request, spider):
            print('M2 request', request)
            return None
    
        def process_response(self, request, response, spider):
            print('M2 response', response)
            return response
    
        def process_exception(self, request, exception, spider):
            pass

    3. Configuration (settings.py)

    DOWNLOADER_MIDDLEWARES = {
       # 'toscrapy.middlewares.ToscrapyDownloaderMiddleware': 543,
       'toscrapy.downloadermd.DM1': 543,
       'toscrapy.downloadermd.DM2': 545,
    }
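
    The numbers order the chain relative to the engine: process_request methods run in increasing order of these values (DM1 before DM2), while process_response and process_exception methods run in decreasing order (DM2 before DM1). That is exactly the md1 request -> md2 request -> md2 response -> md1 response pattern from section a.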

    II. Spider Middleware

    1. Use cases

    Depth and priority (see the sketch below)
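
    For instance, crawl depth can be capped from process_spider_output. The sketch below is a simplified version of the idea behind Scrapy's built-in DepthMiddleware (normally you would just set DEPTH_LIMIT and use the built-in); MAX_DEPTH and the class name are made up:

    from scrapy.http import Request
    
    MAX_DEPTH = 3  # hypothetical limit for this sketch
    
    
    class DepthLimitSpiderMiddleware(object):
    
        def process_spider_output(self, response, result, spider):
            for item_or_request in result:
                if isinstance(item_or_request, Request):
                    # response.meta is the meta of the request that produced
                    # this response; requests yielded from it sit one level deeper
                    depth = response.meta.get('depth', 0) + 1
                    if depth > MAX_DEPTH:
                        continue  # drop requests that go too deep
                    item_or_request.meta['depth'] = depth
                yield item_or_request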

    2. Defining the class

    from scrapy import signals
    
    
    class MySpiderMiddleware(object):
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            # Without this connection, spider_opened below would never fire
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        # Called for each response as it passes into the spider's parse method
        def process_spider_input(self, response, spider):
            print('in')
            return None
    
        # Called once per response, with the requests/items yielded by parse
        def process_spider_output(self, response, result, spider):
            print('out')
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            pass
    
        # Called only once, when the spider starts, with its start requests
        def process_start_requests(self, start_requests, spider):
            print('start')
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)

    3. Configuration (settings.py)

    SPIDER_MIDDLEWARES = {
       'toscrapy.spidermd.MySpiderMiddleware': 543,
    }