• Scrapy 中间件


    一、下载中间件

    from scrapy import signals
    from scrapy.http import Response
    from scrapy.exceptions import IgnoreRequest
    from AMAZON.proxy_handle import get_proxy,delete_proxy
    # print('eeeeeeeeeeee',get_proxy())
    
    class DownMiddleware1(object):
        """Scrapy downloader middleware that sends each request through a proxy.

        A proxy address is obtained from ``get_proxy()`` for every outgoing
        request. When a download fails, the proxy that was in use is removed
        with ``delete_proxy()`` and the request is re-queued with a new one.

        NOTE(review): ``get_proxy()`` is presumably expected to return a bare
        ``"ip:port"`` string (it is concatenated after ``'http://'``) -- confirm
        against ``AMAZON.proxy_handle``.
        """

        def process_request(self, request, spider):
            """Attach a proxy and a download timeout to an outgoing request.

            Called for every request that passes through the downloader
            middleware chain before it is downloaded.

            :param request: the request about to be downloaded
            :param spider: the spider that issued the request
            :return:
                None -- continue with the remaining middlewares and download;
                Response -- stop calling process_request and start running
                process_response;
                Request -- stop the middleware chain and reschedule the
                returned request;
                raise IgnoreRequest -- stop calling process_request and start
                running process_exception.
                This implementation always returns None.
            """
            print('下载中间件1')

            # Fetch the proxy exactly once so that the value logged is the
            # value actually used, and no pool entry is requested needlessly.
            proxy = get_proxy()
            print('gggggggggggggggggggg', proxy)

            # Authenticated proxies use the form 'http://user:pwd@ip:port'.
            request.meta['download_timeout'] = 10
            request.meta['proxy'] = 'http://' + proxy
            print(request.meta)

        def process_response(self, request, response, spider):
            """Pass the downloaded response through unchanged.

            Called with the response produced by the downloader (or by an
            earlier middleware) on its way back to the engine.

            :param request: the request that produced the response
            :param response: the response being processed
            :param spider: the spider the response is intended for
            :return:
                Response -- handed on to the next middleware's
                process_response;
                Request -- stop the middleware chain and reschedule the
                returned request;
                raise IgnoreRequest -- the request's errback is called.
                This implementation always returns ``response``.
            """
            print('response1')
            return response

        def process_exception(self, request, exception, spider):
            """Swap the failed proxy for a new one and retry the request.

            Called when the download handler or a process_request() method of
            a downloader middleware raises an exception.

            :param request: the request whose download raised
            :param exception: the exception that was raised
            :param spider: the spider that issued the request
            :return:
                None -- let the remaining middlewares handle the exception;
                Response -- stop calling further process_exception methods;
                Request -- stop the middleware chain and reschedule the
                returned request.
                This implementation always returns ``request``.
            """
            print('异常1')

            # The failure may have happened before any proxy was assigned
            # (e.g. get_proxy() itself raised in process_request), so the
            # key is looked up defensively instead of indexed directly.
            failed_proxy_url = request.meta.get('proxy')
            if failed_proxy_url:
                # Strip the scheme: 'http://ip:port' -> 'ip:port'.
                old_proxy = failed_proxy_url.split("//")[-1]
                print('oooooooooooo', old_proxy)
                delete_proxy(old_proxy)

            request.meta['proxy'] = 'http://' + get_proxy()
            # The scheduler's duplicate filter has already seen this request;
            # without dont_filter the retry would be dropped instead of being
            # downloaded again.
            request.dont_filter = True
            # NOTE(review): there is no retry limit here, so a request can be
            # retried indefinitely if every proxy keeps failing.
            return request
  • 相关阅读:
    js 防止页面后退的方法
    asp.net 设置网页过期
    C#子类调用基类构造备忘
    asp.net 自定义控件 嵌入资源文件 备忘
    CSS实现高度和宽度自适应
    C# 更新SQL Server数据库备注信息从另一数据库
    asp.net mvc4 学习笔记一(基本原理)
    CommittableTransaction和TransactionScope
    Delphi7 错误: Access violation at address ****** in module 'ntdll.dll'. Read of address ******.
    Delphi XE10 IdFtp 错误:No FTP list parsers have been registered
  • 原文地址:https://www.cnblogs.com/ldq1996/p/8342112.html
Copyright © 2020-2023  润新知