• 爬虫下载中间件


    # 设置随机请求头  设置代理ip
    # 在middleware.py文件中 写一个类
    
    class MiddlewearproDownloaderMiddleware(object):
    
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        PROXY_http = [
            '153.180.102.104:80',
            '195.208.131.189:56055',
        ]
        PROXY_https = [
            '120.83.49.90:9000',
            '95.189.112.214:35508',
        ]
        
        # 拦截所有的正常对象
        def process_request(self, request, spider):
            # 随机选择一个UA
            user_agent = random.choice(self.user_agent_list)
            request.headers['User-Agent'] = user_agent
    
        # 拦截所有的响应对象
        def process_response(self, request, response, spider):
    
            return response
    
        # 拦截发生异常的请求对象
        def process_exception(self, request, exception, spider):
    
            #设置代理ip
            if request.url.split(':')[0] == 'http':
                request.meta['proxy'] = 'http://'+random.choice(self.PROXY_http)
            else:
                request.meta['proxy'] = 'https://'+random.choice(self.PROXY_https)
                
                
    # 在settings文件中修改中间件  只需要类名改成新的类名就可以
    DOWNLOADER_MIDDLEWARES = {
       'UA_demo.middlewares.MiddlewearproDownloaderMiddleware': 543,
    }
    
    
    # 爬虫文件 打印一下请求头看下
    # -*- coding: utf-8 -*-
    import scrapy
    import json
    
    class UaSpider(scrapy.Spider):
        name = 'httpbin'
        allowed_domains = ['httpbin.org']
        start_urls = ['http://httpbin.org/user-agent']
    
        def parse(self, response):
            user_agent=json.loads(response.text)["user-agent"]
            print(user_agent)
            yield scrapy.Request(self.start_urls[0],dont_filter=True) 
            # 因为scrapy框架会去重,如果不写dont_filter=True拿完第一个url之后就不会再去执行'http://httpbin.org/user-agent'了
  • 相关阅读:
    Jquery事件
    基础:装箱和拆箱...
    navicat编辑表的作用
    谷歌浏览器preview展示问题
    @Scheduled并行执行
    spring异步执行方法线程池的配置
    dubbo的ExceptionFilter异常处理
    dubbo异常处理
    idea设置启动jvm参数
    前后端分离走本地代码Charles的使用
  • 原文地址:https://www.cnblogs.com/kenD/p/11123707.html
Copyright © 2020-2023  润新知