• scrapy框架使用ip代理(ip池) #request.meta['proxy'] = "http://122.7.199.137:4558"


    在中间件文件 middlewares.py 中写入一个类,然后在 settings.py 的 DOWNLOADER_MIDDLEWARES = {} 中启用它。

    具体代码如下:

    import logging
    import random

    import requests
    from scrapy.http import HtmlResponse
    logger = logging.getLogger(__name__)

    # Module-level state shared by every middleware instance:
    #   ip_pool  -- candidate proxies as "host:port" strings
    #   pro_addr -- the proxy currently in use ('' when none has been chosen)
    ip_pool = []
    pro_addr = ''


    class proxyMiddleware(object):
        """Downloader middleware that attaches a verified proxy to requests.

        Only spiders whose name contains ``"jdzgb"`` are affected.  For those,
        a proxy is taken from ``ip_pool`` (refilled from ``get_ip_url`` when it
        runs low), checked against ``check_url``, and written to
        ``request.meta['proxy']``.  A proxy that passes the check is reused for
        later requests until it fails; a failing proxy is dropped from the pool
        and another one is tried.

        NOTE(review): the ``requests`` calls below are synchronous, so every
        scheduled request waits on a live check -- confirm this is acceptable
        for the crawl rate you need.
        """

        # Placeholder endpoint of the proxy provider; replace with the real URL.
        get_ip_url = "http://d.jghttp.golangapi.com/getipxxxxxx"
        # Page fetched through a proxy to decide whether the proxy is alive.
        check_url = 'https://www.baidu.com'
        # Refill the pool whenever it holds fewer than this many addresses.
        min_pool_size = 3
        # Timeout, in seconds, for the provider call and the liveness check.
        check_timeout = 4
        # Upper bound on proxies tried per request, so a dead provider cannot
        # stall the crawl forever.
        max_attempts = 5

        def process_request(self, request, spider):
            """Set ``request.meta['proxy']`` to a working proxy.

            Args:
                request: the outgoing request; only its ``meta`` dict is written.
                spider: the running spider; only its ``name`` is read.

            Returns:
                Always ``None``.  When no working proxy is found within
                ``max_attempts`` tries, the request is left without a proxy and
                a warning is logged.
            """
            global pro_addr

            if "jdzgb" not in spider.name:
                return None

            for _ in range(self.max_attempts):
                self._refill_pool()
                if not pro_addr:
                    if not ip_pool:
                        # Provider gave us nothing; retrying immediately would
                        # only hammer it.
                        break
                    pro_addr = random.choice(ip_pool)

                if self._proxy_works(pro_addr):
                    request.meta['proxy'] = "http://" + pro_addr
                    # To rotate proxies on every request, reset ``pro_addr`` to
                    # '' here; as written, one proxy is reused until it fails.
                    return None

                # Dead proxy: forget it so the next pass picks a different one.
                if pro_addr in ip_pool:
                    ip_pool.remove(pro_addr)
                pro_addr = ''

            logger.warning(
                "No working proxy found for %s; sending request without proxy",
                spider.name,
            )
            return None

        def _refill_pool(self):
            """Top up ``ip_pool`` from the provider when it is running low."""
            if len(ip_pool) >= self.min_pool_size:
                return
            try:
                response = requests.get(self.get_ip_url, timeout=self.check_timeout)
            except requests.RequestException as exc:
                logger.warning("Could not fetch proxy list: %s", exc)
                return
            # Presumably the provider returns one "host:port" per line -- verify
            # against the provider's response format.
            for line in response.text.splitlines():
                address = line.strip()
                if address and address not in ip_pool:
                    ip_pool.append(address)

        def _proxy_works(self, address):
            """Return True when ``check_url`` answers 200/304 through ``address``."""
            proxy_url = "http://" + address
            # Both schemes must be mapped: ``check_url`` is HTTPS, and requests
            # selects the proxy by the scheme of the target URL.
            proxies = {"http": proxy_url, "https": proxy_url}
            try:
                response = requests.get(
                    self.check_url,
                    proxies=proxies,
                    timeout=self.check_timeout,
                    verify=False,
                )
            except requests.RequestException as exc:
                logger.debug("Proxy %s failed the check: %s", address, exc)
                return False
            logger.debug("Proxy %s answered %s", address, response.status_code)
            return response.status_code in (200, 304)

      

  • 相关阅读:
    Centos7创建用户su登录后显示为 bash-4.1$
    winserver 搭建 Citrix License 许可服务器
    Centos7扩展存储空间
    CentOS7配置crate集群
    CentOS7安装cratedb
    初探Vue
    浅谈web攻防
    响应式布局
    JavaScript正则表达式,你真的知道?
    15个关于Chrome的开发必备小技巧[译]
  • 原文地址:https://www.cnblogs.com/qiaoer1993/p/11321159.html
Copyright © 2020-2023  润新知