• 免费代理ip爬虫分享


    分享一个某代理网站的免费代理ip的爬虫,直接复制到pycharm运行就可以了。

    注意:爬取的代理ip有点坑,因为是免费的所以过期时间很快,可能1分钟后就会失效。并且在scrapy使用这些代理ip还会给你打印一堆广告。且用且珍惜。

    import requests
    from lxml import etree
    import json
    
    class XiciProxiesSpider(object):
    
        def __init__(self):
            self.start_url = 'http://www.xicidaili.com/nn'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
            }
    
        def get_page_from_url(self, url):
            response = requests.get(url,headers=self.headers)
            return response.content.decode()
    
        def get_data_from_page(self, page):
            # print(page)
            # 把page转换为Element对象
            html = etree.HTML(page)
            # 获取包含代理信息的tr列表
            trs = html.xpath('//*[@id="ip_list"]/tr')[1:]
    
            # 遍历trs, 获取数据信息
    
            data = {
                'http':[],
                'https':[]
            }
    
            for tr in trs:
                try:
                    ip = tr.xpath('./td[2]/text()')[0]
                    port = tr.xpath('./td[3]/text()')[0]
                    ip_type = tr.xpath('./td[6]/text()')[0].lower()
                    # 如果ip不是http或https直接返回
                    if ip_type not in data.keys():
                        return
                    # 构建代理数据
                    item = {ip_type: '{}://{}:{}'.format(ip_type, ip, port)}
                    # 检查代理IP是否可用, 如果可用添加到列表中
                    if self.validate_ip(item, ip_type):
                        print(item[ip_type])
                        data[ip_type].append(item[ip_type])
                except Exception as ex:
                    print(ex)
                    print(etree.tostring(tr))
    
            # print(data)
            return data
    
        def validate_ip(self, item, ip_type):
    
            try:
                test_url = "{}://baidu.com".format(ip_type)
                response = requests.get('http://baidu.com', proxies=item, timeout=2)
                if response.status_code == 200:
                    return True
                return False
            except Exception as ex:
                return False
    
        def save_data(self, data):
            with open('proxies.json', 'w') as f:
                json.dump(data, f, indent=2)
    
    
        def run(self):
            # 获取页面内宽容
            page = self.get_page_from_url(self.start_url)
            # 获取可用代理IP
            data = self.get_data_from_page(page)
            # 保存数据
            self.save_data(data)
    
    if __name__ == '__main__':
        fps = XiciProxiesSpider()
        fps.run()
  • 相关阅读:
    codeforces round 512 F. Putting Boxes Together 树状数组维护区间加权平均数
    sgu 110 射线关于球的反射光线
    快速读模板
    HDU-3506 二维四边形不等式
    BZOJ 1563 四边形不等式
    HIT Summer Day17 计算几何初步 题解
    IME Starters Try-outs 2018 C. China Adventures
    IME Starters Try-outs 2018 J. JHADCBEIGF
    ACM International Collegiate Programming Contest, Amman Collegiate Programming Contest (2018) GYM 100810 K. League of Demacia
    Codeforces Round #493 (Div. 1) C. Sky Full of Stars
  • 原文地址:https://www.cnblogs.com/liuguangbiao/p/9407086.html
Copyright © 2020-2023  润新知