• Python爬取免费西祠代理


    #!/usr/local/bin/python3.7
    
    """
    @File    :   xicidaili.py
    @Time    :   2020/06/02
    @Author  :   Mozili
    
    """
    
    import urllib.request
    import urllib.parse
    from lxml import etree
    import random
    import time
    
    def handler_request(url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait on blocking network operations before
                giving up. Optional; without a timeout a stalled server
                would block the crawler indefinitely.

        Returns:
            The response body decoded with the default codec of
            ``bytes.decode`` (UTF-8).

        Raises:
            urllib.error.URLError: If the request fails or times out.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Request headers: present a browser-like User-Agent.
        headers = {
         'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
         }
        # Build the request object.
        req = urllib.request.Request(url=url, headers=headers)
        # Send the request; the context manager closes the response even when
        # reading or decoding raises, so connections are never leaked.
        with urllib.request.urlopen(req, timeout=timeout) as res:
            return res.read().decode()
    
    def preserve_data(ips, ports, types):
        """Append the scraped proxies to ``Reptile/daili.txt``.

        The three lists are read in lockstep: the n-th ip, port and type
        describe one proxy, written as a ``"<type> <ip>:<port>"`` line.

        Args:
            ips: Proxy IP addresses as strings.
            ports: Proxy ports as strings, parallel to ``ips``.
            types: Proxy types as strings, parallel to ``ips``.

        Returns:
            None. Nothing is written (and no file is created) when any of
            the lists is empty.
        """
        import os

        # zip pairs the columns row by row and stops at the shortest list,
        # which is what the earlier break/del bookkeeping achieved, but
        # without mutating the caller's lists or shadowing the builtin str.
        lines = [
            proxy_type + ' ' + ip + ':' + port + '\n'
            for ip, port, proxy_type in zip(ips, ports, types)
        ]
        if not lines:
            return

        # The output directory may not exist on a first run.
        os.makedirs('Reptile', exist_ok=True)
        # Open the file once per batch instead of once per row.
        with open('Reptile/daili.txt', 'a', encoding='utf-8') as fp:
            fp.writelines(lines)
            
    def download_content(tree):
        """Pull the proxy columns out of a parsed page and save them.

        Args:
            tree: Parsed document exposing ``xpath``; the main script passes
                the result of ``etree.HTML``.
        """
        # One XPath per column of the rows marked class="odd":
        # IP address, port and proxy type, in that order.
        column_queries = [
            "//tr[@class='odd']/td[2]/text()",
            "//tr[@class='odd']/td[3]/text()",
            "//tr[@class='odd']/td[6]/text()",
        ]
        addresses, port_numbers, proxy_kinds = [
            tree.xpath(query) for query in column_queries
        ]
        # Persist the extracted columns to the text file.
        preserve_data(addresses, port_numbers, proxy_kinds)
    
    
    if __name__ == "__main__":
        # Ask for the inclusive page range to crawl.
        first_page = int(input('请输入起始页码:'))
        last_page = int(input('请输入结束页码:'))
        # Base URL of each proxy listing to walk through.
        listing_roots = [
            'https://www.xicidaili.com/nn/',
            'https://www.xicidaili.com/nt/',
            'https://www.xicidaili.com/wn/',
            'https://www.xicidaili.com/wt/',
            'https://www.xicidaili.com/qq/'
            ]
        # Every page of every listing, in listing-major order.
        page_urls = (
            f"{root}{number}"
            for root in listing_roots
            for number in range(first_page, last_page + 1)
        )
        for page_url in page_urls:
            # Fetch the page source.
            html = handler_request(page_url)
            # Pause one second between requests.
            time.sleep(1)
            # Parse the HTML and store the proxies found on the page.
            download_content(etree.HTML(html))
        
  • 相关阅读:
    敏捷的调试
    敏捷的编码
    敏捷的需求分析
    敏捷的反馈
    敏捷的方法论
    敏捷的世界观
    MarkDown添加数学公式
    性能分析初学者指南
    可执行文件的装载与进程
    会话技术------客户端技术cookie
  • 原文地址:https://www.cnblogs.com/lxmtx/p/13031894.html
Copyright © 2020-2023  润新知