• 爬虫学习笔记:打造自己的代理池


    # -*- coding: utf-8 -*-
    """
    Created on Sat Dec 18 00:00:59 2021
    @author: Hider
    """
    import requests
    import parsel
    import time
    import pandas as pd
    
    # HTTP headers sent with every listing request: a desktop Chrome
    # user-agent string and 'Connection' set to 'close'.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Connection': 'close',
    }
    
    def get_page(page, timeout=10):
        """Download one page of the kuaidaili free-proxy listing and parse it.

        Args:
            page: Page number; it is converted with ``str`` and appended to
                the listing URL.
            timeout: Seconds ``requests`` waits for the server before giving
                up. Without a timeout a stalled connection would block the
                whole scrape forever.

        Returns:
            None. The downloaded HTML is wrapped in a ``parsel.Selector`` and
            handed to ``parse_page``.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        url = 'https://www.kuaidaili.com/free/inha/' + str(page)
        # Always bound the wait: requests has no default timeout.
        response = requests.get(url=url, headers=headers, timeout=timeout)
        html = parsel.Selector(response.text)
        parse_page(html)
    
    def parse_page(html):
        """Collect the proxy rows found in one listing page.

        Every ``<tr>`` in the body of the proxy table becomes a seven-item
        list, in the order IP, port, anonymity level, type, location,
        response speed, last verification time. Each list is appended to
        the module-level ``parse_lists``.

        Args:
            html: Selector for the page; it must provide ``xpath``.

        Returns:
            None. Results are accumulated in ``parse_lists``.
        """
        # One xpath query per column, in output order.
        cell_queries = (
            './td[@data-title="IP"]//text()',
            './td[@data-title="PORT"]//text()',
            './td[@data-title="匿名度"]//text()',
            './td[@data-title="类型"]//text()',
            './td[@data-title="位置"]//text()',
            './td[@data-title="响应速度"]//text()',
            './td[@data-title="最后验证时间"]//text()',
        )
        rows = html.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
        for row in rows:
            record = [row.xpath(query).extract_first() for query in cell_queries]
            parse_lists.append(record)
            # Short pause after each row.
            time.sleep(0.1)
        
    if __name__ == '__main__':
        # parse_page() appends its rows to this module-level list.
        parse_lists = []
        # Scrape listing pages 1 through 20.
        for page in range(1, 21):
            get_page(page)

        # Build the table inside the guard: parse_lists only exists when the
        # file runs as a script, so doing this at import time would raise
        # NameError.
        df = pd.DataFrame(parse_lists, columns=['IP','PORT','匿名度','类型','位置','响应速度','最后验证时间'])
    
    
    
  • 相关阅读:
    vue-以文件流-blob-的形式-下载-导出文件
    vue-element-upload 文件上传打开选择文件弹框前进行提示或操作
    django-创建Template(模板)
    django-配置url
    django-创建页面
    django创建应用及应用模块解释
    django创建项目及目录介绍
    django的安装
    Python2X学习16-python-列表
    Request
  • 原文地址:https://www.cnblogs.com/hider/p/15780049.html
Copyright © 2020-2023  润新知