• 抓取代理IP,然后保存成txt


    #!/usr/bin/env python
    # coding=utf-8
    #针对 www.xicidaili.com
    import re
    import random
    import sys
    import time
    import datetime
    import threading
    from random import choice
    import requests
    import bs4
    import string


    # Shared output file, opened in append mode when the module loads;
    # get_ip() writes proxies to it and main() closes it when done.
    file=open('data.txt','a')  
    def get_ip(str1):
        """Fetch one proxy listing page and append its proxies to the output file.

        Args:
            str1: Page number as a string; it is appended to the
                ``http://www.xicidaili.com/wt/`` listing URL.

        Side effects:
            Prints the list of ``ip:port`` strings found on the page and
            writes each one, followed by a single space, to the module-level
            ``file`` handle.
        """
        url = "http://www.xicidaili.com/wt/" + str1
        headers = { "Accept":"text/html,application/xhtml+xml,application/xml;",
                    "Accept-Encoding":"gzip, deflate, sdch",
                    "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
                    "Referer":"http://www.xicidaili.com",
                    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
                    }
        r = requests.get(url, headers=headers)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')

        # A response without any <table> (e.g. an error page) has nothing to
        # extract; treat it as an empty page instead of raising AttributeError.
        table = soup.table
        cells = table.find_all("td") if table is not None else []
        cells_html = str(cells)

        # The digit classes and the literal dots must be backslash-escaped;
        # without the backslashes the patterns match the letter "d" and never
        # capture an address or a port.
        ip_pattern = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')   # IP address cells
        port_pattern = re.compile(r'<td>(\d+)</td>')                # port cells
        ips = ip_pattern.findall(cells_html)
        ports = port_pattern.findall(cells_html)

        # Pair addresses with ports by position in the page.
        proxies = [":".join(pair) for pair in zip(ips, ports)]
        print(proxies)
        for proxy in proxies:
            file.write(str(proxy) + ' ')
    def main():
        """Scrape listing pages 1 through 1999, then close the output file.

        Each page number is passed to ``get_ip`` as a string, with a 0.1 second
        pause after every page. The module-level ``file`` handle is closed even
        if a page fetch raises, so data already written is not left in an
        unclosed file.
        """
        try:
            for page in range(1, 2000):
                get_ip(str(page))
                time.sleep(0.1)  # short pause between consecutive page requests
        finally:
            file.close()
    # Start the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()

  • 相关阅读:
    自调用匿名函数和js的Module模式
    设置一天中不同时段的倒计时,计算时针和分针的夹角
    移动端web开发中对点透的处理,以及理解fastclick如何做到去除300ms延迟
    使用Fiddler改变线上js文件的引用路径
    Linux下常用设置文件和文件夹读写权限操作
    RESTful API
    mysql之load语句
    Django学习之点赞功能
    Django学习之网站图标
    python学习之pyenv
  • 原文地址:https://www.cnblogs.com/wj2ge/p/7009849.html
Copyright © 2020-2023  润新知