• python - scraping free proxy IPs and verifying that they work


    Preface

    I've recently been reviewing Python basics, regular expressions in particular. Regex is powerful, but a bit dry on its own, so I figured I'd put it to use and find something fun to play with.

    00xx01---Proxy IPs

    There are plenty of sites listing free proxy IPs, but saving them one by one is tedious and impractical, so let's scrape them with Python instead.

    00xx02---Extracting IPs with regex

    import requests
    import re

    # Spoof a browser User-Agent to get past basic anti-scraping checks
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    url = "https://www.xicidaili.com/nn/1"

    response = requests.get(url, headers=headers)
    # print(response.text)

    html = response.text
    # print(html)

    # re.S lets "." match newlines as well
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
    ports = re.findall(r"<td>(\d+)</td>", html, re.S)
    print(ips)
    print(ports)

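    One caveat: the port pattern <td>(\d+)</td> also matches any other purely numeric table cell, so the two findall lists can drift out of alignment. A minimal alternative sketch that captures each IP and its port in one pass, assuming each row renders the IP cell immediately before the port cell (the sample html string here is made up for illustration):

    import re

    # Made-up sample row mirroring the table layout assumed above
    html = "<tr><td>1.2.3.4</td><td>8080</td></tr>"

    # One pattern captures IP and port together, so the pairs cannot drift apart;
    # \s* tolerates whitespace or newlines between the two cells
    pairs = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>", html, re.S)
    print(pairs)  # [('1.2.3.4', '8080')]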
     

    00xx03---Pairing IPs with ports

    import requests
    import re

    # Spoof a browser User-Agent to get past basic anti-scraping checks
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    url = "https://www.xicidaili.com/nn/1"

    response = requests.get(url, headers=headers)
    # print(response.text)

    html = response.text
    # print(html)

    # re.S lets "." match newlines as well
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
    ports = re.findall(r"<td>(\d+)</td>", html, re.S)
    # print(ips)
    # print(ports)
    for ip in zip(ips, ports):  # pair each IP with its port
        print(ip)

    00xx04---Verifying that the IPs work

    The idea: use each IP and port as a proxy to visit some website; Baidu will do.

    import requests
    import re

    # Spoof a browser User-Agent to get past basic anti-scraping checks
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    for i in range(1, 1000):
        # page URL
        url = "https://www.xicidaili.com/nn/{}".format(i)

        response = requests.get(url, headers=headers)
        # print(response.text)

        html = response.text

        # re.S lets "." match newlines as well
        ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
        ports = re.findall(r"<td>(\d+)</td>", html, re.S)
        # print(ips)
        # print(ports)
        for ip in zip(ips, ports):  # pair each IP with its port
            proxies = {
                "http": "http://" + ip[0] + ":" + ip[1],
                "https": "http://" + ip[0] + ":" + ip[1]
            }
            try:
                # give up after 3 seconds with no response
                res = requests.get("http://www.baidu.com", proxies=proxies, timeout=3)
                print(ip, "works")
                with open("ip.text", mode="a+") as f:
                    f.write(":".join(ip))  # append to ip.text
                    f.write("\n")  # newline
            except Exception as e:  # catch any error
                print(ip, "does not work")
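    A 200 from Baidu confirms the proxy relays traffic, but not whether it hides your real address. A minimal sketch of a stricter check, assuming the echo service http://httpbin.org/ip is reachable; the proxy address below is a placeholder for one scraped above:

    import requests

    # Placeholder proxy; substitute an "ip:port" pair from the scrape
    addr = "1.2.3.4:8080"
    proxies = {"http": "http://" + addr, "https": "http://" + addr}

    try:
        # httpbin echoes back the origin IP it saw; an anonymous proxy should
        # make this print the proxy's IP rather than your own
        res = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3)
        print(res.json()["origin"])
    except Exception:
        print(addr, "does not work")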

    00xx05---Writing results to a file

    import requests
    import re

    # Spoof a browser User-Agent to get past basic anti-scraping checks
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    url = "https://www.xicidaili.com/nn/1"

    response = requests.get(url, headers=headers)
    # print(response.text)

    html = response.text
    # print(html)

    # re.S lets "." match newlines as well
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
    ports = re.findall(r"<td>(\d+)</td>", html, re.S)
    # print(ips)
    # print(ports)
    for ip in zip(ips, ports):  # pair each IP with its port
        print(ip)
        proxies = {
            "http": "http://" + ip[0] + ":" + ip[1],
            "https": "http://" + ip[0] + ":" + ip[1]
        }
        try:
            # give up after 3 seconds with no response
            res = requests.get("http://www.baidu.com", proxies=proxies, timeout=3)
            print(ip, "works")
            with open("ip.text", mode="a+") as f:
                f.write(":".join(ip))  # append to ip.text
                f.write("\n")  # newline
        except Exception as e:  # catch any error
            print(ip, "does not work")
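    Once ip.text has some entries, they can be loaded back into ready-to-use proxy dicts. A minimal sketch, assuming one "ip:port" per line as written above:

    # Read the saved proxies back from ip.text (one "ip:port" per line)
    with open("ip.text") as f:
        saved = [line.strip() for line in f if line.strip()]

    # Build requests-style proxy dicts for reuse
    proxy_pool = [{"http": "http://" + addr, "https": "http://" + addr} for addr in saved]
    print(len(proxy_pool), "proxies loaded")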

    Scraping a single page yields only a handful of usable proxies, and there are over 3000 pages, so doing this by hand is out of the question.

    00xx06---Scraping in bulk

    #!/usr/bin/env python3
    # coding:utf-8
    # 2019/11/18 22:38
    # lanxing
    import requests
    import re

    # Spoof a browser User-Agent to get past basic anti-scraping checks
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    for i in range(1, 3000):  # scrape roughly 3000 pages
        # page URL
        url = "https://www.xicidaili.com/nn/{}".format(i)

        response = requests.get(url, headers=headers)
        # print(response.text)

        html = response.text
        # print(html)

        # re.S lets "." match newlines as well
        ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", html, re.S)
        ports = re.findall(r"<td>(\d+)</td>", html, re.S)
        # print(ips)
        # print(ports)
        for ip in zip(ips, ports):  # pair each IP with its port
            print(ip)
            proxies = {
                "http": "http://" + ip[0] + ":" + ip[1],
                "https": "http://" + ip[0] + ":" + ip[1]
            }
            try:
                # give up after 3 seconds with no response
                res = requests.get("http://www.baidu.com", proxies=proxies, timeout=3)
                print(ip, "works")
                with open("ip.text", mode="a+") as f:
                    f.write(":".join(ip))  # append to ip.text
                    f.write("\n")  # newline
            except Exception as e:  # catch any error
                print(ip, "does not work")

    00xx07---Wrapping up

    Ha, the scraping feels painfully slow, which is no surprise for a single-threaded script. For faster crawling, a multi-threaded approach is worth trying; see the sketch below.

    I'll flesh the code out properly another time.
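    Since each validation blocks for up to 3 seconds, a thread pool that checks many proxies at once gives the biggest win. A minimal sketch using concurrent.futures; check_proxy, the candidate list, and the worker count of 20 are illustrative choices, not part of the original code:

    import requests
    from concurrent.futures import ThreadPoolExecutor

    def check_proxy(addr):
        """Return the 'ip:port' string if the proxy answers within 3 seconds."""
        proxies = {"http": "http://" + addr, "https": "http://" + addr}
        try:
            requests.get("http://www.baidu.com", proxies=proxies, timeout=3)
            return addr
        except Exception:
            return None

    # Placeholder candidates; in practice, feed in the pairs scraped above
    candidates = ["1.2.3.4:8080", "5.6.7.8:3128"]

    # 20 worker threads validate proxies concurrently instead of one at a time
    with ThreadPoolExecutor(max_workers=20) as pool:
        for addr in pool.map(check_proxy, candidates):
            if addr:
                with open("ip.text", mode="a+") as f:
                    f.write(addr + "\n")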
