1 import requests
2 from lxml import etree
3
4
5 # 将可以使用的代理IP的信息存储到文件
# Append usable proxy IPs to a text file, one "ip:port" per line.
def write_proxy(proxies, filename="ip_proxy.txt"):
    """Append each proxy in *proxies* to *filename* (default ip_proxy.txt).

    Fixes the original broken string literal (the newline was split across
    a quoted string, a SyntaxError) and opens the file once instead of
    reopening it for every proxy.
    """
    print(proxies)
    # Open once in append mode; each proxy gets its own line.
    with open(filename, 'a+') as f:
        for proxy in proxies:
            print("正在写入:", proxy)
            f.write(proxy + '\n')
    print("录入完成!!!")
13
14
15 # 解析网页,并得到网页中的代理IP
# Parse the proxy-list page and extract "ip:port" strings.
def get_proxy(html):
    """Extract proxy addresses from *html* and pass them to test_proxies.

    The xici table (id="ip_list") has its data in td[2] (IP) and td[3]
    (port) of each row; the first row is the column header and is skipped.
    """
    selector = etree.HTML(html)
    proxies = []
    for row in selector.xpath('//table[@id="ip_list"]/tr')[1:]:
        ip_cells = row.xpath("./td[2]/text()")
        port_cells = row.xpath("./td[3]/text()")
        # Some rows (section sub-headers) have no <td> cells; the original
        # [0] indexing would raise IndexError on them — skip instead.
        if not ip_cells or not port_cells:
            continue
        proxies.append(ip_cells[0] + ":" + port_cells[0])
    print(len(proxies))
    # Hand off to the availability checker, which also persists the results.
    test_proxies(proxies)
31
32
33 # 验证已得到IP的可用性,本段代码通过访问百度网址,返回的response状态码判断(是否可用)。
# Verify proxy availability by fetching Baidu through each one; proxies that
# return HTTP 200 are considered usable and written to disk via write_proxy.
def test_proxies(proxies):
    """Probe each proxy with a 1-second GET to Baidu and persist the good ones.

    A proxy counts as usable only when the response status is exactly 200;
    timeouts/connection errors are reported and skipped.
    """
    url = "http://www.baidu.com/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    normal_proxies = []
    # enumerate replaces the original hand-rolled counter.
    for count, proxy in enumerate(proxies, start=1):
        print("第%s个。。" % count)
        try:
            # Short timeout: a usable proxy should answer quickly.
            response = requests.get(url, headers=header, proxies={"http": proxy}, timeout=1)
            if response.status_code == 200:
                print("该代理IP可用:", proxy)
                normal_proxies.append(proxy)
            else:
                print("该代理IP不可用:", proxy)
        except requests.RequestException:
            # Narrowed from bare `except Exception`: only network-level
            # failures (timeout, refused connection, bad proxy) are expected.
            print("该代理IP无效:", proxy)
    write_proxy(normal_proxies)
57
58
# Download one proxy-list page and feed the HTML to the parser.
def get_html(url, timeout=10):
    """Fetch *url* with a browser User-Agent and pass the body to get_proxy.

    Adds a default request timeout: the original call had none, so a stalled
    server would hang the script forever. The extra parameter defaults to 10
    seconds and is backward-compatible with existing callers.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    response = requests.get(url, headers=header, timeout=timeout)
    get_proxy(response.text)
66
67
if __name__ == "__main__":
    # Crawl the first three pages of the high-anonymity proxy listing.
    base_url = "http://www.xicidaili.com/nn/%s/"
    for page in range(1, 4):
        get_html(base_url % page)