Target websites to scrape
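All of the snippets below share a few imports and a `headers` dict that the code refers to but never defines up front; here is a minimal sketch of that setup (the User-Agent string is only an example, any browser-like value will do):

```python
import json

import requests
from lxml import etree

# A browser-like User-Agent so the sites are less likely to reject our requests
# (the exact string is illustrative only)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
```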
66ip
```python
## Single-page scraping function
def parse_page(url):
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        html = etree.HTML(r.text)
        trs = html.xpath('//div[@align="center"]/table//tr')
        for tr in trs[1:]:
            ip = tr.xpath('.//td[1]/text()')[0]
            port = tr.xpath('.//td[2]/text()')[0]
            ip_port = ip + ":" + port

def main():
    ## Page through the site
    for i in range(1, 34):
        url = "http://www.66ip.cn/areaindex_%d/1.html" % i
        parse_page(url)

if __name__ == '__main__':
    main()
```
proxylist
This website is http://proxylist.fatezero.org/.
Opening it, we see the page shown above.
Looking at the page source, we find that it contains no IPs at all, which means the page loads them via Ajax. Inspecting the page and switching to the Network tab, we can easily find the API endpoint.
Opening the endpoint URL is very convenient: it returns the proxy IPs as JSON records, one per line, each separated by a newline. So all we need to do is request and scrape the endpoint itself, i.e. http://proxylist.fatezero.org/proxy.list.
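Judging from the fields used in the code below, each line of the response is a single JSON object with fields such as `host`, `port`, `type`, and `anonymity`; a tiny sketch of parsing one such line (the values here are made up for illustration):

```python
import json

# A hypothetical single line from proxy.list (values are illustrative only)
line = '{"host": "1.2.3.4", "port": 8080, "type": "http", "anonymity": "high_anonymous"}'
record = json.loads(line)
print(record['host'], record['port'], record['type'], record['anonymity'])
```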
Here is the code:
```python
url = "http://proxylist.fatezero.org/proxy.list"
r = requests.get(url, headers=headers)
if r.status_code == 200:
    ## split on newlines so each proxy record becomes one list element
    lists = r.text.split('\n')
    for i in lists:
        try:
            li = json.loads(i, strict=False)
            ## Keep only high-anonymity HTTP proxies
            if str(li['anonymity']) == 'high_anonymous' and str(li['type']) == 'http':
                ip_port = str(li['host']) + ":" + str(li['port'])
        except:
            continue
```
Kuaidaili
Extraction from Kuaidaili is fairly simple; the code is as follows:
```python
## Single-page scraping function
def parse_page(url):
    r = requests.get(url, headers=headers)
    html = etree.HTML(r.text)
    trs = html.xpath('//tbody//tr')
    for tr in trs:
        ip = tr.xpath('./td[1]/text()')[0]
        port = tr.xpath('./td[2]/text()')[0]
        ip_port = ip + ":" + port

def main():
    ## Page through the site
    for i in range(1, 30):
        url = "https://www.kuaidaili.com/free/inha/%d" % i
        parse_page(url)

if __name__ == '__main__':
    main()
```
Verifying that the IPs work
The ip:port pairs scraped from the three sites above are stored in the temp list.
```python
def test_proxy():
    ## Iterate over a copy so removing items does not skip entries
    for ip_port in temp[:]:
        proxy = {
            'http': ip_port
        }
        try:
            r = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=5)
            print(r.status_code)
            if r.status_code != 200:
                temp.remove(ip_port)
        except:
            temp.remove(ip_port)
            print("failed:{}".format(ip_port))
```
The try/except block filters out proxies that cannot reach www.baidu.com at all, and the status-code check then drops proxies that return anything other than 200.
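One thing to watch out for: removing elements from a list while iterating over it can skip entries, which is why the version above iterates over a copy (`temp[:]`). An alternative sketch that builds a new list of working proxies instead (the `valid_proxies` name is just for illustration):

```python
def test_proxy():
    ## Collect working proxies into a new list instead of mutating temp in place
    valid_proxies = []
    for ip_port in temp:
        proxy = {'http': ip_port}
        try:
            r = requests.get('http://www.baidu.com', headers=headers,
                             proxies=proxy, timeout=5)
            if r.status_code == 200:
                valid_proxies.append(ip_port)
            else:
                print("failed:{}".format(ip_port))
        except:
            print("failed:{}".format(ip_port))
    return valid_proxies
```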
Complete code
```python
from lxml import etree
import requests
import json

# Browser-like request headers (the User-Agent value is only an example)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

temp = []

def get_66ip():
    def parse_page(url):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            html = etree.HTML(r.text)
            trs = html.xpath('//div[@align="center"]/table//tr')
            for tr in trs[1:]:
                ip = tr.xpath('.//td[1]/text()')[0]
                port = tr.xpath('.//td[2]/text()')[0]
                ip_port = ip + ":" + port
                temp.append(ip_port)
    for i in range(1, 34):
        url = "http://www.66ip.cn/areaindex_%d/1.html" % i
        parse_page(url)

def pro():
    url = "http://proxylist.fatezero.org/proxy.list"
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        lists = r.text.split('\n')
        for i in lists:
            try:
                li = json.loads(i, strict=False)
                if str(li['anonymity']) == 'high_anonymous' and str(li['type']) == 'http':
                    ip_port = str(li['host']) + ":" + str(li['port'])
                    temp.append(ip_port)
            except:
                continue

def kuai():
    def parse_page(url):
        r = requests.get(url, headers=headers)
        html = etree.HTML(r.text)
        trs = html.xpath('//tbody//tr')
        for tr in trs:
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            ip_port = ip + ":" + port
            temp.append(ip_port)
    for i in range(1, 30):
        url = "https://www.kuaidaili.com/free/inha/%d" % i
        parse_page(url)

def test_proxy():
    ## Iterate over a copy so removing items does not skip entries
    for ip_port in temp[:]:
        proxy = {
            'http': ip_port
        }
        try:
            r = requests.get('http://www.baidu.com', headers=headers, proxies=proxy, timeout=5)
            print(r.status_code)
            if r.status_code != 200:
                temp.remove(ip_port)
        except:
            temp.remove(ip_port)
            print("failed:{}".format(ip_port))

if __name__ == '__main__':
    get_66ip()
    kuai()
    pro()
    test_proxy()
    for i in temp:
        print(i)
    # Append the surviving proxies to a txt file, one per line
    with open('2222.txt', 'a', encoding='utf-8') as f:
        for i in temp:
            f.write(i)
            f.write('\n')
```
Wrapping up
That wraps up scraping proxy IPs. You could also consider deploying this on a server and exposing it as an API; if you are interested, explore that on your own.
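As a starting point for that idea, here is a minimal sketch of such an API using Flask (Flask is just one possible choice; the /proxies route is hypothetical, and the 2222.txt filename follows the code above):

```python
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/proxies')
def proxies():
    # Read the previously saved proxies, one ip:port per line
    with open('2222.txt', encoding='utf-8') as f:
        proxy_list = [line.strip() for line in f if line.strip()]
    return jsonify(proxy_list)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```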