# -*- coding: utf-8 -*-
"""Scrape "ip:port" proxy candidates and keep the ones that answer a request.

Running the script:

1. downloads the listing pages ``home + "1"`` .. ``home + "20"`` and collects
   every "ip:port" pair from the ``ip_list`` table into ``first_proxy_list``;
2. appends those candidates to ``proxy_first.txt``;
3. probes every candidate concurrently (one gevent greenlet each) and stores
   the ones that return HTTP 200 in ``end_proxy_list``;
4. appends the working proxies to ``proxy_end.txt``.
"""

__author__ = 'litao'
__autor__ = __author__  # original (misspelled) name kept for compatibility

# gevent has to patch the standard library *before* anything imports
# ``socket``/``ssl`` (``urllib.request`` pulls both in).  Patching afterwards
# raises MonkeyPatchWarning and can break HTTPS requests.
from gevent import monkey
monkey.patch_all()

import random  # noqa: E402
import socket  # noqa: E402
import time  # noqa: E402
import urllib.error  # noqa: E402
import urllib.request  # noqa: E402

import gevent  # noqa: E402
from bs4 import BeautifulSoup  # noqa: E402

home = "http://www.xicidaili.com/wt/"
first_proxy_list = []  # every "ip:port" scraped from the listing pages
end_proxy_list = []    # the subset of first_proxy_list that passed test_proxy
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
}

TEST_URL = "https://www.baidu.com/"  # page requested through each candidate
PAGE_COUNT = 20                      # number of listing pages to download


def test_proxy(proxy_key, url=TEST_URL, timeout=5):
    """Probe one proxy and record it in ``end_proxy_list`` if it works.

    Args:
        proxy_key: Proxy address as "ip:port".
        url: Page to request through the proxy.
        timeout: Timeout in seconds passed to the opener.

    Any failure (timeout, refused connection, HTTP error, malformed reply) is
    printed and otherwise ignored, so one bad proxy never aborts the batch.
    """
    print(proxy_key)
    # Map both schemes: with only "http" configured, an https:// URL bypasses
    # the proxy entirely and every candidate would appear to work.
    proxy_support = urllib.request.ProxyHandler(
        {"http": proxy_key, "https": proxy_key}
    )
    # Use a private opener instead of install_opener(): the installed opener
    # is process-wide state shared by all concurrently running greenlets.
    opener = urllib.request.build_opener(proxy_support)
    request = urllib.request.Request(url=url, headers=headers)
    try:
        with opener.open(request, timeout=timeout) as response:
            if response.status == 200:
                end_proxy_list.append(proxy_key)
    except Exception as e:  # deliberate best-effort boundary per proxy
        print("error:", e)


def _parse_proxy_rows(html):
    """Return the "ip:port" strings found in the ``ip_list`` table of *html*.

    Returns an empty list when the table is absent.  Rows with fewer than
    three cells are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find("table", attrs={"id": "ip_list"})
    if table is None:
        return []
    proxies = []
    # The first row is skipped (presumably the header row); the second and
    # third cells are joined as "ip:port".
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            continue
        proxies.append(cells[1].text.strip() + ":" + cells[2].text.strip())
    return proxies


def get_proxy_list():
    """Download the listing pages and fill ``first_proxy_list``.

    A page that cannot be fetched or decoded is reported and skipped so that
    the proxies collected from other pages are not lost.  The function pauses
    120-240 seconds between pages.
    """
    for page in range(1, PAGE_COUNT + 1):
        url = home + str(page)
        print(url)
        request = urllib.request.Request(url=url, headers=headers)
        try:
            with urllib.request.urlopen(request, timeout=20) as response:
                html = response.read().decode()
        except (OSError, UnicodeDecodeError) as e:
            # URLError, HTTPError and socket timeouts are all OSError.
            print("error:", e)
        else:
            print(html)
            found = _parse_proxy_rows(html)
            if not found:
                print("error: no proxies found on", url)
            for proxy_enum in found:
                print(proxy_enum)
                first_proxy_list.append(proxy_enum)
        # Pause between pages; nothing follows the last page, so no sleep.
        if page < PAGE_COUNT:
            time.sleep(random.randint(120, 240))


def join_gevent(first_proxy_list, gevent_list):
    """Spawn one ``test_proxy`` greenlet per proxy and append it to *gevent_list*.

    Args:
        first_proxy_list: Iterable of "ip:port" strings to probe.
        gevent_list: List that receives the spawned greenlets (mutated in
            place so the caller can ``gevent.joinall`` it).
    """
    gevent_list.extend(
        gevent.spawn(test_proxy, proxy) for proxy in first_proxy_list
    )


def main():
    """Scrape candidates, probe them concurrently and save both lists."""
    gevent_list = []
    get_proxy_list()
    # Both files are opened in append mode, so results accumulate over runs.
    with open("proxy_first.txt", 'a', encoding='utf-8') as f:
        f.writelines(item + ' ' for item in first_proxy_list)
    join_gevent(first_proxy_list, gevent_list)
    gevent.joinall(gevent_list)
    print(end_proxy_list)
    with open("proxy_end.txt", 'a', encoding='utf-8') as f:
        f.writelines(item + ' ' for item in end_proxy_list)


if __name__ == "__main__":
    main()