利用多线程检测代理网站提供的免费代理是否可用
1 import requests
2 from lxml import etree
3 import time
4 import multiprocessing
5
6 def get_all_proxy(queue):
7 url = 'http://www.xicidaili.com/nn/1'
8 headers = {
9 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
10 }
11 response = requests.get(url, headers=headers)
12 html_ele = etree.HTML(response.text)
13
14 ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
15 port_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
16 # proxy_list = []
17 for i in range(0,len(ip_eles)):
18 proxy_str = 'http://' + ip_eles[i] + ':' + port_ele[i]
19 #proxy_list.append(proxy_str)
20 #print(proxy_str)
21 queue.put(proxy_str)
22
23 def check_one_proxy(proxy):
24 try:
25 #proxy = proxy_and_queue
26 url = 'http://www.baidu.com/s?wd=ip'
27 proxy_dict = {
28 'http': proxy
29 }
30 try:
31 response = requests.get(url, proxies=proxy_dict, timeout=5)
32 if response.status_code == 200:
33 print(proxy)
34 return proxy
35 else:
36 print('bad '+proxy)
37 return proxy
38 except:
39 return None
40 except Exception as e:
41 print(e)
42
43 if __name__ == '__main__':
44 start_time = time.time()
45 # 创建队列
46 q = multiprocessing.Queue()
47 # pool 进程池中, 要用的是下面的这个queue
48 #result_q = multiprocessing.Manager().Queue()
49 # 获取所有代理
50 p = multiprocessing.Process(target=get_all_proxy, args=(q,))
51 p.start()
52 # proxy_list = get_all_proxy()
53 # 检测代理的可用性
54
55 pool = multiprocessing.Pool(30)
56 result_list = []
57 while True:
58 try:
59 proxy_str = q.get(timeout=5)
60 except:
61 break
62 #print('apply_async 之前')
63 #proxy_and_queue = [proxy_str, result_q]
64 proxy_res = pool.apply_async(check_one_proxy, (proxy_str,))
65 result_list.append(proxy_res)
66 #valid_proxy_list = check_all_proxy(proxy_list)
67
68 valid_proxy_list = []
69 for proxy_res in result_list:
70 result = proxy_res.get()
71 if result is None:
72 pass
73 else:
74 valid_proxy_list.append(result)
75 #print(result)
76 print('All proxy we can get:')
77 print(valid_proxy_list)
78 pool.close()
79 pool.join()
80 p.join()
81
82 end_time = time.time()
83 print('--'*30)
84 # print(valid_proxy_list)
85 print('耗时:' + str(end_time-start_time))