import requests import queue import threading from lxml import etree #要爬取的URL url = "http://xxxxx" #代理ip网站 proxy_url = "https://www.kuaidaili.com/free/inha/{page}/" class MyThreadPool: def __init__(self, maxsize): self.maxsize = maxsize self._pool = queue.Queue(maxsize) for _ in range(maxsize): self._pool.put(threading.Thread) def get_thread(self): return self._pool.get() def add_thread(self): self._pool.put(threading.Thread) def get_url(url): headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', } response = requests.get(url,headers=headers) html_str = response.text return html_str def proxy_get_url(url,prox): proxies = {} proxies["http"] = prox headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36', } response = requests.get(url,headers=headers,proxies=proxies,timeout=3) html_str = response.text return html_str def ip_proxy(html_str): html = etree.HTML(html_str) ip_list = html.xpath('//tr/td[@data-title="IP"]/text()') port_list = html.xpath('//tr/td[@data-title="PORT"]/text()') http_list = [] for i in range(len(ip_list)): http_proxy = ip_list[i]+":"+port_list[i] http_list.append(http_proxy) return http_list def available_ip(ip_list): for ip in ip_list: try: proxy_get_url('https://www.baidu.com/',ip) except Exception as e: continue IP_LIST.append(ip) if __name__ == "__main__": IP_LIST = [] pool = MyThreadPool(20) #线程池数 #验证代理ip for i in range(1,20): #页数 page_ip = get_url(proxy_url.format(page=i)) ip_list = ip_proxy(page_ip) t = pool.get_thread() obj = t(target=available_ip,args=(ip_list,)) obj.start() #爬取网站 for ip in IP_LIST: try: proxy_get_url(url,ip) except Exception as e: continue print(ip)
#使用一个ip爬取网站,如果ip不可用了删除ip
while IP_LIST:
try:
print(IP_LIST[0])
proxy_get_url(url,IP_LIST[0])
except Exception as e:
del IP_LIST[0]
continue