设计IP池:
应用场景:
1.防止网站IP检测,封掉IP,终止爬虫程序运行
2.无痕浏览器 绕过非强制验证码问题
3.防识别
设计思路:
1.IP来源
2.IP管理
2.0.IP存活检测
2.1.IP程序中管理
3.IP应用
第一:
1.IP来源做法:(不管你的IP来源在哪里(收费IP网站、免费IP网站……凡是提供IP资源的地方都可以))
源源不断地向提供IP的地方索取IP(这个过程我们不需要吝啬,一直运行即可)
2.IP管理做法:(提供两种方式 Redis池 或者 存入txt文档)
将源源不断的IP存入到Redis池 或者txt文档
2.0.IP存活检测做法:
时间 + 检测接口
2.1.IP程序管理:
将IP放入队列中,针对多线程使用
3.IP应用
将取出来的IP应用到requests中
第二:
代码实现
我这里把IP来源和IP管理写在了一起:
Redis池方式:
import redis
import time
import threading
import random
import requests
import telnetlib
import os
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from queue import Queue


class IPS_():
    """Producer side of the proxy-IP pool.

    Worker threads repeatedly request a proxy from each provider URL, keep
    the ones that accept a TCP connection, and hand them over a queue to
    ``add``, which stores every new proxy in Redis with a time-to-live.
    """

    def __init__(self):
        # NOTE(review): the Redis address/password and the provider
        # credentials embedded in the URLs are hard-coded in source --
        # consider loading them from configuration or the environment.
        # Redis connection settings.
        self.host = '123.59.207.171'
        self.port = 6379
        # Redis returns bytes by default; decode_responses=True yields str.
        self.decode_responses = True
        # Time-to-live, in seconds, of every proxy key written to Redis.
        self.ex = 50
        self.password = 'amms..bridge'
        self.IpUrls = ['http://http1.9vps.com/getip.asp?username=13835372142&pwd=235b75eb472ee6e5afe3418a345773b3&geshi=1&fenge=1&fengefu=&getnum=100',
                       'http://http1.9vps.com/getip.asp?username=13835372142&pwd=784942f3cbc0a52493fd1d1e1764d0ee&geshi=1&fenge=1&fengefu=&getnum=100']
        self.Lock = threading.Lock()
        # Proxies that passed the liveness probe, waiting to be stored.
        self.queue_ip = Queue()
        self.threadPoll = ThreadPoolExecutor(max_workers=20)

    def link(self):
        """Create the pooled Redis client and store it in ``self.re_pool``."""
        self.re_pool = redis.Redis(connection_pool=redis.ConnectionPool(
            host=self.host, port=self.port,
            decode_responses=self.decode_responses, password=self.password))

    def thread_PullIP(self):
        """Submit one ``pullIP`` worker per provider URL to the thread pool.

        Returns immediately; the calling thread carries on while the
        workers run.
        """
        for ipurl in self.IpUrls:
            self.threadPoll.submit(self.pullIP, ipurl)

    def pullIP(self, pro_url):
        """Poll one provider URL forever and queue every reachable proxy.

        Requests are spaced at least 5 seconds apart. After 100 consecutive
        failures the whole process is terminated with exit status 1.

        :param pro_url: provider URL; must contain ``pwd=...&geshi``, the
            ``pwd`` value is used to tag log lines.
        """
        pwd = pro_url.split('pwd=')[1].split('&geshi')[0]
        lasttime = time.time()
        i = 1
        while True:
            try:
                # Wait until 5 seconds have passed since the last request.
                sleeptime = time.time() - lasttime
                if sleeptime < 5:
                    time.sleep(5 - sleeptime)
                lasttime = time.time()
                # Strip surrounding whitespace so a trailing newline cannot
                # break the comparison below or the port parsing.
                ip = requests.get(pro_url, timeout=3).text.strip()
                if ip == 'false!error!请等待 5秒后再提取!':
                    print('false!error!请等待 5秒后再提取! -- {}'.format(pwd))
                    continue
                # Liveness probe: keep the proxy only if a TCP connection
                # succeeds. Close the probe right away so the socket is not
                # leaked.
                parts = ip.split(':')
                telnetlib.Telnet(parts[0], port=parts[1], timeout=3).close()
                # A success resets the consecutive-failure counter.
                i = 1
                self.queue_ip.put(ip)
            except Exception as e:
                # Any failure (HTTP error, malformed reply, dead proxy, ...).
                i += 1
                print('ERROR: -- {} -- {}'.format(e, pwd))
                if i > 100:
                    print('ERROR:连续一百次获取不到ip')
                    # os._exit ends the whole process, not only this worker
                    # thread; status 1 signals an abnormal exit.
                    os._exit(1)

    def add(self):
        """Move queued proxies into Redis forever, skipping existing keys.

        Each new proxy is stored as ``key = "host:port"`` with the value
        ``round(time.time()) + self.ex`` and a TTL of ``self.ex`` seconds.
        """
        while True:
            # Blocks until a worker has queued a proxy.
            ip = self.queue_ip.get()
            # EXISTS checks a single key instead of listing the whole
            # keyspace for every queued proxy.
            if not self.re_pool.exists(ip):
                time.sleep(5 / len(self.IpUrls))
                print('添加ip: {}'.format(ip))
                self.re_pool.set(ip, round(time.time()) + self.ex, ex=self.ex)
            else:
                print('ip重复: {}'.format(ip))
            self.queue_ip.task_done()

    def run(self):
        """Connect to Redis, start the pull workers, then store proxies forever."""
        self.link()
        self.thread_PullIP()
        self.add()

    def all(self):
        """Return every key currently stored in the Redis database."""
        return self.re_pool.keys()


if __name__ == '__main__':
    IPS_().run()
按照上面的设计,这个py程序需要一直保持运行状态,持续向IP池补充IP。
IP存活检测:
import redis
import time
import sys
import os


class IPS_():
    """Consumer side of the proxy-IP pool: hands out the freshest proxy.

    Every Redis key is treated as a proxy address and its value as an
    integer timestamp; ``one`` returns the key with the largest value.
    """

    def __init__(self):
        # NOTE(review): Redis address and password are hard-coded in source --
        # consider loading them from configuration or the environment.
        self.host = '123.59.207.171'
        self.port = 6379
        # Redis returns bytes by default; decode_responses=True yields str.
        self.decode_responses = True
        # Expiry time in seconds (not read by this class itself).
        self.ex = 50
        self.password = 'amms..bridge'
        self.re_pool = self._connect()
        # Proxy returned by the previous call to one().
        self.lastip = ''
        self.lasttime = time.time()
        # Minimum number of seconds between two Redis look-ups in one().
        self.sleeptime_ = 2.5

    def _connect(self):
        """Build a pooled Redis client; used for both first connect and reconnect."""
        # NOTE(review): health_check_interval is passed to redis.Redis next to
        # an explicit connection_pool, exactly as the original constructor
        # did -- confirm against the redis-py docs whether it takes effect
        # there or must be given to ConnectionPool instead.
        return redis.Redis(
            connection_pool=redis.ConnectionPool(
                host=self.host, port=self.port,
                decode_responses=self.decode_responses,
                password=self.password),
            health_check_interval=30)

    @staticmethod
    def _freshest(keys, values):
        """Return the key with the largest integer value, or None if there is none.

        Entries whose value is missing (``None``) or not an integer are
        skipped. On ties the earliest key wins.
        """
        best_key = None
        best_stamp = None
        for key, value in zip(keys, values):
            try:
                stamp = int(value)
            except (TypeError, ValueError):
                # Key expired between KEYS and MGET, or it is not a proxy
                # entry at all.
                continue
            if best_stamp is None or stamp > best_stamp:
                best_key, best_stamp = key, stamp
        return best_key

    def one(self):
        """Return one proxy that differs from the previously returned one.

        Look-ups are spaced at least ``self.sleeptime_`` seconds apart. If
        the pool has no keys the process exits with status 1. Any other
        error is logged, followed by a 10 second pause and a reconnect.
        """
        while True:
            sleeptime = time.time() - self.lasttime
            if sleeptime < self.sleeptime_:
                time.sleep(self.sleeptime_ - sleeptime)
            self.lasttime = time.time()
            try:
                keys = self.re_pool.keys()
                if not keys:
                    print('ERROR: --- IP池为空,检查IP池')
                    os._exit(1)
                values = self.re_pool.mget(keys)
                # Compare timestamps numerically, not as strings.
                ip = self._freshest(keys, values)
                if ip is None or ip == self.lastip:
                    # Nothing usable or nothing new yet; poll again.
                    continue
                self.lastip = ip
                return ip
            except Exception as e:
                print('ERROR: ---- 延时10秒,重新链接,检查网络 -- 报错信息 - {}'.format(e))
                time.sleep(10)
                self.re_pool = self._connect()

    def all(self):
        """Return every key currently stored in the Redis database."""
        return self.re_pool.keys()

    def test(self):
        """Command-line smoke test: print proxies from ``one`` forever."""
        while True:
            print(self.one())


if __name__ == '__main__':
    IPS_().test()
IP应用:
import sys

# Directory that contains the IP-source / IP-management module (redis_cli.py).
# Raw string: the backslashes are literal path separators, not escapes.
sys.path.append(r"D:\Work\IPS")

from redis_cli import IPS_

ips = IPS_()


def getpro():
    """Return the next proxy handed out by the shared ``IPS_`` client."""
    return ips.one()


if __name__ == '__main__':
    while True:
        print(getpro())
额外加IP程序内的管理:
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Lock, Thread

# Raw string: the backslashes are literal path separators, not escapes.
sys.path.append(r"D:\JR\jr\ZKGIT\IPS")

from redis_cli import IPS_

ips = IPS_()


class First(object):
    """In-process proxy management: one producer thread feeds a queue that
    worker threads take proxies from."""

    def __init__(self):
        self.lock1 = Lock()
        # Proxies fetched by put_ip(), waiting to be consumed by get_ip().
        self.ip_queue = Queue()

    def put_ip(self):
        """Fetch proxies from ``ips`` forever and queue each new one.

        A proxy equal to the one queued immediately before it is skipped.
        """
        old_ip = ''
        while True:
            ip = ips.one()
            # Skip the proxy if it is the same as the last one queued.
            if old_ip == ip:
                continue
            self.ip_queue.put(ip)
            # Remember it, otherwise the check above can never match.
            old_ip = ip

    def get_ip(self):
        """Block until a proxy is available and return it.

        The lock serialises competing consumer threads; the ``with`` block
        guarantees it is released even if the wait is interrupted.
        """
        with self.lock1:
            return self.ip_queue.get()

    def function(self, threadpool):
        """Take one proxy from the queue and print it.

        :param threadpool: executor supplied by the caller; not used here.
        """
        ip = self.get_ip()
        print(ip)


if __name__ == '__main__':
    first = First()
    threadpool = ThreadPoolExecutor(max_workers=10)
    # Daemon thread: the producer must not keep the process alive.
    t = Thread(target=first.put_ip, args=(), daemon=True)
    t.start()
    first.function(threadpool)
结束。