设置IP代理池及IP变换方案
方案一:
使用国内免费的IP代理
http://www.xicidaili.com
# Create a "tools" folder and add a new .py file in it. This script scrapes
# free proxy IP/port pairs into MySQL and hands out a working one on demand.
from scrapy.selector import Selector
import MySQLdb
import requests

conn = MySQLdb.connect(host="192.168.1.1", user="root", passwd="123456", db="databasename", charset="utf8")
cursor = conn.cursor()

# Seconds to wait for any HTTP request; without a timeout a dead proxy or an
# unresponsive listing page would block the caller indefinitely.
REQUEST_TIMEOUT = 10


def _parse_proxy_rows(selector):
    """Return a list of (ip, port, proxy_type, speed) tuples found on a page.

    Rows that lack a speed title, have fewer than six text cells, or carry a
    speed that cannot be parsed as a float are skipped instead of raising.
    """
    rows = []
    # Skip the first <tr> (presumably the table header -- verify against the page).
    for tr in selector.css("#ip_list tr")[1:]:
        titles = tr.css(".bar::attr(title)").extract()
        all_texts = tr.css("td::text").extract()
        if not titles or not titles[0] or len(all_texts) < 6:
            continue
        try:
            # The title holds the number followed by "秒" (seconds).
            speed = float(titles[0].split("秒")[0])
        except ValueError:
            continue
        rows.append((all_texts[0], all_texts[1], all_texts[5], speed))
    return rows


def crawl_ips(pages=1568):
    """Scrape the xicidaili proxy listing and store every proxy in `proxy_ip`.

    Args:
        pages: Number of listing pages to fetch; page indexes run 0..pages-1.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"}
    # NOTE(review): the scraped proxy_type is not stored; 'HTTP' is always
    # written, matching the original statement -- confirm this is intended.
    insert_sql = "insert into proxy_ip(ip, port, speed, proxy_type) VALUES(%s, %s, %s, 'HTTP')"
    # NOTE(review): numbering starts at 0 as in the original -- confirm the
    # site does not expect pages to start at 1.
    for i in range(pages):
        response = requests.get(
            "http://www.xicidaili.com/nn/{0}".format(i),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        selector = Selector(text=response.text)

        for ip, port, _proxy_type, speed in _parse_proxy_rows(selector):
            # Parameterized query: scraped text must never be spliced into SQL.
            cursor.execute(insert_sql, (ip, port, speed))
            conn.commit()


class GetIP(object):
    """Pick validated proxies out of the `proxy_ip` table."""

    def delete_ip(self, ip):
        """Delete every row whose ip equals `ip`; always returns True."""
        cursor.execute("delete from proxy_ip where ip=%s", (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        """Return True if a request through the proxy gets a 2xx response.

        A proxy that raises or answers with a non-2xx status is deleted from
        the table and False is returned.
        """
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict, timeout=REQUEST_TIMEOUT)
        except Exception:
            # Deliberately broad: any failure simply means the proxy is unusable.
            print("invalid ip and port")
            self.delete_ip(ip)
            return False

        if 200 <= response.status_code < 300:
            print("effective ip")
            return True

        print("invalid ip and port")
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return "http://ip:port" for a random working proxy, or None.

        Dead proxies are removed by `judge_ip`, so the loop ends either with
        a working proxy or when the table is empty. A loop replaces the
        original recursion so a long run of dead proxies cannot exhaust the
        recursion limit.
        """
        random_sql = """
              SELECT ip, port FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
            """
        while True:
            cursor.execute(random_sql)
            row = cursor.fetchone()
            if row is None:
                return None
            ip, port = row[0], row[1]
            if self.judge_ip(ip, port):
                return "http://{0}:{1}".format(ip, port)


if __name__ == "__main__":
    get_ip = GetIP()
    get_ip.get_random_ip()
# Update the settings configuration to enable the middlewares.
# NOTE(review): both entries use the same order value (1), so their relative
# order is not pinned by this setting -- confirm which one should run first.
# NOTE(review): RandomProxyMiddleware is not defined in this snippet; verify
# that it exists in ArticleSpider/middlewares.py.
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSPageMiddleware': 1,
    'ArticleSpider.middlewares.RandomProxyMiddleware': 1,
}
方案二:
改造github开源项目成为适合自己的proxies代理工具
https://github.com/aivarsk/scrapy-proxies
方案三:
官方提供github开源项目,收费版本,但相对稳定
https://github.com/scrapy-plugins/scrapy-crawlera
方案四:
使用Tor(洋葱路由)网络,匿名隐藏自己的真实IP地址