• 《爬虫》爬取可用的免费IP


    import telnetlib
    import urllib.request
    from bs4 import BeautifulSoup
    
    # Scrape the first two listing pages, keep proxies whose advertised speed is
    # below 0.2 s, confirm each one accepts a TCP connection, and append the
    # working ones to proxy_list.txt (one "http://ip:port" per line).
    for d in range(1, 3):  # scrape pages 1 to 2
        scrapeUrl = 'http://www.xicidaili.com/nn/%d/' % d
        req = urllib.request.Request(scrapeUrl)
        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(req) as response:
            html = response.read()

        bsObj = BeautifulSoup(html, "html.parser")
        # Query the cells once per page instead of once per field per row.
        cells = bsObj.select('td')

        # The indexing below treats every 10 consecutive <td> cells as one proxy
        # row (ip at offset 1, port at 2, speed at 6) -- NOTE(review): confirm
        # against the live page layout. Only complete rows are visited, so a
        # short page no longer raises IndexError.
        for i in range(len(cells) // 10):
            base = i * 10
            try:
                speed = float(cells[base + 6].div.get('title').replace('秒', ''))
            except (AttributeError, ValueError):
                # Speed cell has no <div>/title or is not numeric: skip the row.
                continue
            if speed >= 0.2:  # only keep proxies advertised as faster than 0.2 s
                continue

            ip = cells[base + 1].get_text().strip()
            port = cells[base + 2].get_text().strip()
            ip_address = 'http://' + ip + ':' + port
            try:
                # A successful connect is the whole check; release the socket at once.
                telnetlib.Telnet(ip, port=port, timeout=2).close()
            except (OSError, ValueError):
                # OSError covers refused/timeout/DNS failures; ValueError covers a
                # malformed host or port string.
                print('fail')
            else:
                print('sucess:' + ip_address)
                with open('proxy_list.txt', 'a') as f:
                    f.write(ip_address + '\n')
    

     

    版本二:

    import json
    import socket
    import telnetlib
    import threading
    import time
    
    
    class TestProxy(object):
        """Filter a dump of proxies down to those that accept a TCP connection.

        The source file is named after today's date (``YYYYMMDD.txt``) and is read
        as one JSON object per line with ``protocol``, ``ip`` and ``port`` keys.
        Every proxy that accepts a connection is written to ``alive.txt`` as
        ``protocol://ip:port``, one per line.

        Constructing an instance runs the whole check immediately.
        """

        def __init__(self):
            today = time.strftime('%Y%m%d', time.localtime())
            self.filename = today + '.txt'
            self.sFile = self.filename  # source file: one JSON proxy record per line
            self.dFile = r'alive.txt'  # destination file for reachable proxies
            self.URL = r'http://www.baidu.com'  # not read by any method of this class
            self.threads = 10  # number of proxies probed concurrently
            self.timeout = 3  # per-connection timeout in seconds
            self.aliveList = []

            self.run()

        def run(self):
            """Probe every proxy listed in ``self.sFile`` and write the live ones to ``self.dFile``.

            Proxies are probed in batches of ``self.threads`` worker threads. Each
            batch is joined before the next starts, so the result file is written
            only after every probe has finished. Each record is probed exactly once;
            blank lines are ignored and an empty file yields an empty result file.

            Raises:
                OSError: if the source file cannot be read or the destination
                    file cannot be written.
            """
            with open(self.sFile, 'r', encoding='utf-8') as f:
                lines = [line for line in f if line.strip()]

            batch_size = max(1, self.threads)
            for start in range(0, len(lines), batch_size):
                workers = [
                    threading.Thread(target=self.linkWithProxy, args=(line,))
                    for line in lines[start:start + batch_size]
                ]
                for worker in workers:
                    worker.start()
                # Wait for the batch so results are complete before writing and
                # so at most ``batch_size`` connections are open at a time.
                for worker in workers:
                    worker.join()

            with open(self.dFile, 'w', encoding='utf-8') as f:
                f.writelines(server + '\n' for server in self.aliveList)

        def linkWithProxy(self, line):
            """Try to connect to one proxy and record it in ``self.aliveList`` on success.

            Args:
                line: a JSON string (or an already-decoded dict) with
                    ``protocol``, ``ip`` and ``port`` entries. ``port`` may be a
                    string or an integer.

            Returns:
                None. Unreachable or malformed entries are reported on stdout and
                skipped.
            """
            try:
                if isinstance(line, (str, bytes, bytearray)):
                    line = json.loads(line)
                protocol = line['protocol'].lower()
                ip = line['ip']
                port = line['port']
            except (ValueError, KeyError, TypeError, AttributeError):
                # Not valid JSON, or not a record with the expected fields.
                print('skip malformed line: %r' % (line,))
                return

            server = '%s://%s:%s' % (protocol, ip, port)
            print(server)
            try:
                # A plain TCP connect is the whole liveness check; close immediately.
                socket.create_connection((ip, int(port)), timeout=self.timeout).close()
            except (OSError, ValueError, TypeError):
                # OSError: refused / timed out / unresolvable; the others: bad port value.
                print('%s 链接失败' % server)
                return

            print('%s 链接成功!' % server)
            self.aliveList.append(server)
            print(self.aliveList)
    
    # Script entry point: building a TestProxy instance runs the whole check,
    # because its constructor calls run().
    if __name__ == "__main__":
        TP = TestProxy()
    

      

     

  • 相关阅读:
    Solr服务在Linux上的搭建详细教程
    Linux服务器上安装JDK小白教程
    request和response中文乱码问题后台处理办法
    Redis的五种数据类型及方法
    Java类装载器ClassLoader
    Git快速入门和常用命令
    redis在Linux上的部署和jedis简单使用
    Linux常用基础命令
    JDK1.7中HashMap底层实现原理
    微信电脑版无法显示图片无法下载文件
  • 原文地址:https://www.cnblogs.com/shuimohei/p/12660070.html
Copyright © 2020-2023  润新知