#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-08-30 20:38:23
# @Author  : EnderZhou (zptxwd@gmail.com)
# @Link    : http://www.cnblogs.com/enderzhou/
# @Version : $Id$

import requests
from bs4 import BeautifulSoup as bs

# Scraping a table this way is hard to reuse: every site needs its own parsing
# tweaks, and the headers (the Cookie in particular) must be refreshed before
# each run. A more broadly applicable version will follow.

url = 'http://www.kuaidaili.com/free/inha/'

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
    'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0.1',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'yd_cookie=a0d0f393-2812-44d0b1453fbf740f3ce870820ada37151e8c; _ydclearance=dd0b3de069ce8a768712e248-d97e-4bd9-8284-f2ef598da35b-1504104455; channelid=0; sid=1504099004948599; _ga=GA1.2.742898386.1504074603; _gid=GA1.2.583101265.1504074603; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1504074603,1504097260; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1504099719',
    'Host': 'www.kuaidaili.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
}


def proxy_check(types, ip, port):
    """Fetch an IP-echo page through the candidate proxy; print the exit IP on success."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
    url = 'http://www.whatismyip.com.tw/'
    proxy = {types.lower(): '%s:%s' % (ip, port)}
    print(proxy)
    try:
        # timeout keeps a dead proxy from stalling the whole run
        r = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        soup = bs(r.content, 'html.parser')
        check_ip = soup.find_all(name='b')  # the page shows the caller's IP in a <b> tag
        print(check_ip[0].string + ':' + port)
    except Exception:
        # Unreachable or dead proxies are simply skipped.
        pass


# proxy_check('http', '183.62.11.242', '8088')  # uncomment to test the proxy checker alone


def main():
    r = requests.get(url=url, headers=headers)
    soup = bs(r.content, 'html.parser')
    # Each proxy is one <tr>; columns: 0 = IP, 1 = port, 3 = type (HTTP/HTTPS).
    for tr in soup.find_all(name='tr'):
        td = tr.find_all(name='td')
        if len(td) != 0:
            ip = str(td[0].string)
            port = str(td[1].string)
            types = str(td[3].string)
            proxy_check(types, ip, port)


if __name__ == '__main__':
    main()
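

# --- Sketch toward the "more broadly applicable version" promised above ---
# Parsing whatismyip.com.tw's HTML breaks whenever that page's markup changes.
# One less fragile option (untested here, offered as an assumption, not the
# author's method) is a JSON IP-echo endpoint: httpbin.org/ip returns
# {"origin": "<ip>"}, so no HTML parsing is needed. The name proxy_check_json
# is hypothetical; it reuses the requests import at the top of this file.
def proxy_check_json(types, ip, port, timeout=10):
    """Return True if the proxy answers; print the exit IP it exposes."""
    proxy = {types.lower(): '%s:%s' % (ip, port)}
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout)
        exit_ip = r.json().get('origin', '')
        print(exit_ip + ':' + port)
        return exit_ip != ''
    except Exception:
        return False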