• Python实例 -- 爬虫


     1 #coding="utf-8"
     2 
     3 import urllib2
     4 import re
     5 import threading
     6 import time
     7 
     8 """
     9 抓取代理发布页的ip和port
    10 http://www.xici.net.co/nn/%d
    11 """
    12 
    13 proxylist = []
    14 
    15 
    16 def get_proxy_from_cnproxy():
    17     global proxylist
    18     
    19     p = re.compile(r'<td><img alt="(.+?)" src=".+?" /></td>[sS]*?<td>(.+?)</td>[sS]*?<td>(.+?)</td>[sS]*?<td>[sS]*?<a href=".+?">.+?</a>[sS]*?</td>[sS]*?<td>.+?</td>[sS]*?<td>(.+?)</td>')
    20     
    21     for i in range(1,2):
    22         target = r"http://www.xici.net.co/nn/%d" %i
    23         print target
    24         req = urllib2.urlopen(target)
    25         result = req.read()
    26         matchs = p.findall(result)
    27         for record in matchs:
    28             addr = record[0]
    29             ip = record[1]
    30             port = record[2]
    31             protocol = record[3]
    32             l = [ip, port, protocol, addr]
    33             #print l
    34             proxylist.append(l)    
    35         print proxylist
    36 
    37 
    38 class ProxyCheck(threading.Thread):
    39     def __init__(self, proxylist, fname):
    40         threading.Thread.__init__(self)
    41         self.proxylist = proxylist
    42         self.timeout = 5
    43         self.test_url = "http://www.baidu.com/"
    44         self.test_str = "030173"
    45         self.checkedPProxyList = []
    46         self.fname = fname
    47         
    48     def checkProxy(self):
    49         cookies = urllib2.HTTPCookieProcessor()
    50         for proxy in self.proxylist:
    51             proxy_handler = urllib2.ProxyHandler({"http":r'http://%s:%s'%(proxy[0],proxy[1])})
    52             opener = urllib2.build_opener(cookies, proxy_handler)
    53             opener.addheaders = [('user-agent', 'mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3')]
    54             urllib2.install_opener(opener)
    55             t1 = time.time()
    56             try:
    57                 req = urllib2.urlopen(self.test_url, timeout = self.timeout)
    58                 result = req.read()
    59                 timeused = time.time() - t1
    60                 pos = result.find(self.test_str)
    61                 if pos > 1:
    62                     self.checkedPProxyList.append([proxy[0],proxy[1],proxy[2],proxy[3],timeused])
    63                 else:
    64                     continue;
    65             except Exception,e:
    66                 print e.message
    67                 continue;
    68             
    69     def sort(self):
    70         sorted(self.checkedPProxyList,cmp=lambda x,y:cmp(x[4],y[4]))
    71     
    72     def save(self):
    73         f = open(self.fname, 'w+')
    74         for proxy in self.checkedPProxyList:
    75             f.write("%s:%s	%s	%s	%s
    "%(proxy[0],proxy[1],proxy[2],proxy[3],str(proxy[4])))
    76         f.close()
    77         
    78     def run(self):
    79         self.checkProxy()
    80         self.sort()
    81         self.save()
    82         
    83 
    84 if __name__ == "__main__":
    85     get_proxy_from_cnproxy()
    86     t1 = ProxyCheck(proxylist,"test.txt")
    87     t1.start()
  • 相关阅读:
    Java遍历Map键、值。获取Map大小的方法
    Oracle CASE WHEN 用法介绍
    JS动态改变select选择变更option的index值
    js对select动态添加和删除OPTION
    在js中使用createElement创建HTML对象和元素
    清空select标签中option选项的3种不同方式
    json-lib包笔记
    异常:javax.el.PropertyNotFoundException: Property 'id' not found on ..........
    golang struct的使用
    golang多维数组的切片
  • 原文地址:https://www.cnblogs.com/luzhiyuan/p/3889192.html
Copyright © 2020-2023  润新知