• A multithreaded web crawler based on Python's urllib2 module


from Queue import Queue
from threading import Thread, Lock
from gzip import GzipFile
from StringIO import StringIO
import urllib2
import zlib
import time
import socket

class ContentEncodingProcessor(urllib2.BaseHandler):
  """A handler to add gzip/deflate capabilities to urllib2 requests."""

  # advertise compression support on every outgoing request
  def http_request(self, req):
    req.add_header("Accept-Encoding", "gzip, deflate")
    return req

  # transparently decode the response body
  def http_response(self, req, resp):
    old_resp = resp
    # gzip
    if resp.headers.get("content-encoding") == "gzip":
        gz = GzipFile(
                    fileobj=StringIO(resp.read()),
                    mode="r"
                  )
        # addinfourl wraps a file-like object and adds info() and geturl()
        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
    # deflate
    if resp.headers.get("content-encoding") == "deflate":
        gz = StringIO( deflate(resp.read()) )
        resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
    return resp

# deflate support
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
  try:               # so on top of all there's this workaround:
    return zlib.decompress(data, -zlib.MAX_WBITS)
  except zlib.error:
    return zlib.decompress(data)

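# Usage sketch (illustrative, separate from the crawler below): build_opener
# accepts handler classes as well as instances, so the processor can be
# smoke-tested against a single URL first, e.g.:
#
#   gzip_opener = urllib2.build_opener(ContentEncodingProcessor)
#   html = gzip_opener.open('http://www.example.com/').read()
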
# set a global socket timeout so a stalled request cannot hang a worker thread
socket.setdefaulttimeout(10)

encoding_support = ContentEncodingProcessor
opener = urllib2.build_opener( encoding_support, urllib2.HTTPHandler )
# install the opener globally so that urllib2.urlopen() in the worker
# threads below also goes through the gzip/deflate handler
urllib2.install_opener(opener)

class Fetcher:
    def __init__(self, threads):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = Lock()   # thread lock
        self.q_req = Queue() # request queue
        self.q_ans = Queue() # answer (completed) queue
        self.threads = threads
        self.running = 0     # initialise before the workers start touching it
        for i in range(threads):
            t = Thread(target=self.threadget)
            t.setDaemon(True)
            t.start()

    def __del__(self): # wait for both queues to drain before tearing down
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    def taskleft(self):
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        self.q_req.put(req)

    def pop(self):
        ans = self.q_ans.get()
        self.q_ans.task_done() # otherwise q_ans.join() in __del__ blocks forever
        return ans

    def threadget(self):
        while True:
            ans = ''
            req = self.q_req.get()
            with self.lock: # the counter update must be atomic: critical section
                self.running += 1
            try:
                # urlopen() goes through the globally installed gzip-aware opener
                content = urllib2.urlopen(req).read()
                ans = str(content)
            except Exception, what:
                print what
            self.q_ans.put((ans, req))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            time.sleep(0.01) # don't spam

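# Usage pattern (exercised in __main__ below): push() enqueues URLs, the
# worker threads fetch them, pop() blocks until a (page, url) pair is ready,
# and taskleft() is polled until both queues and the in-flight counter drain.
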
if __name__ == "__main__":
    a = [0] * 3600000
    links = [ 'http://www.songtaste.com/song/%d/' % i for i in range(1, 3600000) ]
    f = Fetcher(threads=50)
    for url in links:
        f.push(url)
    while f.taskleft():
        the_page, x = f.pop()
        try:
            npos = the_page.index('chart#fav')
        except ValueError: # anchor not found (or the fetch failed)
            pass
        else:
            # scan backwards from the anchor to the nearest comma
            k = 0
            for j in range(npos, 1, -1):
                if the_page[j] == ',':
                    k = j
                    break
            # accumulate the digits sitting just before that comma
            total = 0
            t = 1
            for j in range(k - 1, 1, -1):
                if '0' <= the_page[j] <= '9':
                    total += int(the_page[j]) * t
                    t *= 10
                else:
                    break
            p = int(x[30:-1]) # the numeric song id embedded in the URL
            if p % 10000 <= 5:
                a[p] = total
            if total != 0:
                print p
                print total
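
The backward digit scan above can be written more compactly with a regular expression. The sketch below rests on the same assumption the loop makes, namely that the page embeds the favourite count as digits just before the comma that precedes the 'chart#fav' anchor; parse_fav_count is a hypothetical helper, not part of the original script:

    import re

    # digits, then a comma, then a comma-free run up to the anchor
    # (assumed page layout, mirroring the manual backward scan above)
    FAV_RE = re.compile(r"(\d+),[^,]*chart#fav")

    def parse_fav_count(page):
        m = FAV_RE.search(page)
        return int(m.group(1)) if m else 0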
    137     
• Original post: https://www.cnblogs.com/zyue/p/3833152.html