# coding: utf-8
"""Multiprocess, multithreaded crawler.

Each process runs up to ``max_threads`` threads, all pulling URLs from a
shared MongoDB-backed queue (MongoQueue) seeded with the Alexa top-1M list.
"""
import multiprocessing
import sys
import threading
import time

import ParseAlexa
from html_downLoader import HtmlDownLoader
from MongoQueue import MongoQueue

# Python 2 hack so non-ASCII page content can be handled without
# explicit decoding everywhere.
if sys.getdefaultencoding() != "utf-8":
    reload(sys)
    sys.setdefaultencoding("utf-8")

SLEEP_TIME = 1
max_threads = 5

# Seed the shared queue with URLs parsed from the Alexa top-1M CSV.
alexaCallback = ParseAlexa.AlexaCallback()
crawl_queue = alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")

# Note: each process gets its own copy of this dict; results are shared
# across the threads within one process, not across processes.
result = {}


def threaded_crawler():
    threads = []
    downloader = HtmlDownLoader()

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except Exception as e:
                # Queue is empty (or Mongo failed): let this thread exit.
                print e
                break
            else:
                print "Crawling %s" % url
                html = downloader.downLoad(url)
                result[url] = html
                # Mark the URL done only after it has been downloaded.
                crawl_queue.complete(url)

    # Keep spawning threads until the queue is drained and all threads exit.
    while threads or crawl_queue:
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
        # Reap finished threads (rebuild the list rather than removing
        # items while iterating, which skips elements).
        threads = [thread for thread in threads if thread.is_alive()]
    print result


def process_crawler():
    num_cpus = multiprocessing.cpu_count()
    print "Starting {} processes".format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler)
        p.daemon = True
        p.start()
        processes.append(p)
    # Wait for every process to drain the queue before exiting.
    for p in processes:
        p.join()


if __name__ == '__main__':
    process_crawler()
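# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the MongoQueue interface this script
# relies on. This is an assumption inferred from the calls above (pop(),
# complete(), truthiness) and the usual MongoDB-backed queue pattern; the
# real implementation lives in MongoQueue.py and may differ. Left commented
# out so importing this module does not shadow the real class.
#
# from datetime import datetime
# from pymongo import MongoClient
#
# class MongoQueue:
#     OUTSTANDING, PROCESSING, COMPLETE = range(3)  # job states
#
#     def __init__(self, client=None):
#         self.db = (client or MongoClient()).cache
#
#     def __nonzero__(self):
#         # Truthy while any job is not yet complete, so
#         # `while crawl_queue:` keeps the crawler loops alive.
#         record = self.db.crawl_queue.find_one(
#             {'status': {'$ne': self.COMPLETE}})
#         return True if record else False
#
#     def pop(self):
#         # Atomically claim an outstanding URL so that threads in
#         # different processes never download the same page twice;
#         # raises KeyError when nothing is outstanding.
#         record = self.db.crawl_queue.find_and_modify(
#             query={'status': self.OUTSTANDING},
#             update={'$set': {'status': self.PROCESSING,
#                              'timestamp': datetime.now()}})
#         if record:
#             return record['_id']
#         raise KeyError()
#
#     def complete(self, url):
#         # Mark a claimed URL as finished.
#         self.db.crawl_queue.update(
#             {'_id': url}, {'$set': {'status': self.COMPLETE}})
# ---------------------------------------------------------------------------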