• 多进程的妙用


     1 #coding:utf-8
     2 import time
     3 import threading
     4 from html_downLoader import HtmlDownLoader
     5 import ParseAlexa
     6 import multiprocessing
     7 from MongoQueue import MongoQueue
     8 import sys
     # Python 2 only: make UTF-8 the process-wide default codec for implicit
     # str <-> unicode conversions. reload(sys) is needed because the
     # interpreter removes sys.setdefaultencoding during start-up.
     if sys.getdefaultencoding() != "utf-8":
         reload(sys)
         sys.setdefaultencoding("utf-8")

     # Seconds passed to time.sleep() by the crawler's thread-spawning loop.
     SLEEP_TIME = 1
     # Upper bound on concurrently running worker threads per process.
     max_threads = 5
     # Maps each crawled URL to the HTML downloaded for it.
     result = {}

     # The callback's return value serves as the work queue: the crawler calls
     # pop(), complete() and __nonzero__() on it.
     # NOTE(review): this runs at import time, so every importer of the module
     # triggers it -- confirm that is intended.
     alexaCallback = ParseAlexa.AlexaCallback()
     crawl_queue = alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
    def threaded_crawler():
        """Crawl every URL in the module-level ``crawl_queue`` with a thread pool.

        Keeps up to ``max_threads`` daemon threads running while the queue
        reports pending work. Each worker pops a URL, downloads it with one
        shared ``HtmlDownLoader`` instance and stores the HTML in the
        module-level ``result`` dict, keyed by URL. Once the queue is drained
        and every worker has exited, ``result`` is printed followed by blank
        lines.

        Takes no arguments and returns None.
        """
        threads = []
        downloader = HtmlDownLoader()

        def process_queue():
            """Worker loop: pop and download URLs until the queue raises."""
            while True:
                try:
                    url = crawl_queue.pop()
                    # NOTE(review): the URL is marked complete *before* it is
                    # downloaded, so a failed download is presumably never
                    # retried -- confirm against MongoQueue whether complete()
                    # should move after the download.
                    crawl_queue.complete(url)
                except Exception as e:
                    # Any queue error ends this worker; presumably pop() raises
                    # once no URLs are left -- verify against MongoQueue.
                    print(e)
                    break
                else:
                    print("正在爬取%s" % url)
                    html = downloader.downLoad(url)
                    result[url] = html

        # __nonzero__() is called explicitly rather than through truthiness
        # because its return type is not visible here -- presumably it is
        # truthy while URLs remain.
        while threads or crawl_queue.__nonzero__():
            spawned = False
            while len(threads) < max_threads and crawl_queue.__nonzero__():
                thread = threading.Thread(target=process_queue)
                # Daemon threads do not keep the process alive on exit.
                thread.daemon = True
                thread.start()
                threads.append(thread)
                spawned = True
                # Stagger worker start-up.
                time.sleep(SLEEP_TIME)
            # Build a new list rather than calling remove() while iterating,
            # which would skip the element following each removed thread.
            threads = [t for t in threads if t.is_alive()]
            if threads and not spawned:
                # Nothing to start this round: wait for the running workers
                # instead of spinning at full CPU.
                time.sleep(SLEEP_TIME)
        # Single-argument print keeps the "<result> <newlines>" output of the
        # two-item print statement.
        print("%s %s" % (result, "\n" * 5))
    45 
    def process_crawler():
        """Run ``threaded_crawler`` in one daemon process per CPU and wait for all.

        Prints how many processes are being started, launches them one after
        another, then blocks until every one of them has finished.

        Takes no arguments and returns None.
        """
        core_count = multiprocessing.cpu_count()
        print("Starting {} process".format(core_count))

        def _launch():
            # Create, mark as daemon and start a single crawler process.
            worker = multiprocessing.Process(target=threaded_crawler)
            worker.daemon = True
            worker.start()
            return worker

        # Start every worker before joining any, so they run concurrently.
        workers = [_launch() for _ in range(core_count)]
        for worker in workers:
            worker.join()
    if __name__ == "__main__":
        # Script entry point. The guard keeps the crawl from starting when the
        # module is merely imported, which matters on platforms where
        # multiprocessing re-imports the main module in each child process.
        process_crawler()
  • 相关阅读:
    《洛谷P2296 寻找道路》
    《浙江科技学院第17届大学生程序设计竞赛:D:合并序列》
    《数论整理二》
    《洛谷P1282 多米诺骨牌》
    《洛谷P2140 小Z的电力管制》
    《洛谷P2798 爆弹虐场》
    Linux下运行C语言程序
    计算圆柱的底面积和体积
    将摄氏温度转化为华氏温度
    如果今天是星期二,那么100天后是星期几?
  • 原文地址:https://www.cnblogs.com/zhongshuiping/p/9815102.html
Copyright © 2020-2023  润新知