• 多进程的妙用


     1 #coding:utf-8
     2 import time
     3 import threading
     4 from html_downLoader import HtmlDownLoader
     5 import ParseAlexa
     6 import multiprocessing
     7 from MongoQueue import MongoQueue
     8 import sys
     9 if sys.getdefaultencoding()!="utf-8":
    10     reload(sys)
    11     sys.setdefaultencoding("utf-8")
    12 SLEEP_TIME=1
    13 alexaCallback=ParseAlexa.AlexaCallback()
    14 crawl_queue=alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
    15 max_threads=5
    16 result={}
    17 def threaded_crawler():
    18     threads=[]
    19     #crawl_queue=alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
    20     dlownloader=HtmlDownLoader()
    21     def process_queue():
    22         while True:
    23             try:
    24                 url=crawl_queue.pop()
    25                 crawl_queue.complete(url)
    26             except Exception,e:
    27                 print e.message
    28                 break
    29             else:
    30                 print "正在爬取%s"%url
    31                 html=dlownloader.downLoad(url)
    32                 result[url]=html
    33 
    34     while threads or crawl_queue.__nonzero__():
    35         while len(threads)<max_threads and crawl_queue.__nonzero__():
    36             thread=threading.Thread(target=process_queue)
    37             thread.setDaemon(True)
    38             thread.start()
    39             threads.append(thread)
    40             time.sleep(SLEEP_TIME)
    41         for thread in threads:
    42             if not thread.is_alive():
    43                 threads.remove(thread)
    44     print result,'
    
    
    
    
    '
    45 
    46 def process_crawler():
    47     num_cpus=multiprocessing.cpu_count()
    48     print "Starting {} process".format(num_cpus)
    49     process=[]
    50     for i in range(num_cpus):
    51         p=multiprocessing.Process(target=threaded_crawler)
    52         p.daemon=True
    53         p.start()
    54         # p.join()
    55         process.append(p)
    56     for p in process:
    57         p.join()
    58     # print result
    59 if __name__ == '__main__':
    60     #alexaCallback=ParseAlexa.AlexaCallback()
    61     #threaded_crawler(alexaCallback)
    62     process_crawler()
    63     # print result
  • 相关阅读:
    Redis 字符串(String)
    Redis 哈希(Hash)
    Redis 键(key)
    Redis 命令
    Redis 数据类型
    Redis 配置
    Log4j 2X 日志文件路径问题
    shiro项目从 log4j1X 迁移到 log4j2X
    shiro+SpringMVC 项目 配置404页面
    邮件发送 -> http://service.mail.qq.com/cgi-bin/help?subtype=1&&id=28&&no=1001256
  • 原文地址:https://www.cnblogs.com/zhongshuiping/p/9815102.html
Copyright © 2020-2023  润新知