• Comparing single-threaded, multithreaded, multiprocess, and coroutine crawling, using Sina military-history news as the example


    A demonstration of single-threaded, multithreaded, multiprocess, and coroutine crawling in Python.

    import requests, json, random
    import re, threading, time
    from lxml import etree

    lock = threading.Lock()
    semaphore = threading.Semaphore(100)   # allow at most 100 threads at a time

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    count = 0

    # the JSONP list-page URL; %s is the page number
    PAGE_URL = ('http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil'
                '&cat_1=lishi&show_all=0&show_cat=1&show_ext=1&tag=1&format=json'
                '&page=%s&show_num=10&callback=jQuery191012358189839869738_1495880348059'
                '&_=1495880348069')

    def sina(page_url):    # list page
        with semaphore:
            header = {'User-Agent': random.choice(user_agent_list)}
            header.update({"Host": "platform.sina.com.cn"})

            while True:
                content = ''
                try:
                    content = requests.get(page_url, headers=header, timeout=5).text
                except Exception as e:
                    print(e)
                if content != '':
                    break

            # the response is JSONP: strip the jQuery callback wrapper to get the JSON body
            jsona = re.findall(r'jQuery191012358189839869738_1495880348059\(([\s\S]*?"}]}})', content)[0]
            result = json.loads(jsona)
            for l in result['result']['data']:
                title = l['title']
                url = l['url']
                biaoqian = get_biaoqian(url)   # biaoqian: the article's keyword tags

                lock.acquire()
                global count
                count += 1
                print(time.strftime('%H:%M:%S', time.localtime(time.time())), '    ', count)
                print('list page:')
                print(' title: %s\n url: %s' % (title, url))
                print('detail page:')
                print(' biaoqian: %s\n' % biaoqian)
                print('**************************************************************')
                lock.release()

    def get_biaoqian(url):    # news page: scrape the keyword tags
        header = {'User-Agent': random.choice(user_agent_list)}
        header.update({"Host": "mil.news.sina.com.cn"})

        while True:
            content = ''
            try:
                content = requests.get(url, headers=header, timeout=10).text
            except Exception:
                pass   # ignore the error and retry
            if content != '':
                break

        se = etree.HTML(content)
        biaoqian = se.xpath('//p[@class="art_keywords"]/a/text()')
        return ' '.join(biaoqian)

    def single_req():
        for i in range(1, 301):
            sina(PAGE_URL % i)
        print('over')

    def threading_req():
        threads = []
        for i in range(1, 301):
            t = threading.Thread(target=sina, args=(PAGE_URL % i,))
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
        print('over')

    def multiprocessing_req():
        import multiprocessing
        pool = multiprocessing.Pool(100)
        # pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(sina, [PAGE_URL % i for i in range(1, 301)])
        pool.close()
        pool.join()
        print('over')

    def gevent_req():
        ################### using a gevent pool ###################
        from gevent import monkey
        from gevent.pool import Pool

        monkey.patch_all()   # ideally this runs before requests is imported
        pool = Pool(100)
        pool.map(sina, [PAGE_URL % i for i in range(1, 301)])
        print('over')

    if __name__ == '__main__':
        single_req()                # single thread
        # threading_req()           # multithreading
        # multiprocessing_req()     # multiprocessing
        # gevent_req()              # coroutines
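
    As an aside, the hand-rolled thread list plus Semaphore above can also be expressed with the standard library's thread pool. A minimal sketch, reusing the sina function and PAGE_URL template defined above; the pool itself caps concurrency, so no Semaphore is needed:

    from concurrent.futures import ThreadPoolExecutor

    def thread_pool_req():
        # at most 100 worker threads run at once; map submits all 300 pages
        with ThreadPoolExecutor(max_workers=100) as executor:
            executor.map(sina, [PAGE_URL % i for i in range(1, 301)])
        # leaving the with-block waits for every submitted call to finish
        print('over')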


    This post implements the same crawler in four ways. Whether with 100 threads, 100 processes, or 100 coroutines, the network bandwidth is fully saturated and the crawl runs fast; a single thread leaves the bandwidth badly underused, so it crawls slowly.
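
    To put numbers on that comparison, each runner can be wrapped in a simple wall-clock timer. A minimal sketch, assuming the four runner functions defined above:

    import time

    def timed(fn):
        # run one of the four runners and report elapsed wall-clock time
        start = time.time()
        fn()
        print('%s took %.1f s' % (fn.__name__, time.time() - start))

    # e.g. compare timed(single_req) against timed(threading_req)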

    This reminds me of my interview at 房极客: the supervisor there told me he had read online that Python multithreading is "fake", so he had never used it and only used multiprocessing, because he believed threads could not speed up a crawler.

    On this point I am certain that Python multithreading does speed up crawling, because I have used it for a long time; that supervisor had only read half the story. Because of the GIL, Python threads gain little on CPU-bound work, but on IO-bound work the speedup is immediate, and against sites with long timeouts the advantage of multithreaded crawling is especially obvious. A crawl is IO-bound end to end: the crawler opens a page and sends a request, the server backend queries the database, and the database returns data to the backend and on to the frontend, so the crawler thread spends nearly all of that time waiting. Multithreading therefore works well for both crawling and performance testing. Multiprocessing, by contrast, has far too much overhead: launch 100 processes and Task Manager shows 100 python.exe instances, each holding about 20 MB of memory, with very high CPU usage while they start up. A crawler is a natural fit for multithreading, or alternatively for coroutines.
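
    The IO-bound point can be verified without any network at all. A minimal sketch where time.sleep stands in for waiting on a server (the 1-second delay is an illustrative assumption):

    import threading, time

    def fake_request():
        time.sleep(1)   # stand-in for network wait; the GIL is released while sleeping

    start = time.time()
    for _ in range(10):
        fake_request()
    print('sequential: %.1f s' % (time.time() - start))   # ~10 s

    start = time.time()
    threads = [threading.Thread(target=fake_request) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('threaded:   %.1f s' % (time.time() - start))   # ~1 s: the waits overlap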

    The run results:
