This post benchmarks crawling a single novel with multiprocessing, multithreading, coroutines, and the combined setups multiprocessing + multithreading and multiprocessing + coroutines.
Using the novel 大道争锋 from 笔趣阁 (m.52bqg.com) as the example, the sections below measure how each combination performs.
- Multithreading
The code is as follows:
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

multiprocessing + multithreading
multiprocessing + coroutines
"""

book_name_list = ['大道争锋']

####### start working
import os
import time
import urllib.parse as parse
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

save_path = 'D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url = 'https://m.52bqg.com'

try:
    os.mkdir(save_path)
except OSError:
    pass


### two helpers: one returns a chapter's text, one returns the chapter links on a listing page
def get_chapter_content(i):
    # return the full text of one chapter, following its "next page" links
    chapter_now = requests.get(target_url + i, headers=headers)
    chapter_now.encoding = 'gbk'
    chapter_now = etree.HTML(chapter_now.text)
    chapter_content = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num = 1
    # the Chinese literals below match the site's own markup and must stay as-is
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content = chapter_content.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        next_page_num = next_page_num + 1
        chapter_now = requests.get(target_url + i.replace('.html', '_' + str(next_page_num) + '.html'), headers=headers)
        chapter_now.encoding = 'gbk'
        chapter_now = etree.HTML(chapter_now.text)
        chapter_content_next = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next = chapter_content_next.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        chapter_content = chapter_content + chapter_content_next
    return chapter_content


def get_chapter_link(i):
    ########## fetch the chapter names and links on listing page i ##########
    global url_all, headers
    if i == 0:
        req_next = requests.get(url_all, headers=headers)
    else:
        req_next = requests.get(url_all + '/' + str(i + 1), headers=headers)
    req_next.encoding = 'gbk'
    html_next = etree.HTML(req_next.text)
    chapter_name_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name = chapter_name_next[1:]
    chapter_url = chapter_url_next[1:]
    return chapter_name, chapter_url


################################# crawl every chapter URL #################################
novel = []
for k in book_name_list:
    start = time.time()
    url = 'https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey=' + parse.quote(k, encoding="gbk") + '&t_btnsearch='
    req = requests.get(url, headers=headers)
    req.encoding = 'gbk'
    if 'book_' in req.url and 'search' not in req.url:
        # the search redirected straight to the book page, so crawl it directly
        url_all = req.url.replace('book', 'chapters')
    else:
        # the search returned a result list: use the exact match if there is one, otherwise the first hit
        html_search = etree.HTML(req.text)
        search_book = html_search.xpath('//div[@class="article"]/a/text()')
        search_book_url = html_search.xpath('//div[@class="article"]/a[1]/@href')
        if k in search_book:
            url_all = target_url + search_book_url[search_book.index(k)]
        else:
            url_all = target_url + search_book_url[0]
        url_all = url_all.replace('book', 'chapters')

    # work out how many listing pages of chapters the book has
    req_all = requests.get(url_all, headers=headers)
    req_all.encoding = 'gbk'
    html_all = etree.HTML(req_all.text)
    chapter_page_all = html_all.xpath('//table[@class="page-book"]//td/a/@href')
    chapter_page_all = int(chapter_page_all[1].split('/')[-1])

    # multithreaded crawl
    with ThreadPoolExecutor(250) as executor:
        # first collect the chapter links from every listing page, then fetch every chapter body
        chapter = list(executor.map(get_chapter_link, range(chapter_page_all)))
        chapter = list(zip(*chapter))
        chapter_url = sum(list(chapter[1]), [])
        chapter_name = sum(list(chapter[0]), [])
        chapter_all = list(executor.map(get_chapter_content, chapter_url))

    end = time.time()
    print("Time taken: " + str(int(end - start)) + ' s')

    for i in range(len(chapter_all)):
        chapter_all[i] = chapter_name[i] + ' ' + chapter_all[i]
    target = '\n'.join(chapter_all)
    with open(save_path + k + '.txt', 'a+', encoding='utf-8') as f:
        f.write(target)
    print(k + ' done')
```
Time taken: 70 s
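One detail the script relies on: `executor.map` returns results in the order of its input, not in completion order, which is why `chapter_name[i]` and `chapter_all[i]` can simply be paired back up by index afterwards. A tiny sketch of that ordering guarantee (the sleep times are arbitrary):

```python
import time
from concurrent.futures import ThreadPoolExecutor

def work(n):
    # later items finish first, but map still yields results in input order
    time.sleep(0.3 - 0.1 * n)
    return n

with ThreadPoolExecutor(3) as ex:
    print(list(ex.map(work, [0, 1, 2])))   # prints [0, 1, 2]
```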
- Coroutines
The code is as follows:
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian
"""

book_name_list = ['大道争锋']

####### start working
import gevent
from gevent import monkey, pool

pool = pool.Pool(200)
monkey.patch_all(thread=False)   # patch blocking I/O so greenlets can switch; do this before importing requests

import os
import time
import urllib.parse as parse

import requests
from lxml import etree

save_path = 'D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url = 'https://m.52bqg.com'

try:
    os.mkdir(save_path)
except OSError:
    pass


### two helpers: one returns a chapter's text, one returns the chapter links on a listing page
def get_chapter_content(i):
    # return the full text of one chapter, following its "next page" links
    chapter_now = requests.get(target_url + i, headers=headers)
    chapter_now.encoding = 'gbk'
    chapter_now = etree.HTML(chapter_now.text)
    chapter_content = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num = 1
    # the Chinese literals below match the site's own markup and must stay as-is
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content = chapter_content.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        next_page_num = next_page_num + 1
        chapter_now = requests.get(target_url + i.replace('.html', '_' + str(next_page_num) + '.html'), headers=headers)
        chapter_now.encoding = 'gbk'
        chapter_now = etree.HTML(chapter_now.text)
        chapter_content_next = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next = chapter_content_next.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        chapter_content = chapter_content + chapter_content_next
    return chapter_content


def get_chapter_link(i):
    ########## fetch the chapter names and links on listing page i ##########
    global url_all, headers
    if i == 0:
        req_next = requests.get(url_all, headers=headers)
    else:
        req_next = requests.get(url_all + '/' + str(i + 1), headers=headers)
    req_next.encoding = 'gbk'
    html_next = etree.HTML(req_next.text)
    chapter_name_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name = chapter_name_next[1:]
    chapter_url = chapter_url_next[1:]
    return chapter_name, chapter_url


################################# crawl every chapter URL #################################
novel = []
for k in book_name_list:
    start = time.time()
    url = 'https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey=' + parse.quote(k, encoding="gbk") + '&t_btnsearch='
    req = requests.get(url, headers=headers)
    req.encoding = 'gbk'
    if 'book_' in req.url and 'search' not in req.url:
        # the search redirected straight to the book page, so crawl it directly
        url_all = req.url.replace('book', 'chapters')
    else:
        # the search returned a result list: use the exact match if there is one, otherwise the first hit
        html_search = etree.HTML(req.text)
        search_book = html_search.xpath('//div[@class="article"]/a/text()')
        search_book_url = html_search.xpath('//div[@class="article"]/a[1]/@href')
        if k in search_book:
            url_all = target_url + search_book_url[search_book.index(k)]
        else:
            url_all = target_url + search_book_url[0]
        url_all = url_all.replace('book', 'chapters')

    # work out how many listing pages of chapters the book has
    req_all = requests.get(url_all, headers=headers)
    req_all.encoding = 'gbk'
    html_all = etree.HTML(req_all.text)
    chapter_page_all = html_all.xpath('//table[@class="page-book"]//td/a/@href')
    chapter_page_all = int(chapter_page_all[1].split('/')[-1])

    # coroutine crawl: one greenlet per listing page, then one per chapter
    g_list = [gevent.spawn(get_chapter_link, x) for x in range(chapter_page_all)]
    gevent.joinall(g_list)
    chapter = [g.value for g in g_list]
    chapter = list(zip(*chapter))
    chapter_url = sum(list(chapter[1]), [])
    chapter_name = sum(list(chapter[0]), [])

    g_list = [gevent.spawn(get_chapter_content, x) for x in chapter_url]
    gevent.joinall(g_list)
    chapter_all = [g.value for g in g_list]

    end = time.time()
    print("Time taken: " + str(int(end - start)) + ' s')

    for i in range(len(chapter_all)):
        chapter_all[i] = chapter_name[i] + ' ' + chapter_all[i]
    target = '\n'.join(chapter_all)
    with open(save_path + k + '.txt', 'a+', encoding='utf-8') as f:
        f.write(target)
    print(k + ' done')
```
Time taken: 103 s
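One quirk of the script above: it builds `pool.Pool(200)` but then spawns greenlets directly with `gevent.spawn`, so nothing actually limits how many requests are in flight. A minimal sketch of how the pool itself could cap concurrency (the `fetch` helper and the chapter paths are hypothetical; the headers and base URL mirror the script):

```python
from gevent import monkey, pool
monkey.patch_all(thread=False)        # patch sockets so blocking requests yield to other greenlets

import requests

headers = {'User-Agent': 'Mozilla/5.0'}                  # placeholder UA
target_url = 'https://m.52bqg.com'
chapter_url = ['/book_1/1.html', '/book_1/2.html']       # hypothetical chapter paths

def fetch(path):
    # download one chapter page and return its decoded HTML
    resp = requests.get(target_url + path, headers=headers, timeout=10)
    resp.encoding = 'gbk'
    return resp.text

p = pool.Pool(200)                     # at most 200 greenlets running at any time
pages = p.map(fetch, chapter_url)      # keeps input order, blocks until all greenlets finish
print(len(pages))
```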
- Multiprocessing + multithreading
The code is as follows:
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

multiprocessing + multithreading
"""

####### start working
import os
import time
import urllib.parse as parse
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool

import requests
from lxml import etree

book_name_list = ['斗罗大陆3龙王传说']
save_path = 'D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url = 'https://m.52bqg.com'

try:
    os.mkdir(save_path)
except OSError:
    pass

novel = []


##### two helpers: one returns a chapter's text, one returns the chapter links on a listing page
def get_chapter_content(i):
    # return the full text of one chapter, following its "next page" links
    chapter_now = requests.get(target_url + i, headers=headers)
    chapter_now.encoding = 'gbk'
    chapter_now = etree.HTML(chapter_now.text)
    chapter_content = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num = 1
    # the Chinese literals below match the site's own markup and must stay as-is
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content = chapter_content.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        next_page_num = next_page_num + 1
        chapter_now = requests.get(target_url + i.replace('.html', '_' + str(next_page_num) + '.html'), headers=headers)
        chapter_now.encoding = 'gbk'
        chapter_now = etree.HTML(chapter_now.text)
        chapter_content_next = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next = chapter_content_next.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        chapter_content = chapter_content + chapter_content_next
    return chapter_content


def get_chapter_link(link):
    ########## fetch the chapter names and links on one listing page ##########
    # link is a pair [page index, url_all] so worker processes don't rely on globals
    i = link[0]
    url_all = link[1]
    if i == 0:
        req_next = requests.get(url_all, headers=headers)
    else:
        req_next = requests.get(url_all + str(i + 1), headers=headers)
    req_next.encoding = 'gbk'
    html_next = etree.HTML(req_next.text)
    chapter_name_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name = chapter_name_next[1:]
    chapter_url = chapter_url_next[1:]
    return chapter_name, chapter_url


def run_proc(page):
    # each worker process opens its own 200-thread pool over its chunk of listing pages
    with ThreadPoolExecutor(200) as executor:
        i = list(page[0])
        for k in range(len(i)):
            i[k] = [i[k], page[1]]          # pair every page index with url_all
        chapter = list(executor.map(get_chapter_link, i))
        chapter = list(zip(*chapter))
        chapter_url = sum(list(chapter[1]), [])
        chapter_name = sum(list(chapter[0]), [])
        chapter_all = list(executor.map(get_chapter_content, chapter_url))
    for i in range(len(chapter_all)):
        chapter_all[i] = chapter_name[i] + ' ' + chapter_all[i]
    return chapter_all


################################# crawl every chapter URL #################################
if __name__ == '__main__':
    for k in book_name_list:
        start = time.time()
        url = 'https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey=' + parse.quote(k, encoding="gbk") + '&t_btnsearch='
        req = requests.get(url, headers=headers)
        req.encoding = 'gbk'
        if 'book_' in req.url and 'search' not in req.url:
            # the search redirected straight to the book page, so crawl it directly
            url_all = req.url.replace('book', 'chapters')
        else:
            # the search returned a result list: use the exact match if there is one, otherwise the first hit
            html_search = etree.HTML(req.text)
            search_book = html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url = html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all = target_url + search_book_url[search_book.index(k)]
            else:
                url_all = target_url + search_book_url[0]
            url_all = url_all.replace('book', 'chapters')

        # work out how many listing pages of chapters the book has
        req_all = requests.get(url_all, headers=headers)
        req_all.encoding = 'gbk'
        html_all = etree.HTML(req_all.text)
        chapter_page_all = html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all = int(chapter_page_all[1].split('/')[-1])

        # split the listing pages into chunks of 10 and hand each chunk to a process
        count = 0
        page_list = []
        while count < chapter_page_all:
            next_count = count + 10
            if next_count > chapter_page_all:
                next_count = chapter_page_all
            page_list.append([range(count, next_count), url_all])
            count = count + 10

        p = Pool(4)
        result = p.map(run_proc, page_list)
        p.close()
        p.join()
        chapter_all = sum(result, [])

        end = time.time()
        print("Time taken: " + str(int(end - start)) + ' s')

        target = '\n'.join(chapter_all)
        with open(save_path + k + '.txt', 'a+', encoding='utf-8') as f:
            f.write(target)
        print(k + ' done')
```
Time taken: 40 s
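Stripped of the scraping details, the structure above is a process pool whose workers each open their own thread pool. A minimal self-contained sketch of that composition (the `fetch` and `run_chunk` helpers and the placeholder URL list are illustrative and not part of the original script; the chunk size of 10, 200 threads, and 4 processes match it):

```python
import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool

def fetch(url):
    # hypothetical per-URL worker; a real crawler would parse the response here
    return requests.get(url, timeout=10).status_code

def run_chunk(url_chunk):
    # every process opens its own thread pool, so downloads also overlap inside each process
    with ThreadPoolExecutor(200) as ex:
        return list(ex.map(fetch, url_chunk))

if __name__ == '__main__':
    urls = ['https://m.52bqg.com'] * 40                          # placeholder URL list
    chunks = [urls[i:i + 10] for i in range(0, len(urls), 10)]   # groups of 10, as above
    with Pool(4) as p:                                           # 4 worker processes, as above
        results = p.map(run_chunk, chunks)
    print(len(sum(results, [])))                                 # flatten and count, expect 40
```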
- Multiprocessing + coroutines
The code is as follows:
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar  4 10:39:55 2020

@author: wenzhe.tian

multiprocessing + multithreading
multiprocessing + coroutines
"""

####### start working
import gevent
from gevent import monkey

monkey.patch_all(thread=False)   # patch blocking I/O so greenlets can switch; do this before importing requests

import os
import time
import urllib.parse as parse
from multiprocessing import Pool

import requests
from lxml import etree

book_name_list = ['大道争锋']
save_path = 'D:\\bqg_novel\\'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
target_url = 'https://m.52bqg.com'

try:
    os.mkdir(save_path)
except OSError:
    pass

novel = []


##### two helpers: one returns a chapter's text, one returns the chapter links on a listing page
def get_chapter_content(i):
    # return the full text of one chapter, following its "next page" links
    chapter_now = requests.get(target_url + i, headers=headers)
    chapter_now.encoding = 'gbk'
    chapter_now = etree.HTML(chapter_now.text)
    chapter_content = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
    next_page_num = 1
    # the Chinese literals below match the site's own markup and must stay as-is
    while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
        chapter_content = chapter_content.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        next_page_num = next_page_num + 1
        chapter_now = requests.get(target_url + i.replace('.html', '_' + str(next_page_num) + '.html'), headers=headers)
        chapter_now.encoding = 'gbk'
        chapter_now = etree.HTML(chapter_now.text)
        chapter_content_next = ' '.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        chapter_content_next = chapter_content_next.replace('本章未完,点击下一页继续阅读', '').replace('-->>', '').replace('&n', '')
        chapter_content = chapter_content + chapter_content_next
    return chapter_content


def get_chapter_link(link):
    ########## fetch the chapter names and links on one listing page ##########
    # link is a pair [page index, url_all] so worker processes don't rely on globals
    i = link[0]
    url_all = link[1]
    if i == 0:
        req_next = requests.get(url_all, headers=headers)
    else:
        req_next = requests.get(url_all + str(i + 1), headers=headers)
    req_next.encoding = 'gbk'
    html_next = etree.HTML(req_next.text)
    chapter_name_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
    chapter_url_next = html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
    chapter_name = chapter_name_next[1:]
    chapter_url = chapter_url_next[1:]
    return chapter_name, chapter_url


def run_proc(page):
    # each worker process spawns its own greenlets over its chunk of listing pages
    i = list(page[0])
    for k in range(len(i)):
        i[k] = [i[k], page[1]]              # pair every page index with url_all
    g_list = [gevent.spawn(get_chapter_link, x) for x in i]
    gevent.joinall(g_list)
    chapter = [g.value for g in g_list]
    chapter = list(zip(*chapter))
    chapter_url = sum(list(chapter[1]), [])
    chapter_name = sum(list(chapter[0]), [])

    g_list = [gevent.spawn(get_chapter_content, x) for x in chapter_url]
    gevent.joinall(g_list)
    chapter_all = [g.value for g in g_list]
    for i in range(len(chapter_all)):
        chapter_all[i] = chapter_name[i] + ' ' + chapter_all[i]
    return chapter_all


################################# crawl every chapter URL #################################
if __name__ == '__main__':
    for k in book_name_list:
        start = time.time()
        url = 'https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey=' + parse.quote(k, encoding="gbk") + '&t_btnsearch='
        req = requests.get(url, headers=headers)
        req.encoding = 'gbk'
        if 'book_' in req.url and 'search' not in req.url:
            # the search redirected straight to the book page, so crawl it directly
            url_all = req.url.replace('book', 'chapters')
        else:
            # the search returned a result list: use the exact match if there is one, otherwise the first hit
            html_search = etree.HTML(req.text)
            search_book = html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url = html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all = target_url + search_book_url[search_book.index(k)]
            else:
                url_all = target_url + search_book_url[0]
            url_all = url_all.replace('book', 'chapters')

        # work out how many listing pages of chapters the book has
        req_all = requests.get(url_all, headers=headers)
        req_all.encoding = 'gbk'
        html_all = etree.HTML(req_all.text)
        chapter_page_all = html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all = int(chapter_page_all[1].split('/')[-1])

        # split the listing pages into chunks of 10 and hand each chunk to a process
        count = 0
        page_list = []
        while count < chapter_page_all:
            next_count = count + 10
            if next_count > chapter_page_all:
                next_count = chapter_page_all
            page_list.append([range(count, next_count), url_all])
            count = count + 10

        p = Pool(4)
        result = p.map(run_proc, page_list)
        p.close()
        p.join()
        chapter_all = sum(result, [])

        end = time.time()
        print("Time taken: " + str(int(end - start)) + ' s')

        target = '\n'.join(chapter_all)
        with open(save_path + k + '.txt', 'a+', encoding='utf-8') as f:
            f.write(target)
        print(k + ' done')
```
Time taken: 60 s
The numbers show that using multiple cores beats a single core, and multithreading clearly beats running without threads (don't let the GIL mislead you about multithreading here: for I/O-bound downloading the speedup is obvious).
In this test, multithreading (250 threads) also came out faster than coroutines.
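The GIL only serializes Python bytecode; a thread blocked on a socket inside `requests.get` releases it, so the other threads keep downloading. A rough timing sketch of that effect (the URL choice and the counts are arbitrary):

```python
import time
import requests
from concurrent.futures import ThreadPoolExecutor

URLS = ['https://m.52bqg.com'] * 20          # 20 identical requests, purely for timing

def fetch(url):
    return requests.get(url, timeout=10).status_code

start = time.time()
sequential = [fetch(u) for u in URLS]                       # one request at a time
print('sequential:', round(time.time() - start, 2), 's')

start = time.time()
with ThreadPoolExecutor(20) as ex:                          # 20 threads overlap the network waits
    threaded = list(ex.map(fetch, URLS))
print('threaded:  ', round(time.time() - start, 2), 's')
```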
The next post will cover crawling the whole site's novels, summarizing the multiprocessing/multithreading setup and comparing it with Scrapy.
Feedback and corrections are welcome; drop any questions in the comments.