• Python web scraping: comparing the efficiency of multiprocessing, multithreading, coroutines, and their combinations (multiprocessing library) -- scraping a full single novel as an example


    This post benchmarks scraping a single novel with multiprocessing, multithreading, and coroutines, as well as the combined approaches multiprocessing + multithreading and multiprocessing + coroutines.

    The novel 大道争锋 from the site 笔趣阁 is used as the test case to measure the performance of each combination.

    • Multithreading

    The code is as follows:

    # -*- coding: utf-8 -*-
    """
    Created on Wed Mar  4 10:39:55 2020
    
    @author: wenzhe.tian
    
    多进程+多线程
    多进程+协程
    """
    
    book_name_list=['大道争锋']
    
    
    
    
    ####### Start work
    import time
    from concurrent.futures import ThreadPoolExecutor
    import requests
    from lxml import etree
    import os
    import urllib.parse as parse
    
    
    save_path='D:\\bqg_novel\\'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
    target_url='https://m.52bqg.com'
    try:
        os.mkdir(save_path)
    except OSError: # the folder may already exist
        pass
    
    
    ### Two helper functions: one returns a chapter's text, the other collects chapter links
    def get_chapter_content(i): # return the full text of the chapter at url i
        chapter_now=requests.get(target_url+i,headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=chapter_now.text # page source
        chapter_now=etree.HTML(chapter_now)
        chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        next_page_num=1
        # a chapter may span several pages; keep following the "下一页" (next page) link
        while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
            chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            next_page_num=next_page_num+1
            chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
            chapter_now.encoding='gbk'
            chapter_now=chapter_now.text # page source
            chapter_now=etree.HTML(chapter_now)
            chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
            chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            chapter_content=chapter_content+chapter_content_next
        return chapter_content
    
    
    def get_chapter_link(i): ########## fetch page i of the chapter list and return (names, urls) ######
        global url_all,headers
        if i==0:
            req_next=requests.get(url_all,headers=headers)
        else:
            req_next=requests.get(url_all+'/'+str(i+1),headers=headers)
        req_next.encoding='gbk'
        html_next=etree.HTML(req_next.text)
        chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
        chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
        chapter_name=chapter_name_next[1:]
        chapter_url=chapter_url_next[1:]
        return chapter_name,chapter_url
    
    
    
    ################################# Crawl the content of every chapter url #####################################
    novel=[]
    
    for k in book_name_list:
        start=time.time()
        url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
        req=requests.get(url,headers=headers)
        req.encoding='gbk'
        if 'book_' in req.url and 'search' not in req.url: # the search redirected straight to a book page, so crawl it directly
            url_all=req.url
            url_all=url_all.replace('book','chapters')
        else: # the search returned a result list: prefer an exact title match, otherwise take the first hit
            search_result=req.text
            html_search=etree.HTML(search_result)
            search_book=html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all=target_url+search_book_url[search_book.index(k)]
                url_all=url_all.replace('book','chapters')
            else:
                url_all=target_url+search_book_url[0]
                url_all=url_all.replace('book','chapters')
    
        # work out how many chapter-list pages the book has
        req_all=requests.get(url_all,headers=headers)
        req_all.encoding='gbk'
        html_all=etree.HTML(req_all.text)
        chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all=chapter_page_all[1].split('/')
        chapter_page_all=int(chapter_page_all[-1])
        # start the multithreaded crawl
        with ThreadPoolExecutor(250) as executor:
            # collect the chapter urls from every chapter-list page
            chapter=list(executor.map(get_chapter_link,range(chapter_page_all)))
            chapter=list(zip(*chapter))
            chapter_url=list(chapter[1])
            chapter_name=list(chapter[0])
            chapter_url = sum(chapter_url, [])
            chapter_name = sum(chapter_name, [])
            chapter_all=list(executor.map(get_chapter_content,chapter_url))
        end=time.time()
        print("Elapsed: "+str(int(end-start))+'s') # timing
        for i in range(len(chapter_all)):
            chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
        target='\n'.join(chapter_all)
        f = open(save_path+k+'.txt','a+',encoding='utf-8')
        f.write(target)
        f.close()
        print(k+' done')

    Elapsed: 70s

    • Coroutines

    The code is as follows:

    # -*- coding: utf-8 -*-
    """
    Created on Wed Mar  4 10:39:55 2020
    
    @author: wenzhe.tian
    
    
    
    """
    
    book_name_list=['大道争锋']
    
    
    
    
    ####### Start work
    
    import gevent
    from gevent import monkey,pool
    pool=pool.Pool(200) # note: this pool is created but never actually used below
    monkey.patch_all(thread=False)
    import requests
    import time
    from lxml import etree
    import os
    import urllib.parse as parse
    
    
    save_path='D:\\bqg_novel\\'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
    target_url='https://m.52bqg.com'
    try:
        os.mkdir(save_path)
    except OSError: # the folder may already exist
        pass
    
    
    ### Two helper functions: one returns a chapter's text, the other collects chapter links
    def get_chapter_content(i): # return the full text of the chapter at url i
        chapter_now=requests.get(target_url+i,headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=chapter_now.text # page source
        chapter_now=etree.HTML(chapter_now)
        chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        next_page_num=1
        # a chapter may span several pages; keep following the "下一页" (next page) link
        while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
            chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            next_page_num=next_page_num+1
            chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
            chapter_now.encoding='gbk'
            chapter_now=chapter_now.text # page source
            chapter_now=etree.HTML(chapter_now)
            chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
            chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            chapter_content=chapter_content+chapter_content_next
        return chapter_content
    
    
    def get_chapter_link(i): ########## fetch page i of the chapter list and return (names, urls) ######
        global url_all,headers
        if i==0:
            req_next=requests.get(url_all,headers=headers)
        else:
            req_next=requests.get(url_all+'/'+str(i+1),headers=headers)
        req_next.encoding='gbk'
        html_next=etree.HTML(req_next.text)
        chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
        chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
        chapter_name=chapter_name_next[1:]
        chapter_url=chapter_url_next[1:]
        return chapter_name,chapter_url
    
    
    
    ################################# Crawl the content of every chapter url #####################################
    novel=[]
    
    for k in book_name_list:
        start=time.time()
        url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
        req=requests.get(url,headers=headers)
        req.encoding='gbk'
        if 'book_' in req.url and 'search' not in req.url: # the search redirected straight to a book page, so crawl it directly
            url_all=req.url
            url_all=url_all.replace('book','chapters')
        else: # the search returned a result list: prefer an exact title match, otherwise take the first hit
            search_result=req.text
            html_search=etree.HTML(search_result)
            search_book=html_search.xpath('//div[@class="article"]/a/text()')
            search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
            if k in search_book:
                url_all=target_url+search_book_url[search_book.index(k)]
                url_all=url_all.replace('book','chapters')
            else:
                url_all=target_url+search_book_url[0]
                url_all=url_all.replace('book','chapters')
    
        # work out how many chapter-list pages the book has
        req_all=requests.get(url_all,headers=headers)
        req_all.encoding='gbk'
        html_all=etree.HTML(req_all.text)
        chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
        chapter_page_all=chapter_page_all[1].split('/')
        chapter_page_all=int(chapter_page_all[-1])
        # start the coroutine crawl
        g_list=list(map(lambda x:gevent.spawn(get_chapter_link, x),range(chapter_page_all)))
        gevent.joinall(g_list)
        chapter=[]
        for g in g_list:
            chapter.append(g.value)
        chapter=list(zip(*chapter))
        chapter_url=list(chapter[1])
        chapter_name=list(chapter[0])
        chapter_url = sum(chapter_url, [])
        chapter_name = sum(chapter_name, [])
        g_list=list(map(lambda x:gevent.spawn(get_chapter_content, x),chapter_url))
        gevent.joinall(g_list)
        chapter_all=[]
        for g in g_list:
            chapter_all.append(g.value)
        end=time.time()
        print("Elapsed: "+str(int(end-start))+'s') # timing
        for i in range(len(chapter_all)):
            chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
        target='\n'.join(chapter_all)
        f = open(save_path+k+'.txt','a+',encoding='utf-8')
        f.write(target)
        f.close()
        print(k+' done')

    Elapsed: 103s
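    Note that the gevent version above creates pool=pool.Pool(200) but never uses it: every request is launched with a bare gevent.spawn, so all chapter downloads run at once with no concurrency cap. Below is a minimal sketch of how that bounded pool could be used instead; it assumes the same get_chapter_link/get_chapter_content functions and the chapter_page_all/chapter_url variables from the script above, and the pool size of 200 is simply the value the original code chose.

    # Sketch only: cap coroutine concurrency with the gevent pool the script already creates
    import gevent
    from gevent import monkey, pool
    monkey.patch_all(thread=False)

    g_pool = pool.Pool(200)  # at most 200 greenlets in flight at a time

    # chapter-list pages first...
    jobs = [g_pool.spawn(get_chapter_link, x) for x in range(chapter_page_all)]
    gevent.joinall(jobs)

    # ...then the chapters themselves; Pool.map blocks until every greenlet finishes
    chapter_all = g_pool.map(get_chapter_content, chapter_url)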

    • Multiprocessing + multithreading

    The code is as follows:

    # -*- coding: utf-8 -*-
    """
    Created on Wed Mar  4 10:39:55 2020
    
    @author: wenzhe.tian
    
    Multiprocessing + multithreading
    
    """
    
    
    ####### Start work
    import time
    from concurrent.futures import ThreadPoolExecutor
    import requests
    from lxml import etree
    import os
    import urllib.parse as parse
    from multiprocessing import Pool
    book_name_list=['斗罗大陆3龙王传说']
    save_path='D:\\bqg_novel\\'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
    target_url='https://m.52bqg.com'
    try:
        os.mkdir(save_path)
    except OSError: # the folder may already exist
        pass
    novel=[]
    
    ##### Two helper functions: one returns a chapter's text, the other collects chapter links
    def get_chapter_content(i): # return the full text of the chapter at url i
        chapter_now=requests.get(target_url+i,headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=chapter_now.text # page source
        chapter_now=etree.HTML(chapter_now)
        chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        next_page_num=1
        # a chapter may span several pages; keep following the "下一页" (next page) link
        while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
            chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            next_page_num=next_page_num+1
            chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
            chapter_now.encoding='gbk'
            chapter_now=chapter_now.text # page source
            chapter_now=etree.HTML(chapter_now)
            chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
            chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            chapter_content=chapter_content+chapter_content_next
        return chapter_content
    
    
    def get_chapter_link(link): ########## fetch one chapter-list page and return (names, urls) ######
        i=link[0]
        url_all=link[1]
        if i==0:
            req_next=requests.get(url_all,headers=headers)
        else:
            req_next=requests.get(url_all+str(i+1),headers=headers)
        req_next.encoding='gbk'
        html_next=etree.HTML(req_next.text)
        chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
        chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
        chapter_name=chapter_name_next[1:]
        chapter_url=chapter_url_next[1:]
        return chapter_name,chapter_url
    
    def run_proc(page): # runs inside one worker process: 200 threads crawl this process's share of pages
        with ThreadPoolExecutor(200) as executor:
            # expand this process's page range into [page_index, url_all] work items
            i=list(page[0])
            for k in range(len(i)):
                i[k]=[i[k],page[1]]
            chapter=list(executor.map(get_chapter_link,i))
            chapter=list(zip(*chapter))
            chapter_url=list(chapter[1])
            chapter_name=list(chapter[0])
            chapter_url = sum(chapter_url, [])
            chapter_name = sum(chapter_name, [])
            chapter_all=list(executor.map(get_chapter_content,chapter_url))
            for i in range(len(chapter_all)):
                chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
        return chapter_all
    
    ################################# Crawl the content of every chapter url #####################################
    if __name__ == '__main__':
        for k in book_name_list:
            start=time.time()
            url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
            req=requests.get(url,headers=headers)
            req.encoding='gbk'
            if 'book_' in req.url and 'search' not in req.url: # the search redirected straight to a book page, so crawl it directly
                url_all=req.url
                url_all=url_all.replace('book','chapters')
            else: # the search returned a result list: prefer an exact title match, otherwise take the first hit
                search_result=req.text
                html_search=etree.HTML(search_result)
                search_book=html_search.xpath('//div[@class="article"]/a/text()')
                search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
                if k in search_book:
                    url_all=target_url+search_book_url[search_book.index(k)]
                    url_all=url_all.replace('book','chapters')
                else:
                    url_all=target_url+search_book_url[0]
                    url_all=url_all.replace('book','chapters')
    
            # work out how many chapter-list pages the book has
            req_all=requests.get(url_all,headers=headers)
            req_all.encoding='gbk'
            html_all=etree.HTML(req_all.text)
            chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
            chapter_page_all=chapter_page_all[1].split('/')
            chapter_page_all=int(chapter_page_all[-1])
    
            # split the chapter-list pages into chunks of 10 and hand each chunk to a worker process
            count=0
            page_list=[]
            while count<chapter_page_all:
                next_count=count+10
                if next_count>chapter_page_all:
                    next_count=chapter_page_all
                page_list.append([range(count,next_count),url_all])
                count=count+10
    
            p = Pool(4)
            result=p.map(run_proc, page_list)
            p.close()
            p.join()
            chapter_all= sum(result,[])
            end=time.time()
            print("Elapsed: "+str(int(end-start))+'s') # timing
    
            target='\n'.join(chapter_all)
            f = open(save_path+k+'.txt','a+',encoding='utf-8')
            f.write(target)
            f.close()
            print(k+' done')

    Elapsed: 40s
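    The combination works by splitting the chapter-list pages into chunks of ten, sending each chunk to one of four worker processes, and letting a 200-thread pool inside each process do the actual downloading. Stripped of the scraping details, the pattern looks roughly like the sketch below; fetch_one and the work items are placeholders, not functions from the script above.

    # Sketch only: a process pool whose workers each run their own thread pool for I/O-bound jobs
    from concurrent.futures import ThreadPoolExecutor
    from multiprocessing import Pool

    def fetch_one(item):            # placeholder for get_chapter_content / get_chapter_link
        return item * 2

    def run_proc(chunk):            # runs inside one worker process
        with ThreadPoolExecutor(200) as executor:
            return list(executor.map(fetch_one, chunk))

    if __name__ == '__main__':
        work = list(range(100))
        chunks = [work[i:i+10] for i in range(0, len(work), 10)]   # chunks of 10, like page_list above
        with Pool(4) as p:
            results = p.map(run_proc, chunks)                      # 4 processes x 200 threads each
        flat = sum(results, [])                                    # flatten, like chapter_all above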

    • Multiprocessing + coroutines

    The code is as follows:

    # -*- coding: utf-8 -*-
    """
    Created on Wed Mar  4 10:39:55 2020
    
    @author: wenzhe.tian
    
    Multiprocessing + coroutines
    """
    
    
    ####### Start work
    import gevent
    from gevent import monkey
    monkey.patch_all(thread=False)
    import requests
    import time
    from lxml import etree
    import os
    import urllib.parse as parse
    from multiprocessing import Pool
    
    book_name_list=['大道争锋']
    save_path='D:\\bqg_novel\\'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
    target_url='https://m.52bqg.com'
    try:
        os.mkdir(save_path)
    except OSError: # the folder may already exist
        pass
    novel=[]
    
    ##### Two helper functions: one returns a chapter's text, the other collects chapter links
    def get_chapter_content(i): # return the full text of the chapter at url i
        chapter_now=requests.get(target_url+i,headers=headers)
        chapter_now.encoding='gbk'
        chapter_now=chapter_now.text # page source
        chapter_now=etree.HTML(chapter_now)
        chapter_content='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
        next_page_num=1
        # a chapter may span several pages; keep following the "下一页" (next page) link
        while '下一页' in chapter_now.xpath('//div[@class="nr_page"]//td[@class="next"]/descendant::text()'):
            chapter_content=chapter_content.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            next_page_num=next_page_num+1
            chapter_now=requests.get(target_url+i.replace('.html','_'+str(next_page_num)+'.html'),headers=headers)
            chapter_now.encoding='gbk'
            chapter_now=chapter_now.text # page source
            chapter_now=etree.HTML(chapter_now)
            chapter_content_next='\n'.join(chapter_now.xpath('//div[@id="nr1"]/descendant::text()'))
            chapter_content_next=chapter_content_next.replace('本章未完,点击下一页继续阅读','').replace('-->>','').replace('&n','')
            chapter_content=chapter_content+chapter_content_next
        return chapter_content
    
    
    def get_chapter_link(link): ########## fetch one chapter-list page and return (names, urls) ######
        i=link[0]
        url_all=link[1]
        if i==0:
            req_next=requests.get(url_all,headers=headers)
        else:
            req_next=requests.get(url_all+str(i+1),headers=headers)
        req_next.encoding='gbk'
        html_next=etree.HTML(req_next.text)
        chapter_name_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]//a/descendant::text()|//ul[@class="last9"]//li//a/descendant::text()')
        chapter_url_next=html_next.xpath('//ul[@class="last9"]//li[@class="even"]/a/@href|//ul[@class="last9"]//li/a/@href')
        chapter_name=chapter_name_next[1:]
        chapter_url=chapter_url_next[1:]
        return chapter_name,chapter_url
    
    def run_proc(page): # runs inside one worker process: coroutines crawl this process's share of pages
        # expand this process's page range into [page_index, url_all] work items
        i=list(page[0])
        for k in range(len(i)):
            i[k]=[i[k],page[1]]
        g_list=list(map(lambda x:gevent.spawn(get_chapter_link, x),i))
        gevent.joinall(g_list)
        chapter=[]
        for g in g_list:
            chapter.append(g.value)
        chapter=list(zip(*chapter))
        chapter_url=list(chapter[1])
        chapter_name=list(chapter[0])
        chapter_url = sum(chapter_url, [])
        chapter_name = sum(chapter_name, [])
        g_list=list(map(lambda x:gevent.spawn(get_chapter_content, x),chapter_url))
        gevent.joinall(g_list)
        chapter_all=[]
        for g in g_list:
            chapter_all.append(g.value)
    
        for i in range(len(chapter_all)):
            chapter_all[i]=chapter_name[i]+'\n'+chapter_all[i]
        return chapter_all
    
    ################################# Crawl the content of every chapter url #####################################
    if __name__ == '__main__':
        for k in book_name_list:
            start=time.time()
            url='https://m.52bqg.com/modules/article/waps.php?searchtype=articlename&searchkey='+parse.quote(k,encoding="gbk")+'&t_btnsearch='
            req=requests.get(url,headers=headers)
            req.encoding='gbk'
            if 'book_' in req.url and 'search' not in req.url: # the search redirected straight to a book page, so crawl it directly
                url_all=req.url
                url_all=url_all.replace('book','chapters')
            else: # the search returned a result list: prefer an exact title match, otherwise take the first hit
                search_result=req.text
                html_search=etree.HTML(search_result)
                search_book=html_search.xpath('//div[@class="article"]/a/text()')
                search_book_url=html_search.xpath('//div[@class="article"]/a[1]/@href')
                if k in search_book:
                    url_all=target_url+search_book_url[search_book.index(k)]
                    url_all=url_all.replace('book','chapters')
                else:
                    url_all=target_url+search_book_url[0]
                    url_all=url_all.replace('book','chapters')
    
            # work out how many chapter-list pages the book has
            req_all=requests.get(url_all,headers=headers)
            req_all.encoding='gbk'
            html_all=etree.HTML(req_all.text)
            chapter_page_all=html_all.xpath('//table[@class="page-book"]//td/a/@href')
            chapter_page_all=chapter_page_all[1].split('/')
            chapter_page_all=int(chapter_page_all[-1])
    
            # split the chapter-list pages into chunks of 10 and hand each chunk to a worker process
            count=0
            page_list=[]
            while count<chapter_page_all:
                next_count=count+10
                if next_count>chapter_page_all:
                    next_count=chapter_page_all
                page_list.append([range(count,next_count),url_all])
                count=count+10
    
            p = Pool(4)
            result=p.map(run_proc, page_list)
            p.close()
            p.join()
            chapter_all= sum(result,[])
            end=time.time()
            print("Elapsed: "+str(int(end-start))+'s') # timing
    
            target='\n'.join(chapter_all)
            f = open(save_path+k+'.txt','a+',encoding='utf-8')
            f.write(target)
            f.close()
            print(k+' done')

    Elapsed: 60s
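    Summary of the measured times (note that the multiprocessing + multithreading run scraped a different book, 斗罗大陆3龙王传说, as listed in its code; the other three runs scraped 大道争锋):

    Multithreading (250 threads):        70s
    Coroutines (gevent):                103s
    Multiprocessing + multithreading:    40s
    Multiprocessing + coroutines:        60s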

    It is easy to see that using multiple cores beats a single core, and multithreading clearly beats no threading at all (do not let the GIL mislead you about multithreading here: for I/O-bound scraping the speedup is obvious).
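
    The reason the GIL does not hurt here is that CPython releases it while a thread is blocked on network I/O, so many requests can be in flight at once even inside a single process. A tiny self-contained illustration of that effect follows; the URL is just a placeholder for any slow endpoint and is not part of the scraper above.

    # Sketch only: I/O-bound requests speed up with threads despite the GIL
    import time
    import requests
    from concurrent.futures import ThreadPoolExecutor

    URL = 'https://httpbin.org/delay/1'   # placeholder endpoint that takes about 1s to respond

    def fetch(_):
        return requests.get(URL, timeout=10).status_code

    start = time.time()
    [fetch(i) for i in range(10)]                      # sequential: roughly 10 x 1s
    print('sequential:', round(time.time() - start, 1), 's')

    start = time.time()
    with ThreadPoolExecutor(10) as ex:                 # threaded: roughly 1s, the GIL is released during the wait
        list(ex.map(fetch, range(10)))
    print('threaded:  ', round(time.time() - start, 1), 's')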

    In this test, multithreading (with a pool of 250 threads) was faster than the coroutine version.

    The next post will summarize multiprocessing and multithreading for crawling an entire novel site, and compare that with a Scrapy implementation.

    Feedback and corrections are welcome. If you have any questions, just drop them in the comments.
