• python3 + BeautifulSoup 4.6: scraping novels from a site (Part 4): multithreaded scraping


    The previous article used a two-level directory layout: a root folder "小说" (novels), a second level named after each work, and the novel files underneath.

    This installment reworks part of the code so the layout becomes root -> author -> work -> chapter.txt.

    That refactor is not the point of this installment, though. The real problem: while running this scraper, the program would frequently die partway through because of dropped packets and other network hiccups.

    My first idea was to poll the site's status in a loop and re-issue the request, but that did not seem to help much. Then I came across multithreading in Chongshi's (虫师) Selenium book, gave it a try, and found it was fast. Cool!
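    For reference, that retry idea looks something like the sketch below. This is my reconstruction, not the post's original code; it simply re-issues the request a few times with a pause in between, which covers transient drops but not a connection that stays bad:

    from urllib import request, error
    from time import sleep

    def open_with_retry(url, headers, retries=3, timeout=10):
        # Re-issue the request up to `retries` times before giving up.
        for attempt in range(retries):
            try:
                req = request.Request(url, headers=headers)
                return request.urlopen(req, timeout=timeout).read()
            except (error.URLError, OSError) as e:
                print('attempt', attempt + 1, 'failed:', e)
                sleep(2)
        return None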

    The code below is largely adapted from Chongshi's (虫师) Selenium 2 book.

    Importing the threading module:

    import threading

    Invocation: threading.Thread(target=music, args=('music arg 1', music_arg_2)); the args tuple carries the parameters for the target function.

    from time import sleep, ctime
    import threading

    def music(func, loop):
        for i in range(loop):
            print('music', func, ctime())
            sleep(2)

    def movie(func, loop):
        for i in range(loop):
            print('movie', func, ctime())
            sleep(4)

    # run the two functions sequentially
    def testOne():
        music('简单的歌', 2)
        movie('两杆大烟枪', 2)
        print('all end', ctime())

    # run them in parallel threads
    def testTwo():
        threads = []
        t1 = threading.Thread(target=music, args=('喜欢的人', 2))
        threads.append(t1)
        t2 = threading.Thread(target=movie, args=('搏击俱乐部', 2))
        threads.append(t2)
        t3 = threading.Thread(target=music, args=('喜欢的人2', 2))
        threads.append(t3)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print('all end', ctime())

    if __name__ == '__main__':
        testOne()
        #testTwo()
        #testThree()
        #threadsRun()
    The t.join() method chains the threads together: the main thread waits for each worker to finish, which guarantees that the 'all end' line prints last.
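    To see what join buys us, here is a minimal sketch (mine, not from the book): without t.join(), the main thread races ahead and prints 'all end' before the worker finishes.

    from time import sleep, ctime
    import threading

    def worker():
        sleep(2)
        print('worker done', ctime())

    t = threading.Thread(target=worker)
    t.start()
    # No t.join() here, so the main thread does not wait:
    # this line prints about 2 seconds before 'worker done'.
    print('all end', ctime())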

      

    Creating a thread wrapper class

    Subclass Thread right in the class definition: class MyThread(threading.Thread)

    class MyThread(threading.Thread):
    
        def __init__(self, func, args, name):
            threading.Thread.__init__(self)
            self.func = func
            self.args = args
            self.name = name
    
        def run(self):
            self.func(*self.args)
    

    self: the class instance (the implicit first parameter)

    func: the function the thread will run

    args: the argument tuple passed to func

    name: the thread's name, conventionally set to func.__name__

    Complete code:

    class MyThread(threading.Thread):

        def __init__(self, func, args, name):
            threading.Thread.__init__(self)
            self.func = func
            self.args = args
            self.name = name

        def run(self):
            self.func(*self.args)

    def super_play(file_, time):
        for i in range(3):
            print('play', file_, ctime())
            sleep(time)

    def time(args):  # unused placeholder
        pass

    def testThree():
        threads = []
        lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg': 2}
        for file_, time_ in lists.items():
            t = MyThread(super_play, (file_, time_), super_play.__name__)
            threads.append(t)

        files = range(len(lists))

        for f in files:
            threads[f].start()
        for f in files:
            threads[f].join()

        print('all end', ctime())

    Refactoring the novel scraper

    OK, that's multithreading covered. How do we hook it up to the novel-scraper class we already wrote? It's simple.

    First, refactor pageOne into a per-page method:

        def readPageOneByThread(self, page, time_):
            page_url = str(self.two_page_url)
            new_page_url = page_url.replace("?", page)
            print('page', page, '---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)
            sleep(time_)
        # end readPageOneByThread  ---------------------------------------
    

    In the __init__ method, self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"; the "?" is a placeholder that replace() swaps for the page number.
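    A quick illustration of that placeholder substitution (page number chosen arbitrarily):

    page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
    print(page_url.replace("?", "3"))
    # -> http://www.cuiweijuxs.com/jingpinxiaoshuo/5_3.html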

    Next, write the method that creates and runs the threads:

        def threadsRun(self):
    
            #self.readPageOne(122)
    
            for i in range(1,123):
                page = str(i)
                t = MyThread( self.readPageOneByThread, (page,2) , self.readPageOneByThread.__name__)
                #t = threading.Thread(target=self.testRun, args=( str(i) ))
                self.threads.append(t)
    
            for t in self.threads:
                t.start()
            for t in self.threads:
                t.join()
                #t.join()
    
            print('all end: %s' % ctime())
    
    
    class MyThread(threading.Thread):
    
        def __init__(self, func, args, name):
            threading.Thread.__init__(self)
            self.func = func
            self.args = args
            self.name = name
    
        def run(self):
            self.func(*self.args)
    

    I took a shortcut here and hard-coded the total page count (122 pages); alternatively, the original pageOne approach of reading the 'last' link could discover the count at runtime, as sketched below.
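    A sketch of that alternative, reusing getSoup and the same 'last' pagination anchor that readPageOne already parses (assuming the page structure is unchanged):

    def threadsRun(self):
        # read the total page count from the 'last' pagination link
        # instead of hard-coding 122
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", 'last')
        item_size = int(last.string)

        for i in range(1, item_size + 1):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2),
                         self.readPageOneByThread.__name__)
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()

        print('all end: %s' % ctime())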

    Below is the complete code:

    # -*- coding: UTF-8 -*-
    from urllib import request
    from bs4 import BeautifulSoup
    from time import sleep, ctime
    import os
    import threading
    import re
    import random

    '''
    Scrape web pages with BeautifulSoup
    version: 0.5  switched to caching links locally
    author: yaowei
    date: 2018-03-23
    '''


    class Capture():

        def __init__(self):
            self.index_page_url = 'http://www.cuiweijuxs.com/'
            self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
            self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
            self.folder_path = '绯色/'
            self.href_list = []
            self.head = {}
            self.threads = []
            # set the User-Agent header
            self.head[
                'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'

        # end __init__ ---------------------------------------

        # fetch a URL and return a BeautifulSoup object
        def getSoup(self, query_url):
            req = request.Request(query_url, headers=self.head)
            webpage = request.urlopen(req)
            html = webpage.read()
            soup = BeautifulSoup(html, 'html.parser')
            return soup
            # soup = BeautifulSoup(html, 'html5lib')

        # read the section index page, then open each pagination link
        def readPageOne(self, count, time_):

            print('count=====', count)

            # total page count
            if count:
                item_size = count
            else:
                # read it from the page
                soup = self.getSoup(self.one_page_url)
                last = soup.find("a", 'last')
                item_size = int(last.string)

            print('item_size=====', item_size)
            page_url = str(self.two_page_url)

            # loop over the pagination links and read each page
            for item in range(item_size):
                page = str(item + 1)
                new_page_url = page_url.replace("?", page)
                print('page', page, '---', new_page_url)
                path = self.folder_path
                self.readPageTwo(new_page_url, path)

            sleep(time_)
        # end readPageOne  ---------------------------------------

        def readPageOneByThread(self, page, time_):
            page_url = str(self.two_page_url)
            new_page_url = page_url.replace("?", page)
            print('page', page, '---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)
            sleep(time_)
        # end readPageOneByThread  ---------------------------------------

        # read one pagination page
        def readPageTwo(self, page_url, path):
            soup = self.getSoup(page_url)
            # first div[id="newscontent"]->div[class="l"]
            con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
            # first div[id="newscontent"]->div[class="l"]->all span[class="s2"]
            span_list = con_div.find_all('span', {'class': 's2'})

            # iterate over the spans
            for span in span_list:
                # find span[class="s5"] under the same parent; the author becomes the folder name
                author = span.parent.find('span', {'class': 's5'}).get_text()

                # span[class="s2"]->a
                a_href = span.find('a')
                href = a_href.get('href')  # link to a single work
                folder_name = a_href.get_text()  # title of the work
                print('a_href', href, '---folder_name', folder_name)
                new_path = path + '/' + author + '/' + folder_name
                self.createFolder(new_path)  # create the folder

                self.readPageThree(href, new_path)  # read the single work

                # t = threading.Thread(target=self.readPageThree, args={href, new_path})
                # self.threads.append(t)
                # end for

        # end readPage  ---------------------------------------

        # open a work's page and iterate over its chapters
        def readPageThree(self, page_url, path):
            soup = self.getSoup(page_url)  # the work's page
            print('readPageThree--', page_url)
            a_list = soup.find('div', {'id': 'list'}).find_all('a')
            idx = 0  # chapter index
            for a_href in a_list:
                idx = idx + 1
                href = self.index_page_url + a_href.get('href')
                file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
                print('file_a_href', href, '---file_path', file_path)

                '''
                new_path = self.isTxt(file_path)
                if new_path:
                    print(new_path)
                    file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
                    file_object.write(href+','+new_path)
                    file_object.close()
                '''
                self.readPageFour(href, file_path)

                #self.href_list.append({'href': href, 'file_path': file_path})

                # multithreaded variant
                #t = threading.Thread(target=self.readPageFour, args={href, file_path})
                #t.start()
                #t.join(15)

        # end readPageThree  ---------------------------------------

        # read one chapter and write it to disk
        def readPageFour(self, page_url, path):
            new_path = self.isTxt(path)  # '' if the file exists, else a sanitized file name
            if new_path:
                soup = self.getSoup(page_url)
                con_div = soup.find('div', {'id': 'content'})  # the chapter text
                content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
                # content = content.replace('&amp;','').replace('amp;','').replace('rdquo;','').replace('ldquo;','')
                # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;")

                self.writeTxt(new_path, content)  # write the file

        # end readPageFour  ---------------------------------------

        def readPageHtml(self, page_url, path):
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})
            content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

        def createFolder(self, path):
            path = path.strip()
            # strip a trailing backslash
            path = path.rstrip("\\")
            rstr = r'[:*?"<>|]'  # characters illegal in file names: : * ? " < > |
            new_path = re.sub(rstr, "_", path)  # replace them with underscores
            is_exists = os.path.exists(new_path)
            # create the folder if it does not exist
            if not is_exists:
                os.makedirs(new_path)
                print('folder:', new_path + ' created')
            else:
                print(new_path + ' folder already exists')

        # end createFolder  ---------------------------------------

        def isTxt(self, path):
            path = path.strip()
            # strip a trailing backslash
            path = path.rstrip("\\")
            rstr = r'[:*?"<>|]'  # characters illegal in file names: : * ? " < > |
            new_path = re.sub(rstr, "_", path)  # replace them with underscores
            isExists = os.path.exists(new_path)
            if isExists:
                print(new_path, 'already exists')
                return ''
            else:
                return new_path

        # end createTxt ---------------------------------------

        def writeTxt(self, file_name, content):
            isExists = os.path.exists(file_name)
            if isExists:
                print(file_name, 'already exists')
            else:
                file_object = open(file_name, 'w', encoding='utf-8')
                file_object.write(content)
                file_object.close()

        # end writeTxt ------------------------------------------

        def run(self):
            try:
                self.readPageOne(None, 2)  # None -> read the page count from the site
            except BaseException as error:
                print('error--', error)

        def runTest(self):
            try:
                page_url = 'http://www.cuiweijuxs.com/4_4508/'
                path = '小说/runTest'
                self.readPageThree(page_url, path)
            except BaseException as error:
                print('error--', error)

        def testRun(self, num, time_):
            for i in range(3):
                print('num=', num, ctime())
                sleep(time_)

        def threadsRun(self):

            #self.readPageOne(122)

            for i in range(1, 123):
                page = str(i)
                t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
                #t = threading.Thread(target=self.testRun, args=(str(i),))
                self.threads.append(t)

            for t in self.threads:
                t.start()
            for t in self.threads:
                t.join()

            print('all end: %s' % ctime())


    class MyThread(threading.Thread):

        def __init__(self, func, args, name):
            threading.Thread.__init__(self)
            self.func = func
            self.args = args
            self.name = name

        def run(self):
            self.func(*self.args)


    Capture().threadsRun()
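    One caveat: threadsRun starts all 122 threads at once, which is hard on both the target site and the local network. A gentler variant (my suggestion, not part of the original post) caps concurrency with the standard library's ThreadPoolExecutor:

    from concurrent.futures import ThreadPoolExecutor
    from time import ctime

    # Download at most 8 listing pages at a time; map() blocks until all finish.
    capture = Capture()
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(lambda p: capture.readPageOneByThread(p, 2),
                 (str(i) for i in range(1, 123)))
    print('all end: %s' % ctime())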

      
