• Python 3 + BeautifulSoup 4.6: scraping novels from a website (Part 3), page analysis and BeautifulSoup parsing


    What this part covers: crawl all the novels on the site and save them locally.

    Target site: www.cuiweijuxs.com

    Analyzing the pages shows the crawl takes four steps: go from the home page into a category section and open its page list, open every link on each list page, open each work's page, and finally open each chapter.

    The implementation steps are therefore as follows (a short BeautifulSoup sketch of these locating rules follows the list):

    1. Enter the category page, www.cuiweijuxs.com/jingpinxiaoshuo/

       and find the highest page number:

    <a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>

    then open every page in a loop:

    href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html" 

    2. Find all the work links on the current list page and open each one in a loop; the elements used to locate them are:

    <div id="newscontent">
    <div class="l">
      <span class="s2">
      <a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">Title</a>

    3. Open each work page and find its chapter list; the elements used to locate it are:

    <div id="list">
    <dd>
    <a href="/4_4508/528170.html">第一章</a>
    </dd>
    </div>

    4. Open each chapter link and read its content:

    <div id="content">

    内容
    <div>
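
    These locating rules map directly onto BeautifulSoup lookups. Below is a minimal sketch using the sample fragments from the analysis above (the live pages naturally contain more markup around them):

    from bs4 import BeautifulSoup

    # sample fragments copied from the analysis above
    html = '''
    <a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>
    <div id="newscontent"><div class="l">
      <span class="s2"><a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">Title</a></span>
    </div></div>
    <div id="list"><dd><a href="/4_4508/528170.html">Chapter 1</a></dd></div>
    '''
    soup = BeautifulSoup(html, 'html.parser')

    # 1. highest page number, taken from the <a class="last"> tag
    print(soup.find('a', 'last').string)                                    # 122
    # 2. link to a work page, via #newscontent > .l > .s2 > a
    print(soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
              .find('span', {'class': 's2'}).a['href'])
    # 3. link to a chapter page, via #list > a
    print(soup.find('div', {'id': 'list'}).a['href'])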

     

    Step 1: create the class, initialize its parameters, and abstract out a getSoup method that returns the BeautifulSoup-parsed page.

    # -*- coding: UTF-8 -*-
    from urllib import request
    from bs4 import BeautifulSoup
    import os

    '''
    Scrape web pages with BeautifulSoup
    '''

    class Capture():

        def __init__(self):
            self.index_page_url = 'http://www.cuiweijuxs.com/'
            self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
            self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
            self.folder_path = '小说/'
            self.head = {}
            # set the User-Agent header
            self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'

        # fetch a page and return its BeautifulSoup object
        def getSoup(self, query_url):
            req = request.Request(query_url, headers=self.head)
            webpage = request.urlopen(req)
            html = webpage.read()
            #soup = BeautifulSoup(html, 'html.parser')
            soup = BeautifulSoup(html, 'html5lib')
            return soup
            # end getSoup
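
    With only this much in place, getSoup can be sanity-checked on its own before the rest of the crawl is built on top of it, for example:

    capture = Capture()
    soup = capture.getSoup(capture.one_page_url)
    print(soup.title)    # the page's <title> tag; seeing it confirms both the request and the parse worked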

      

    Step 2: enter the category page, find the highest page number, and open every list page in a loop.

    # read the listing pages
    def readPageOne(self):
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", "last")
        itemSize = int(last.string)
        page_url = str(self.two_page_url)

        for item in range(itemSize):
            print(item)
            new_page_url = page_url.replace("?", str(item + 1))
            self.readPageTwo(new_page_url)

        # end readPageOne
    

      Use the getSoup method to get the parsed HTML, use find to locate the <a> tag whose class is "last", and take its text as the highest page number.

      Then loop over the pages, starting from 1.
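
      Note that soup.find("a", "last") is BeautifulSoup shorthand: a plain string in the second position is matched against the CSS class, so the call is equivalent to the more explicit form below.

    last = soup.find("a", class_="last")    # same element as soup.find("a", "last")
    itemSize = int(last.string)             # e.g. 122 for the sample <a class="last"> shown earlier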

    Step 3: read the links on a single list page.

    # read the links on one list page
    def readPageTwo(self, page_url):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
        print(a_list)
        for a_href in a_list:
            href = a_href.get('href')
            folder_name = a_href.get_text()
            print('a_href', href, '---folder_name', folder_name)
            path = self.folder_path + folder_name
            self.createFolder(path)
            self.readPageThree(href, path)
            # end for

    # end readPageTwo
    

      Find the tag whose id is "newscontent", then the div with class "l" inside it, then every span with class "s2", and finally the <a> tags inside those spans; loop over the <a> tags,

         using the link text ( a_href.get_text() ) as the folder name.
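
      The same chain of find/find_all calls can also be expressed as a single CSS selector, which may be easier to read; a sketch of the equivalent lookup:

    # equivalent to the chained find(...) calls in readPageTwo above
    a_list = soup.select('#newscontent div.l span.s2 a')
    for a_href in a_list:
        print(a_href.get('href'), a_href.get_text())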

    Step 4: open the work page, loop over the chapter links, and build the file names.

    # open a work page
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            isExists = os.path.exists(txt_name)
            if isExists:
                print(txt_name, 'already exists')
            else:
                self.readPageFour(href, txt_name)
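
     Building txt_name by string concatenation works, but os.path.join handles the path separator more robustly across platforms. A small sketch with a hypothetical helper (chapter_file_name is not part of the original code):

    import os

    def chapter_file_name(path, idx, title):
        # e.g. chapter_file_name('小说/SomeBook', 1, 'Chapter 1') -> '小说/SomeBook/1_Chapter 1.txt'
        return os.path.join(path, '{}_{}.txt'.format(idx, title))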
    

      

     Step 5: open the chapter link, read everything under the div with id=content, and write it to a file.

    # read one chapter and write it to a file
    def readPageFour(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
        self.writeTxt(path, content)
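
     One caveat: get_text() already strips tags and decodes HTML entities, so by the time replace('<br/>', ...) runs there is no '<br/>' text left to replace. If the goal is one output line per <br/> in the source, an alternative is to pass a separator to get_text(), which inserts it between the text fragments (in this content div those breaks come from the <br/> tags):

    from bs4 import BeautifulSoup

    con_div = BeautifulSoup('<div id="content">line one<br/>line&nbsp;two</div>', 'html.parser').div
    print(repr(con_div.get_text()))                            # 'line oneline\xa0two' (tags gone, entity decoded)
    print(repr(con_div.get_text('\n').replace('\xa0', ' ')))   # 'line one\nline two'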

     The complete implementation:

    # -*- coding: UTF-8 -*-
    from urllib import request
    from bs4 import BeautifulSoup
    import os

    '''
    Scrape web pages with BeautifulSoup
    '''

    class Capture():

        def __init__(self):
            self.index_page_url = 'http://www.cuiweijuxs.com/'
            self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
            self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
            self.folder_path = '小说/'
            self.head = {}
            # set the User-Agent header
            self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'

        # fetch a page and return its BeautifulSoup object
        def getSoup(self, query_url):
            req = request.Request(query_url, headers=self.head)
            webpage = request.urlopen(req)
            html = webpage.read()
            #soup = BeautifulSoup(html, 'html.parser')
            soup = BeautifulSoup(html, 'html5lib')
            return soup
            # end getSoup

        # read the listing pages
        def readPageOne(self):
            soup = self.getSoup(self.one_page_url)
            last = soup.find("a", "last")
            itemSize = int(last.string)
            page_url = str(self.two_page_url)

            for item in range(itemSize):
                print(item)
                new_page_url = page_url.replace("?", str(item + 1))
                self.readPageTwo(new_page_url)

            # end readPageOne

        # read the links on one list page
        def readPageTwo(self, page_url):
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
            a_list = con_div.find_all('span', {'class': 's2'})[0].find_all('a')
            print(a_list)
            for a_href in a_list:
                href = a_href.get('href')
                folder_name = a_href.get_text()
                print('a_href', href, '---folder_name', folder_name)
                path = self.folder_path + folder_name
                self.createFolder(path)
                self.readPageThree(href, path)
                # end for

            # end readPageTwo

        # open a work page and loop over its chapter links
        def readPageThree(self, page_url, path):
            soup = self.getSoup(page_url)
            print('readPageThree--', page_url)
            a_list = soup.find('div', {'id': 'list'}).find_all('a')
            idx = 0
            for a_href in a_list:
                idx = idx + 1
                href = self.index_page_url + a_href.get('href')
                txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
                print('a_href', href, '---path', txt_name)
                isExists = os.path.exists(txt_name)
                if isExists:
                    print(txt_name, 'already exists')
                else:
                    self.readPageFour(href, txt_name)

        # read one chapter and write it to a file
        def readPageFour(self, page_url, path):
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})
            content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
            self.writeTxt(path, content)

        # unused helper (never called)
        def readPageHtml(self, page_url, path):
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})
            content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

        def createFolder(self, path):
            path = path.strip()
            # strip a trailing backslash
            path = path.rstrip("\\")
            isExists = os.path.exists(path)
            # create the folder if it does not exist yet
            if not isExists:
                os.makedirs(path)
                print(path + ' created')
            else:
                print(path + ' directory already exists')
            # end createFolder

        def writeTxt(self, file_name, content):
            isExists = os.path.exists(file_name)
            if isExists:
                print(file_name, 'already exists')
            else:
                file_object = open(file_name, 'w', encoding='utf-8')
                file_object.write(content)
                file_object.close()

        def run(self):
            try:
                self.readPageOne()
            except BaseException as error:
                print('error--', error)


    Capture().run()
  • Original article: https://www.cnblogs.com/yaomaomao/p/8745343.html