• Python Crawler Exercise: Scraping a Novel from 笔趣阁 (Part 1)


    An exercise in using requests and BeautifulSoup to scrape a complete novel and save it to the D: drive.

    The script is fairly slow, and the connection to the target server is easily interrupted.
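    One way to soften those interruptions is to reuse a single requests.Session with automatic retries instead of a bare requests.get per request. The snippet below is only a minimal sketch and not part of the original script; it assumes nothing beyond the standard requests/urllib3 retry API.

    # Minimal sketch (not part of the original script): a shared Session with retries
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retries))
    # session.get(url) could then replace requests.get(url=url) in the functions below.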

    # -*- coding:UTF-8 -*-
    import requests
    from bs4 import BeautifulSoup
    import re
    
    """
    Fetch the book's chapter list (table of contents)
    """
    
    
    def getBookContents(url):
        req = requests.get(url=url)
        # the site serves gb2312/GBK-encoded pages, so set the encoding before reading .text
        req.encoding = "gb2312"
        html = req.text
    
        dv_bf = BeautifulSoup(html, "html5lib")
        dv = dv_bf.find("div", class_="listmain")
        # dvs = dv_bf.find_all("div", class_="listmain")
    
        a_bf = BeautifulSoup(str(dv), "html5lib")
        a = a_bf.find_all("a")
    
        book_contents_list = []
        i = 0
        # skip the leading links, which are not part of the sequential chapter list
        for content in a[13:]:
            book_title = content.string
            book_url = content.get("href")
            try:
                # data cleaning: locate the character "章" in the title;
                # if it is missing, ValueError is raised and the entry is skipped
                book_title_index = str(book_title).index("章", 0)
                # slice past that index to get the bare chapter title
                new_book_title = book_title[book_title_index + 1:]
                # renumber the chapter and strip leading whitespace from the title
                i = i + 1
                new_book_titles = "第{}章".format(i) + new_book_title.lstrip()
                new_book_url = "http://www.biqukan.com{}".format(book_url)
                # print(new_book_titles, new_book_url)

                # store one chapter as a {title: url} dict
                contents = {new_book_titles: new_book_url}
                # append it to the list
                book_contents_list.append(contents)
            except ValueError:
                # the title does not contain "章", so this is not a proper chapter link
                print("***************** not a chapter entry, skipping ****************")
                print("original title =", book_title)
                print("original link =", book_url)
        return book_contents_list
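    # Illustrative note (an addition, not original output): each element of the returned list is a
    # single-pair dict such as {"第1章 ...": "http://www.biqukan.com/1_1094/....html"}, which is why
    # the __main__ block below iterates over li.keys() to recover both the title and the URL.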
    
    
    """
    Fetch the chapter text from a chapter URL
    """
    
    
    def getConnect(url):
        # example chapter URL: http://www.biqukan.com/1_1094/5403177.html
        req = requests.get(url=url)
        req.encoding = 'gb2312'
        html = req.text
        div_bf = BeautifulSoup(html, "html5lib")
        div = div_bf.find("div", id="content")
        # strip the embedded <script> tags before extracting the text
        [s.extract() for s in div('script')]
        # print(div.text)
        return div.text
    
    
    """
    Write the chapter text to a file
    """
    
    
    def saveData(filepath, text):
        with open(filepath, mode="w", encoding="UTF-8") as f:
            f.writelines(text)
            f.write('\n\n')
    
    
    if __name__ == '__main__':

        book_list = getBookContents("http://www.biqukan.com/1_1094")

        for li in book_list:
            # each item is a single-pair {title: url} dict
            filepath = "d:\\123\\"
            connecturl = ""
            for aa in li.keys():
                filepath = filepath + aa
                connecturl = li[aa]

            text = getConnect(connecturl)
            saveData(filepath, text)
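    The loop above writes each chapter to its own file under d:\123, so that directory must exist before the script runs. A small guard (an addition, not part of the original code) using only the standard library could look like this:

    # Minimal sketch (not in the original): create the output directory up front
    import os
    os.makedirs("d:\\123", exist_ok=True)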
    

     
