• Python 抓取小说——练习


    action.py

    import requests
    from lxml import etree
    class Action:
        """Download pages from the novel site and extract data with XPath.

        Every method is a static method, so it can be called on the class
        (``Action.getList(url)``) or on an instance.
        """

        # Seconds to wait for the server before a request is abandoned;
        # without a timeout a stalled connection would block forever.
        REQUEST_TIMEOUT = 10

        @staticmethod
        def _fetch_html(url):
            """Download *url*, decode it as GBK and return the parsed lxml tree."""
            r = requests.get(url, timeout=Action.REQUEST_TIMEOUT)
            # The pages are decoded as GBK regardless of the response headers.
            r.encoding = "gbk"
            return etree.HTML(r.text)

        # Scrape the chapter-list page.
        @staticmethod
        def getList(book_link):
            """Return the book name and chapter links found on the cover page.

            Args:
                book_link: URL of the book's cover / chapter-list page.

            Returns:
                A dict with ``"book_name"`` (text of the first matching
                ``<h1>``) and ``"list_links"`` (list of the chapter ``href``
                values exactly as they appear in the page).

            Raises:
                IndexError: if the page contains no book-title element.
                requests.RequestException: if the download fails or times out.
            """
            html = Action._fetch_html(book_link)
            list_links = html.xpath('//div[@class="ml_list"]/ul/li/a/@href')
            book_name = html.xpath('//div[@class="introduce"]/h1/text()')[0]
            return {"book_name": book_name, "list_links": list_links}

        # Scrape a chapter-content page.
        @staticmethod
        def getContent(page_link):
            """Return the title and body text of one chapter page.

            Args:
                page_link: absolute URL of the chapter page.

            Returns:
                A dict with ``"title"`` (first matching ``<h3>`` text) and
                ``"content"`` (every paragraph followed by a blank line,
                using CRLF line endings).

            Raises:
                IndexError: if the page contains no chapter-title element.
                requests.RequestException: if the download fails or times out.
            """
            html = Action._fetch_html(page_link)
            title = html.xpath('//div[@class="nr_title"]/h3/text()')
            contents = html.xpath('//div[@class="articlecontent"]/p/text()')
            # Each paragraph is terminated by two CRLFs (one blank line).
            content = "".join(p + "\r\n\r\n" for p in contents)
            return {"title": title[0], "content": content}

    save.py

    class Save:
        """Persist scraped text to disk."""

        # Write the content to a text file.
        @staticmethod
        def saveTxt(file_name, txt_content):
            """Write *txt_content* to *file_name* as GBK-encoded bytes.

            The file is opened in binary mode so the line endings already
            present in *txt_content* are written unchanged. Characters that
            GBK cannot represent are written as ``?`` instead of raising
            ``UnicodeEncodeError``, so a single stray character does not abort
            the whole download.

            Args:
                file_name: path of the file to create or overwrite.
                txt_content: text to store.
            """
            with open(file_name, "wb") as f:
                f.write(txt_content.encode("gbk", errors="replace"))

    tools.py

    import os
    import glob
    class Tools:
        """File-system helpers."""

        # Create a directory.
        # If it already exists, create "<path>_1" (then _2, _3, ...) instead.
        # Returns the path of the directory to use.
        @staticmethod
        def mkdir(path, root_flag=False):
            """Create *path*, or a numbered sibling when *path* already exists.

            Args:
                path: directory to create (missing parents are created too).
                root_flag: when true and *path* already exists, reuse it
                    instead of creating a numbered sibling.

            Returns:
                The directory that was created, or *path* itself when it
                already existed and *root_flag* is true.
            """
            if not os.path.exists(path):
                os.makedirs(path)
                return path
            if root_flag:
                return path

            # Find the highest existing "<path>_<number>" sibling. The numbers
            # are compared as integers because glob order is arbitrary and
            # "_10" sorts before "_2" as text. glob.escape keeps characters
            # such as "[" in the path from being read as wildcards.
            base_len = len(os.path.basename(path)) + 1  # base name plus "_"
            highest = 0
            for candidate in glob.glob(glob.escape(path) + '_*'):
                suffix = os.path.basename(candidate)[base_len:]
                if suffix.isdecimal():
                    highest = max(highest, int(suffix))

            folder_path = '{}_{}'.format(path, highest + 1)
            os.makedirs(folder_path)
            return folder_path

    main.py

    from module.action import Action as action
    from module.save import Save as save
    from module.tools import Tools as tools
    
    def _safe_name(name):
        """Return *name* stripped, with characters illegal in file names replaced by "_"."""
        unsafe = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
        return name.strip().translate(unsafe)


    def main():
        """Ask for a book cover URL and download every chapter to book/<name>/.

        Each chapter is written to its own ``<title>.txt`` file. If the book
        directory already exists, the numbered directory returned by
        ``tools.mkdir`` is used, so an earlier download is not overwritten.
        """
        # Imported locally so this function does not depend on the
        # module's top-level import list.
        import os
        from urllib.parse import urljoin

        book_link = input("请输入书籍封面页链接:").strip()
        if not book_link:
            return
        # Fetch the chapter list.
        l_data = action.getList(book_link)
        # Save into the directory mkdir actually created; it may be a
        # numbered variant of the requested path.
        book_dir = tools.mkdir(os.path.join("book", _safe_name(l_data["book_name"])))
        # Download the chapters one by one.
        for link in l_data["list_links"]:
            # Resolve the href against the cover page, so both site-relative
            # and absolute links work.
            p_data = action.getContent(urljoin(book_link, link))
            save_path = os.path.join(book_dir, _safe_name(p_data["title"]) + ".txt")
            save.saveTxt(save_path, p_data["content"])
            print(p_data["title"]+"------抓取完成")
    
    
    # Run the scraper only when this file is executed as a script, not when imported.
    if __name__ == "__main__":
        main()

    运行:

  • 相关阅读:
    在ModelSim中添加Xilinx仿真库
    关于DDR3非常棒的文章
    Modelsim编译Xilinx器件库的另一种方法
    FPGA按键去抖verilog代码
    DDOS与DDOS追踪的介绍
    自动实时监控Windows2003服务器终端登录
    Java 如何实现线程间通信
    C# 获取FormData数据
    利用 FormData 对象发送 Key/Value 对的异步请求
    jdk初始安装配置
  • 原文地址:https://www.cnblogs.com/wordblog/p/16102038.html
Copyright © 2020-2023  润新知