action.py
import requests from lxml import etree class Action: # 抓取章节列表页数据 def getList(book_link): r = requests.get(book_link) r.encoding = "gbk" html = etree.HTML(r.text) list_links = html.xpath('//div[@class="ml_list"]/ul/li/a/@href') book_name = html.xpath('//div[@class="introduce"]/h1/text()')[0] return {"book_name":book_name,"list_links":list_links} # 抓取章节内容页数据 def getContent(page_link): r = requests.get(page_link) r.encoding = "gbk" html = etree.HTML(r.text) title = html.xpath('//div[@class="nr_title"]/h3/text()') contents = html.xpath('//div[@class="articlecontent"]/p/text()') content = "" for p in contents: content = content + p + "\r\n"+"\r\n" return {"title":title[0],"content":content}
save.py
class Save: # 把内容写入文本文件 def saveTxt(file_name,txt_content): with open(file_name,"wb") as f: f.write(txt_content.encode("gbk"))
tools.py
import os import glob class Tools: # 创建文件夹 # 遇到重复文件夹命名为文件夹目录_1(2,3,4……) # 返回文件夹目录名称 def mkdir(path,root_flag=False): folder = os.path.exists(path) floder_path = path if not folder: os.makedirs(path) else: if not root_flag: num_p = 1 sub_path = glob.glob(path + '*') if sub_path: # 最后一个创建目录 last_path = sub_path[-1] floder_path = last_path + '_{}'.format(num_p) if last_path.find('_') > 0: num_str = last_path.split('_') if num_str[-1].isdigit(): num_p = int(num_str[-1]) + 1 floder_path = last_path[0:last_path.rfind( '_')] + '_{}'.format(num_p) os.makedirs(floder_path) else: os.makedirs(floder_path) else: os.makedirs(floder_path) return floder_path
main.py
from module.action import Action as action from module.save import Save as save from module.tools import Tools as tools def main(): book_link = input("请输入书籍封面页链接:") if book_link == "": return # 获取列表数据 l_data = action.getList(book_link) tools.mkdir("book/"+l_data["book_name"]) # 循环下载章节 for link in l_data["list_links"]: p_data = action.getContent("https://www.00ksw.com"+link) save_path = "book/"+l_data["book_name"]+"/"+p_data["title"]+".txt" save.saveTxt(save_path,p_data["content"]) print(p_data["title"]+"------抓取完成") if __name__ == "__main__": main()
运行: