import os

import requests
from bs4 import BeautifulSoup

# Site root; hrefs scraped from the site are relative and get joined onto this.
shici_url = 'http://www.shicimingju.com'
# Index page listing every book.
url = 'http://www.shicimingju.com/book/'
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    )
}


def get_book_detail(page_url):
    """Fetch one chapter page and return its text content.

    Args:
        page_url: Absolute URL of a chapter page.

    Returns:
        The chapter text as a single string (empty if nothing matched).
    """
    response = requests.get(url=page_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    # Most chapters wrap the text in <p> tags inside .chapter_content;
    # fall back to the container itself when no <p> children exist.
    paragraphs = soup.select('.chapter_content>p')
    if not paragraphs:
        paragraphs = soup.select('.chapter_content')
    # join() instead of repeated += (repeated concatenation is quadratic).
    return ''.join(p.text for p in paragraphs)


def get_book_list(book_url, f):
    """Download every chapter of one book and append it to *f*.

    Args:
        book_url: Absolute URL of the book's chapter-list page.
        f: Writable text file object. NOT closed here — the caller opened
           it and owns its lifetime (the original closed it in the callee,
           which is an ownership bug and leaked the handle on exceptions).
    """
    response = requests.get(url=book_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    for chapter in soup.select('.book-mulu>ul>li>a'):
        page_title = chapter.text
        print(page_title + "开始下载...")
        # Chapter hrefs are site-relative; join onto the site root.
        content = get_book_detail(shici_url + chapter['href'])
        f.write(page_title + " " + content + " ")
        print(page_title + "下载完成...")


def main():
    """Scrape the book index and save each book as a numbered .txt file."""
    response = requests.get(url=url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    # One <a> per book on the index page.
    a_list = soup.select('.bookmark-list>ul>li>h2>a')

    # Ensure the output directory exists.
    file_path = './史书/'
    os.makedirs(file_path, exist_ok=True)

    for n, a in enumerate(a_list, start=1):
        book_name = a.text
        print("<<%s>>正在下载..." % book_name)
        file_name = file_path + str(n) + '.' + book_name + '.txt'
        # NOTE(review): 'a+' appends, so re-running the script duplicates
        # content in existing files — confirm that is intended ('w' would
        # overwrite instead). Kept as-is to preserve original behavior.
        with open(file_name, 'a+', encoding='utf-8') as f:
            get_book_list(shici_url + a['href'], f)


if __name__ == '__main__':
    main()