最近想看《盗墓笔记》,看了一下网页代码,竟然没有用 JS 反爬虫,那就用简单的代码爬下了一节:
""" 爬取盗墓笔记小说-七星鲁王宫 """ from urllib.request import urlopen from bs4 import BeautifulSoup from docx import Document import os class Download(): def __init__(self): self.baseUrl = 'http://www.daomubiji.com/nu-hai-qian-sha-' self.basePath = os.path.dirname(__file__) def makedir(self, name): path = os.path.join(self.basePath, name) isExist = os.path.exists(path) if not isExist: os.makedirs(path) print('File has been created.') else: print('The file is existed.') #切换到该目录下 os.chdir(path) def connect(self, url): try: html = urlopen(url) print(url) obj = BeautifulSoup(html, 'lxml') except: print('This page is not existed.') return obj def getContent(self): doc = Document() self.makedir('storyFiles') for page in range(1,47): if page < 10: url = self.baseUrl + '0' + str(page) + '.html' else: url = self.baseUrl + str(page) + '.html' obj = self.connect(url) content = obj.find('article', {'class': 'article-content'}) doc.add_paragraph(content.text) doc.save('盗墓笔记-怒海潜沙.doc') if __name__ == '__main__': obj = Download() obj.getContent()