# Python 3.5
# Scrapes the novel "The Return of the Condor Heroes" (神雕侠侣) from
# http://www.kanunu8.com/wuxia/201102/1610.html
# Written by a wuxia fan who enjoys scraping wuxia novels.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver
import os
from docx import Document
import re


class House():
    """Scrape every chapter of the novel from kanunu8.com and save them all
    into a single .docx document."""

    def __init__(self):
        # UA header kept for reference; note that the PhantomJS driver below
        # does not actually consume self.headers.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
        # Index page listing the links to all chapters.
        self.baseUrl = 'http://www.kanunu8.com/wuxia/201102/1610.html'
        # Directory containing this script; the output folder is created under it.
        self.basePath = os.path.dirname(__file__)
        # Compiled once (was rebuilt per link). Chapter URLs end with a
        # 5-digit page name, e.g. .../12345.html. Fix: the '.' before 'html'
        # is now escaped -- the original pattern matched any character there.
        self.chapterPattern = re.compile(r".+/[0-9]{5}\.html$")

    def makedir(self, name):
        """Create (if needed) the directory `name` under basePath and chdir into it."""
        path = os.path.join(self.basePath, name)
        if not os.path.exists(path):
            os.makedirs(path)
            print('File has been created.')
        else:
            print('The file is existed.')
        # Switch into that directory so doc.save() writes the output there.
        os.chdir(path)

    def connect(self, url):
        """Open `url` in a headless PhantomJS browser.

        Returns the live driver, or None when the page cannot be loaded
        (callers must check -- the original crashed on the None return).
        """
        try:
            driver = webdriver.PhantomJS()
            driver.get(url)
            return driver
        except Exception:  # narrowed from a bare except; keeps best-effort behavior
            print('This page is not existed.')
            return None

    def getBookLinkList(self, url):
        """Collect the chapter-page links from the index page at `url`."""
        driver = self.connect(url)
        bookLinkList = []
        if driver is None:  # fix: connect() may return None
            return bookLinkList
        try:
            for link in driver.find_elements_by_xpath("//a"):
                temp = link.get_attribute('href')
                print(temp)
                # fix: href can be None for anchors without one, and
                # pattern.match(None) raises TypeError (was hidden by a bare except).
                if temp and self.chapterPattern.match(temp):
                    print('ok')
                    bookLinkList.append(temp)
        except Exception:
            print('Error')
        finally:
            driver.quit()  # fix: the browser process was never shut down
        return bookLinkList

    def getBookDetail(self, url):
        """Return (title, content) of the chapter at `url`; empty strings on failure."""
        # fix: these were unbound on failure, so the final return raised
        # UnboundLocalError instead of reporting the error.
        title, content = '', ''
        driver = self.connect(url)
        if driver is None:
            return title, content
        try:
            title = driver.find_element_by_xpath('//h2').text
            # NOTE(review): only the FIRST <p> is captured -- assumed to hold the
            # whole chapter body on this site; verify against the page layout.
            content = driver.find_element_by_xpath('//p').text
            print(title)
            print(content)
        except Exception:
            print('Error.')
        finally:
            driver.quit()  # fix: resource leak
        return title, content

    def getData(self):
        """Scrape every chapter and save them into one .docx file."""
        doc = Document()
        self.makedir('StoryFiles')
        for linkUrl in self.getBookLinkList(self.baseUrl):
            title, content = self.getBookDetail(linkUrl)
            # fix: add_paragraph() expects a string; the original passed the
            # whole (title, content) tuple, which raises at runtime.
            doc.add_paragraph(title)
            doc.add_paragraph(content)
        doc.save('神雕侠侣.docx')


if __name__ == '__main__':
    house = House()
    house.getData()