python爬虫-搜索小说并下载

  1 #coding:utf-8
  2 import requests,os,re
  3 from bs4 import BeautifulSoup
  4 from selenium import webdriver
  5 from selenium.webdriver.chrome.options import Options
  6 from selenium.webdriver.common.keys import Keys
  7 
  8 class downloader():
  9 
 10     def __init__(self):
 11         self.urls = []  # 保存章节链接
 12         self.name = []  # 保存章节名
 13         self.url = 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqugex.com&q='
 14 
 15     """输入小说名，搜索"""
 16     def Get_url(self):
 17         #创建chrome参数对象，设置chrome浏览器无界面模式
 18         chrome_options = Options()
 19         chrome_options.add_argument('--headless')
 20         # 创建chrome无界面对象
 21         browser = webdriver.Chrome(options=chrome_options)
 22         browser.get(self.url)
 23         c = input('请输入小说全名：')
 24         browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[3]').send_keys(c)
 25         browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[4]').click()
 26         new_url = browser.current_url
 27         # 关闭浏览器
 28         browser.close()
 29         # 关闭chromedriver进程
 30         browser.quit()
 31         print("已关闭浏览器")
 32         # print(new_url)
 33         response = requests.get(new_url)
 34         response.encoding = 'utf-8'
 35         soup = BeautifulSoup(response.text, 'lxml')
 36         # print(soup)
 37         name1 = soup.find_all('span', class_='s2')
 38         soup = BeautifulSoup(str(name1), 'lxml')
 39         new_name = soup.find('a')
 40         new_name1 = new_name.string
 41         # print(new_name1)
 42         self.href = new_name.attrs['href']
 43         print(self.href)
 44         return self.href
 45     def Response(self):
 46         response = requests.get(self.href)
 47         response.encoding = 'gbk'  # 解决乱码
 48         self.soup = BeautifulSoup(response.text, 'lxml')  # 解析网页
 49         div = self.soup.find_all('div', class_='listmain')  # 在解析结果中查找class_='listmain'
 50         soup1 = BeautifulSoup(str(div), 'lxml')  # 删除字符串头和尾的空格
 51         h = soup1.find_all('a')  # 在class_='listmain下面找到a标签
 52         for i in h:
 53             self.name.append(i.string)  # 将a标签中的非属性字符，即章节名添加到name
 54             self.urls.append('https://www.biqugex.com%s' % i.get('href'))  # 将a标签中的链接，添加到urls
 55 
 56     def file(self):
 57         """查找小说名字，并创建同名文件夹"""
 58         div1 = self.soup.select('body > div.book > div.info > h2')
 59         a = BeautifulSoup(str(div1), 'lxml')
 60         b = a.find('h2')
 61         b = b.string
 62         c = 'C:\Users\Administrator\Desktop\%s' % b
 63         if not os.path.exists(c):
 64             os.mkdir(c)
 65 
 66         # 循环解析urls，得到小说正文
 67         i = 0
 68         while i < len(self.urls):
 69             response1 = requests.get(url=self.urls[i])
 70             response1.encoding = 'gbk'
 71             soup2 = BeautifulSoup(response1.text, 'lxml')
 72             d = soup2.find_all('div', id='content')
 73             id1 = BeautifulSoup(str(d), 'lxml')
 74             # 创建文件名
 75             src = self.name[i] + '.txt'
 76             filename = c + '/' + src
 77             print(filename)
 78 
 79             # 将解析到的小说正文写到文件中
 80             for result in id1:
 81                 res = result.text
 82                 id2 = soup2.select('#content')
 83                 with open(filename, 'w+', encoding='utf-8') as f:
 84                     f.write(res)
 85                 i += 1
 86 #如果输入的网址不是正确的网址，则提示请输入正确的笔趣阁网址
 87     def Main(self):
 88         try:
 89             d = downloader()
 90             d.Get_url()
 91         except:
 92             print('没有找到')
 93         else:
 94             d.Response()
 95             d.file()
 96 
 97 
 98 
 99 if __name__ == '__main__':
100     # url=input('请输入网址：')
101     # url='https://www.biqugex.com/book_104027/'
102     a = downloader()
103     a.Main()
相关阅读:
firefox配置
 安装gstreamer开发环境
 linux下批量替换文件内容(转)
iptables详细教程：基础、架构、清空规则、追加规则、应用实例(转)
iptables 使用
 如何用iptables实现NAT（转）
Python 练习题
 Python unittest 参数化
 Python Logging模块
 Python 多进程
原文地址：https://www.cnblogs.com/hfct/p/11661063.html