笔趣阁小说 selenium爬取

import re
from time import sleep
from urllib.parse import urljoin

from lxml import etree
from selenium import webdriver

# Chrome launch options used by the driver created in the __main__ section.
options = webdriver.ChromeOptions()
# Uncomment to run Chrome without a visible window.
# options.add_argument('--headless')
# Chrome switches need their leading dashes; an HTTP-header-style
# "User-Agent=..." string is not parsed as a switch, so the user agent is
# set through --user-agent instead.
options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--blink-settings=imagesEnabled=false')  # do not load images
options.add_argument('--disable-gpu')
options.add_argument('--hide-scrollbars')  # hide scrollbars, to cope with some special pages
# NOTE: the earlier "Referer=https://s.weibo.com/" and "Cookie: " arguments
# were removed: they are HTTP header lines, not Chrome switches, so they
# could not set request headers. If cookies are needed, add them with
# driver.add_cookie() after the site has been opened.


class Qidian:
    """Download all chapters of one novel into a single UTF-8 text file.

    The novel is identified by the URL of its chapter index page. Every page
    is loaded through the Selenium driver passed to the constructor and
    parsed with lxml.
    """

    # Seconds to pause after each chapter so the site is not hammered.
    chapter_delay = 3

    def __init__(self, url, driver):
        """Store the index URL and driver, and derive the output file name.

        Args:
            url: URL of the novel's chapter index page.
            driver: Selenium WebDriver used for every page load.

        The index page is loaded once here to read the novel title.
        """
        self.url = url
        self.driver = driver
        content = self.get_content(url)
        self.file_name = self.pase_file_name(content)

    def crawl_start(self):
        """Load the index page and download every chapter listed on it."""
        content = self.get_content(self.url)
        self.parse_detail(content)

    def get_content(self, url):
        """Navigate the driver to *url* and return the page source.

        Uses the driver given to the constructor, not a module-level global,
        so the class also works when imported from another module.
        """
        self.driver.get(url)
        return self.driver.page_source

    def pase_file_name(self, content):
        """Return "<novel title>.txt" extracted from the index page HTML.

        Raises:
            IndexError: if the page has no ``#info > h1`` text node.
        """
        html = etree.HTML(content)
        file_info = html.xpath('//*[@id="info"]/h1/text()')
        return file_info[0] + ".txt"

    def parse_detail(self, content):
        """Download each chapter linked from the index page HTML.

        The output file is emptied first; each chapter's title and text are
        then appended in page order, pausing ``chapter_delay`` seconds
        between chapters.
        """
        html = etree.HTML(content)
        chapters = html.xpath('//div[@id="list"]/dl//dd')
        # Start from an empty file; the context manager closes the handle
        # immediately instead of leaking it.
        with open(self.file_name, 'w', encoding='utf-8'):
            pass
        for chapter in chapters:
            title = chapter.xpath('./a/text()')
            href = chapter.xpath('./a/@href')
            if not title or not href:
                # <dd> without a usable link: nothing to download.
                continue

            # Resolve against the index URL so relative and absolute links
            # both work, for any novel and not only one hard-coded book.
            item = {'title': title[0], 'href': urljoin(self.url, href[0])}
            print(item)
            page = etree.HTML(self.get_content(item['href']))
            details = page.xpath('//*[@id="content"]//text()')
            detail = ''.join(details)

            self.save_to_file(self.file_name, item['title'], detail)
            sleep(self.chapter_delay)

    def save_to_file(self, file_name, title, content):
        """Append one chapter (title line, body, trailing newline) to a file.

        The file is written as UTF-8 so Chinese text and characters such as
        non-breaking spaces do not depend on the platform's default encoding.
        """
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(title + '\n')
            f.write(content)
            f.write('\n')


# Script entry point: crawl one novel with a freshly started Chrome.
if __name__ == "__main__":
    url = "http://www.biquge.info/0_273/"
    driver = webdriver.Chrome(options=options)
    try:
        qidian = Qidian(url, driver)
        qidian.crawl_start()
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(str(e))
    finally:
        # Always shut the browser down, even after an error, so no Chrome
        # process is left running.
        driver.quit()

相关阅读:
Linux下查找大文件以及目录
 Linux 下定时备份数据库以及删除缓存
 java中main方法的 (String []args)
RabbitMQ消息队列（二）：“Hello, World”
maven 多模块项目
 java 接口的作用和好处
 Centos下使用压缩包安装MySQL5.7
修复mysql：[ERROR] Native table ‘performance_schema’
连接Mysql提示Can’t connect to local MySQL server through socket的解决方法
 centos6下无法使用lsof命令"-bash: lsof: command not found"
原文地址：https://www.cnblogs.com/brady-wang/p/12541164.html