• 笔趣阁小说 selenium爬取


    import re
    from time import sleep
    
    from lxml import etree
    from selenium import webdriver
    
    # Chrome launch options shared by the WebDriver created in the script entry point.
    options = webdriver.ChromeOptions()
    # Uncomment to run without a visible browser window.
    # options.add_argument('--headless')
    # Chrome's switch is the lowercase ``--user-agent``; switch names are
    # case-sensitive outside Windows, so the previous ``User-Agent=`` spelling
    # was ignored there.
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
    # NOTE(review): "Referer=" (here) and "Cookie: " (below) do not appear to be
    # Chrome command-line switches, so they most likely have no effect. They are
    # kept unchanged until that is confirmed.
    options.add_argument("Referer=https://s.weibo.com/")
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Skip image loading; only the page text is scraped.
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('--disable-gpu')
    options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
    options.add_argument(
        'Cookie: ')
    
    
    class Qidian:
        """Crawl a novel's chapter index with Selenium and save all chapters to one text file.

        The index page at ``url`` is loaded through ``driver``; the novel title
        found there becomes the output file name, and every chapter linked from
        the index is fetched and appended to that file.
        """

        def __init__(self, url, driver):
            """Store the crawl settings and derive the output file name.

            Args:
                url: Address of the novel's chapter-index page.
                driver: Selenium WebDriver used for every page load.
            """
            self.url = url
            self.driver = driver
            content = self.get_content(url)
            self.file_name = self.pase_file_name(content)

        def crawl_start(self):
            """Fetch the index page and download every chapter it links to."""
            content = self.get_content(self.url)
            self.parse_detail(content)

        def get_content(self, url):
            """Load ``url`` in the browser and return the resulting page source."""
            self.driver.get(url)
            # Use the instance's driver, not a module-level global of the same
            # name, so the class works wherever it is instantiated.
            return self.driver.page_source

        def pase_file_name(self, content):
            """Return ``<novel title>.txt`` taken from the index page's ``#info`` heading.

            The misspelt method name is kept for backward compatibility.

            Raises:
                IndexError: If the page contains no ``#info/h1`` text.
            """
            html = etree.HTML(content)
            file_info = html.xpath('//*[@id="info"]/h1/text()')
            return file_info[0] + ".txt"

        def parse_detail(self, content):
            """Download each chapter listed in ``content`` and append it to the output file.

            Args:
                content: HTML source of the chapter-index page.
            """
            # Local import keeps this block self-contained without touching the
            # file-level import list.
            from urllib.parse import urljoin

            index_page = etree.HTML(content)
            chapters = index_page.xpath('//div[@id="list"]/dl//dd')
            # Truncate the output file up front so a re-run does not append to
            # stale data; the context manager closes the handle immediately.
            with open(self.file_name, 'w', encoding='utf-8'):
                pass
            for chapter in chapters:
                title = chapter.xpath('./a/text()')
                href = chapter.xpath('./a/@href')
                if not title or not href:
                    # A <dd> without a linked title is not a chapter entry.
                    continue

                item = {}
                item['title'] = title[0]
                # Resolve against the index URL actually being crawled instead
                # of a hard-coded address; also handles absolute hrefs.
                item['href'] = urljoin(self.url, href[0])
                print(item)
                chapter_page = etree.HTML(self.get_content(item['href']))
                details = chapter_page.xpath('//*[@id="content"]//text()')
                detail = ''.join(details)

                self.save_to_file(self.file_name, item['title'], detail)
                # Pause between requests to avoid hammering the site.
                sleep(3)

        def save_to_file(self, file_name, title, content):
            """Append one chapter (title line, body, trailing newline) to ``file_name``.

            Args:
                file_name: Path of the text file to append to.
                title: Chapter title, written on its own line.
                content: Chapter body text.
            """
            # Explicit UTF-8 so Chinese text is written correctly regardless of
            # the platform's default encoding.
            with open(file_name, 'a', encoding='utf-8') as f:
                f.write(title + '\n')
                f.write(content)
                f.write('\n')
    
    
    if __name__ == "__main__":
        # Script entry point: crawl one novel index and always shut the browser down.
        url = "http://www.biquge.info/0_273/"
        driver = webdriver.Chrome(options=options)
        try:
            qidian = Qidian(url, driver)
            qidian.crawl_start()
        except Exception as e:
            # Top-level boundary: report the failure instead of a raw traceback.
            print(str(e))
        finally:
            # Quit in ``finally`` so a failed crawl does not leave a Chrome
            # process running.
            driver.quit()
    

      

  • 相关阅读:
    Linux下查找大文件以及目录
    Linux 下定时备份数据库以及删除缓存
    java中main方法的 (String []args)
    RabbitMQ消息队列(二):”Hello, World“
    maven 多模块项目
    java 接口的作用和好处
    Centos下使用压缩包安装MySQL5.7
    修复mysql:[ERROR] Native table ‘performance_schema’
    连接Mysql提示Can’t connect to local MySQL server through socket的解决方法
    centos6下无法使用lsof命令"-bash: lsof: command not found"
  • 原文地址:https://www.cnblogs.com/brady-wang/p/12541164.html
Copyright © 2020-2023  润新知