• Python XPath scraping of the novel 《三国演义》 (Romance of the Three Kingdoms), Part 2


    Yesterday's code did reach the goal of crawling the novel, but the results were not ideal and the run was easily interrupted.

    Today's improvements: each chapter is written out to a text file, the save folder (including nested directories) is created if it does not already exist, and a random User-Agent header is picked for every request, so quite a bit more content can be crawled.

    Even so, the crawl still gets interrupted from time to time...

    from lxml import etree
    import requests
    import time
    import os
    import random
    
    
    def getHeaders():
    
        # pick a random User-Agent header for each request
    
        user_agents = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                       'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                       'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
                       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
                       'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                       'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
                       'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
                       'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'
                       ]
        headers = {'User-Agent': random.choice(user_agents)}
        return headers
    
    """
    request请求头
    """
    def getRequestHtml(target):
        req = requests.get(url = target,headers = getHeaders())
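        # Note (not in the original post): kanunu8 pages are GB-encoded; if some
        # characters come out garbled, "gbk" (a superset of gb2312) or
        # req.apparent_encoding is usually a safer choice than "gb2312".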
        req.encoding = "gb2312"
        html = req.text
        return html
    
    """
    获取章节列表和地址
    """
    def  getContents(target):
    
        html = getRequestHtml(target)
        bookdata = etree.HTML(html)
        table_list = bookdata.xpath('//table[9]//tr[1]//td[2]//table[4]//tr[1]//td[1]//table[1]//a')
        return table_list
    
    """
    获取小说内容
    """
    def getContent(target):
        html = getRequestHtml(target)
        bookdata = etree.HTML(html)
        table_list = bookdata.xpath('//table[5]//tr[1]//td[2]//text()')
        return table_list
    
    """
    将小说内容写入到文件
    """
    
    
    def saveData(filepath, name, text):
    
        isExists = os.path.exists(filepath)
    
        if not isExists:
            os.makedirs(filepath)
            print("创建文件夹成功")
        else:
            print("文件夹已存在")
    
        url = filepath+name+".txt"
        with open(url, mode="w", encoding="UTF-8") as f:
            f.writelines(text)
            f.write('\n\n')
    
    
    if __name__ == '__main__':
        # table-of-contents URL for 三国演义
        target = "https://www.kanunu8.com/files/old/2011/2447.html"
        # get the list of chapter titles and links
        title_list = getContents(target)
    
        for t in title_list:
            title = t.text
            url = "https://www.kanunu8.com/files/old/2011/"+t.get('href')
            print(title,url)
            text = getContent(url)
            time.sleep(2)
            filePath = "D:\小说\三国演义\"
            saveData(filePath,title,text)
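

    The remaining interruptions are most likely timeouts or connection resets on the target site's side. Below is a minimal, more defensive variant of getRequestHtml, as a sketch only: the retry count, timeout and back-off values are arbitrary choices, not something taken from the original code.

    def getRequestHtmlSafe(target, retries=3, timeout=10):
        # retry a few times with a growing pause instead of letting a single
        # failed request stop the whole crawl
        for attempt in range(retries):
            try:
                req = requests.get(url=target, headers=getHeaders(), timeout=timeout)
                req.encoding = "gb2312"
                return req.text
            except requests.RequestException as e:
                print("request failed, retrying:", e)
                time.sleep(2 * (attempt + 1))
        return ""

    If getContents and getContent call this helper instead of getRequestHtml, a chapter that keeps failing comes back as empty text and can simply be skipped, rather than ending the whole run.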
    

      

  • Original post: https://www.cnblogs.com/dangzhengtao/p/12217555.html