• python-crawler-史书典籍 (classic history books)


    import requests
    import os
    from lxml import html
    import time
    
    
    def get_title_url(tree):
        '''Level 1: get the book titles and links'''
        # "史书典籍" (history classics) section of the site
        # Format: /book/sanguoyanyi.html
        History_book_url_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/@href")
        # Format: 三国演义
        History_book_name_list = tree.xpath("//div[@class='index-li'][3]/ul/li/a/text()")
        return History_book_url_list, History_book_name_list
    
    
    def get_article_url(tree):
        '''Level 2: get the chapter titles and links of one book'''
        # Chapters of a single book, e.g. 三国演义
        # Format: /book/sanguoyanyi/1.html
        book_url_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/@href")
        # Format: 第一回·宴桃园豪杰三结义  斩黄巾英雄首立功
        book_name_list = tree.xpath("//div[@class='book-mulu']/ul/li/a/text()")
        return book_url_list, book_name_list
    
    
    def get_article(tree):
        '''Level 3: get the chapter text'''
        # Page of a single chapter, e.g. 第一回·宴桃园豪杰三结义  斩黄巾英雄首立功
        # URL format: /book/sanguoyanyi/1.html
        article_list = tree.xpath("//div[@class='chapter_content']/p/text()")
        return ''.join(article_list)
    
    def get_request(url, headers):
        '''Fetch a page and parse it into an lxml element tree'''
        response = requests.get(url=url, headers=headers)
        # Assumption: the server may omit the charset header, so fall back to the
        # detected encoding to avoid garbled Chinese text
        response.encoding = response.apparent_encoding
        tree = html.fromstring(response.text)
        return tree
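
    # get_request above has no timeout and no error handling, so a single network
    # hiccup aborts the whole crawl. The wrapper below is an optional sketch of a
    # retrying variant; the name get_request_with_retry and its parameters are
    # illustrative, not part of the original post.
    def get_request_with_retry(url, headers, retries=3, timeout=10):
        '''Fetch and parse a page, retrying a few times on network errors'''
        for attempt in range(retries):
            try:
                response = requests.get(url=url, headers=headers, timeout=timeout)
                return html.fromstring(response.text)
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(2)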
    
    def save_mkdir(two):
        '''Create the output folders (top level and per book)'''
        # Top-level folder
        os.makedirs('史书典籍', exist_ok=True)
        # Per-book subfolder
        os.makedirs('史书典籍/' + two, exist_ok=True)
    
    def police_2(a):
        '''Level-2 checkpoint: skip books already crawled in a previous, interrupted run'''
        if os.path.exists('史书典籍/police_2.txt'):
            with open('史书典籍/police_2.txt', 'r') as f:
                b = f.read()
            if b and a < int(b):
                # Already past this book index in a previous run
                return False
        # Record progress and continue
        with open('史书典籍/police_2.txt', 'w') as f:
            f.write(str(a))
        return True
    
    
    
    def police_3(a):
        '''Level-3 checkpoint: skip chapters already crawled in a previous, interrupted run'''
        if os.path.exists('史书典籍/police_3.txt'):
            with open('史书典籍/police_3.txt', 'r') as f:
                b = f.read()
            if b and a < int(b):
                # Already past this chapter index in a previous run
                return False
        # Record progress and continue
        with open('史书典籍/police_3.txt', 'w') as f:
            f.write(str(a))
        return True
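
    # police_2 and police_3 differ only in the checkpoint file they use, so both
    # could delegate to one helper. The sketch below is illustrative (the name
    # `checkpoint` is not part of the original post): police_2(a) would become
    # checkpoint('史书典籍/police_2.txt', a), and police_3 likewise.
    def checkpoint(path, a):
        '''Generic resume check: skip indexes already passed in a previous run'''
        if os.path.exists(path):
            with open(path, 'r') as f:
                b = f.read()
            if b and a < int(b):
                return False
        with open(path, 'w') as f:
            f.write(str(a))
        return True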
    
    
    def main():
        '''Main entry point'''
        # Site root
        root = 'http://www.shicimingju.com'
        # Request headers
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }


        # Fetch the root page
        tree1 = get_request(root, headers)
        # Level 1: book names and links
        History_book_url_list, History_book_name_list = get_title_url(tree1)
        # Level 2: crawl each book
        for i in range(len(History_book_url_list)):
            if police_2(i) is False:
                continue
            # Book page URL
            url2 = root + History_book_url_list[i]
            print("爬取>>>" + History_book_name_list[i] + '开始')
            tree2 = get_request(url2, headers)
            # Level 2: chapter names and links
            book_url_list, book_name_list = get_article_url(tree2)
            # Create the output folders
            save_mkdir(History_book_name_list[i])
            # Level 3: download each chapter
            for j in range(len(book_url_list)):
                if police_3(j) is False:
                    continue
                time.sleep(1)
                # Chapter page URL
                url3 = root + book_url_list[j]
                print("爬取:" + book_name_list[j])
                # Chapter text
                tree3 = get_request(url3, headers)
                txt = get_article(tree3)
                # Chapter title
                txt_name = book_name_list[j]
                # Save the chapter
                file_path = '史书典籍/{}/{}.txt'.format(History_book_name_list[i], txt_name.replace(' ', '').replace('·', ''))
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(txt)
            # Reset the chapter checkpoint so the next book starts from its first chapter
            with open('史书典籍/police_3.txt', 'w') as f:
                f.write('0')
            print("爬取>>>" + History_book_name_list[i] + '结束')
    
    
    
    if __name__ == '__main__':
        main()
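
The script only strips spaces and '·' from chapter titles before using them as file names; a title containing a character such as '/' or '?' would break the open() call or end up in an unexpected path. Below is a minimal sanitizer sketch, assuming a simple blacklist of path separators and Windows-reserved characters; the name sanitize_filename is illustrative and not part of the original post.

    import re

    def sanitize_filename(name):
        '''Drop characters that are unsafe in file names (illustrative helper)'''
        # Remove path separators, Windows-reserved characters, whitespace and '·'
        return re.sub(r'[\\/:*?"<>|\s·]', '', name)

The file_path line in main() could then use sanitize_filename(txt_name) in place of the chained replace() calls.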
  • Original article: https://www.cnblogs.com/person1-0-1/p/11316076.html