• python爬虫 爬取诗词与史书


    import requests
    import os
    from bs4 import BeautifulSoup
    
    shici_url = 'http://www.shicimingju.com'
    url = 'http://www.shicimingju.com/book/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # Fetch the book index page.
    # NOTE(review): response.encoding is never set; if the server does not
    # declare charset=utf-8, Chinese text may be mis-decoded — confirm.
    response = requests.get(url=url,headers=headers)
    page_text = response.text
    soup = BeautifulSoup(page_text,'lxml')
    # All anchors for book titles on the index page (one <a> per book).
    a_list = soup.select('.bookmark-list>ul>li>h2>a')
    # Fetch one chapter page and return its plain-text body.
    def get_book_detail(page_url):
        detail_soup = BeautifulSoup(
            requests.get(url=page_url, headers=headers).text, 'lxml')
        # Prefer the paragraph nodes; fall back to the whole container
        # when the page has no <p> children.
        paragraphs = detail_soup.select('.chapter_content>p')
        if not paragraphs:
            paragraphs = detail_soup.select('.chapter_content')
        # Concatenate the text of every selected node, in document order.
        return ''.join(node.text for node in paragraphs)
    # Fetch a book's table-of-contents page, download every chapter and
    # append it to the already-open file object `f`.  Closes `f` when done
    # (callers rely on this — the main loop does not close the file itself).
    def get_book_list(book_url,f):
        book_list_content = requests.get(url=book_url, headers=headers).text
        soup = BeautifulSoup(book_list_content, 'lxml')
        # One <a> per chapter in the table of contents.
        book_mulu = soup.select('.book-mulu>ul>li>a')
        for book in book_mulu:
            page_title = book.text
            print(page_title + "开始下载...")
            page_url = shici_url + book['href']
            # Fetch the chapter body via the detail-page helper.
            content = get_book_detail(page_url)
            # BUG FIX: the original string literal was broken across physical
            # lines (lost "\n" escapes — a syntax error as published); restore
            # the intended blank-line separators between title and content.
            f.write(page_title + "\n\n" + content + "\n\n\n")
            print(page_title + "下载完成...")
        f.close()
    # Ensure the output directory exists before writing any book files.
    file_path = './史书/'
    if not os.path.exists(file_path):
        os.mkdir(file_path)
    # Download every book found on the index page.
    # IDIOM FIX: enumerate replaces the manual `n = n + 1` counter.
    for n, a in enumerate(a_list, start=1):
        # Book title text from the index anchor.
        book_name = a.text
        print("<<%s>>正在下载..."%book_name)
        # One numbered txt file per book, e.g. "./史书/1.<title>.txt".
        file_name = file_path + str(n) + '.' + book_name + '.txt'
        # 'a+' so re-running the script appends instead of truncating.
        f = open(file_name,'a+',encoding='utf-8')
        book_url = shici_url + a['href']
        # Walk the book's chapter list; get_book_list closes `f` itself.
        get_book_list(book_url,f)
  • 相关阅读:
    hadoop yarn
    java 删除文件夹及其里面的文件
    scrapy 429 处理
    java 获取当前时间(年月日时分秒)
    Java测试类
    eclipse环境问题-java版本不兼容
    Java内功修炼系列一工厂模式
    Java内功修炼系列一观察者模式
    Java内功修炼系列一责任链模式
    Java内功修炼系列一拦截器
  • 原文地址:https://www.cnblogs.com/hela/p/11323071.html
Copyright © 2020-2023  润新知