• 爬取喜马拉雅免费有声小说


    import requests
    import re
    from bs4 import BeautifulSoup
    import json
    import math

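    '''
    Scrape a Ximalaya audiobook album: fetch the paged track lists, extract each
    track's name and audio URL, and write the audio files to disk.

    '''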


    def json_sanalyzes(legal):
        """Yield one ``{'name': ..., 'src': ...}`` record per track.

        Args:
            legal: Iterable of pages, where each page is an iterable of track
                mappings that carry ``'trackName'`` and ``'src'`` keys.

        Yields:
            dict: A new dict for every track, holding the value of
            ``'trackName'`` under ``'name'`` and the value of ``'src'`` under
            ``'src'``.

        Raises:
            KeyError: If a track mapping lacks ``'trackName'`` or ``'src'``.
        """
        for page in legal:
            for track in page:
                # A fresh dict per track: re-yielding one shared, mutated dict
                # would make every collected record alias the last track.
                yield {'name': track['trackName'], 'src': track['src']}



    # return contents
    #
    #


    def _fetch_ok(url, headers, retries=3, delay=1.0, timeout=10):
        """Return the first HTTP 200 response for *url*, retrying a bounded number of times.

        Raises:
            RuntimeError: If no attempt returned status 200.
        """
        import time  # local import so this block does not depend on file-level imports

        last_status = None
        for attempt in range(retries):
            if attempt:
                time.sleep(delay)  # brief pause before re-issuing the request
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
            last_status = response.status_code
        raise RuntimeError(
            "GET %s failed after %d attempts (last status: %s)"
            % (url, retries, last_status)
        )


    def dump_load(url,id):
        """Yield the track list of each page of an album.

        The album page at *url* is fetched to read the total number of tracks
        from its ``<h2 class="rC5T">`` heading; the paged JSON endpoint is then
        requested once per page of 30 tracks.

        Args:
            url: Address of the album's HTML page.
            id: Album id substituted into the paged JSON endpoint URL.

        Yields:
            The ``['data']['tracksAudioPlay']`` value of each page's JSON reply.

        Raises:
            RuntimeError: If a request never returns HTTP 200.
            ValueError: If the track count cannot be found in the album page.
        """
        page_size = 30
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
        }
        # The original spun forever on a non-200 status without re-requesting;
        # _fetch_ok retries a bounded number of times and then raises.
        album_html = _fetch_ok(url, header).text
        soup = BeautifulSoup(album_html, "html.parser")
        heading = soup.find("h2", class_="rC5T")
        # Total number of tracks in the album. The parentheses and the digit
        # class must be escaped; NOTE(review): pattern reconstructed from a
        # listing that lost its backslashes — confirm against the live markup.
        found = re.search(
            r'<h2 class="rC5T">专辑里的声音\(<!-- -->(\d+)<!-- -->\)',
            str(heading),
        )
        if found is None:
            raise ValueError("could not locate the track count in %s" % url)
        page_count = math.ceil(int(found.group(1)) / page_size)
        for page_num in range(1, page_count + 1):  # one request per page
            page_url = (
                "https://www.ximalaya.com/revision/play/album"
                "?albumId=%s&pageNum=%d&sort=-1&pageSize=%d"
                % (id, page_num, page_size)
            )
            payload = json.loads(_fetch_ok(page_url, header).content.decode())
            yield payload['data']['tracksAudioPlay']

    if __name__=="__main__":
        # Script entry point: download every track of one album as an .mp3 file.
        import os
        import time

        album_id = 12642314  # album (novel) id
        url = "https://www.ximalaya.com/youshengshu/%d/" % album_id  # album main page
        # Lazily yields {'name', 'src'} records parsed from each track-list page.
        response = json_sanalyzes(dump_load(url, album_id))
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
        }
        p = "D:/untitled2/venv/theand/小说/"  # output directory
        # Create the directory once, including missing parents (os.mkdir would
        # fail if an intermediate directory did not exist).
        os.makedirs(p, exist_ok=True)
        # Characters removed from track names: spaces and double quotes as in
        # the original, plus characters that are illegal in Windows file names.
        # NOTE(review): the original's duplicated replace('"') suggests it
        # targeted the typographic quotes “ and ” — confirm.
        strip_chars = str.maketrans('', '', ' "“”\\/:*?<>|')
        max_attempts = 3
        for i in response:  # download each track in turn
            # Bounded retry; the original looped forever on a non-200 status
            # because it never re-issued the request.
            for attempt in range(max_attempts):
                if attempt:
                    time.sleep(1)
                dump_cont = requests.get(i['src'], headers=header, timeout=30)
                if dump_cont.status_code == 200:
                    break
            else:
                print("skipped %s: HTTP %s" % (i['name'], dump_cont.status_code))
                continue
            b_name = i['name'].translate(strip_chars)
            path_p = p + b_name + ".mp3"  # full path of this track's file
            with open(path_p, "wb") as f:
                # Reuse the body already fetched instead of downloading it a
                # second time (and without the User-Agent header).
                f.write(dump_cont.content)

  • 相关阅读:
    收藏随笔
    Jquery根据元素ID判断该元素是否存在
    DIV+CSS布局中IE与FF浏览器之间重要的兼容性差异
    css3 boxsizing属性
    常见CSS属性及值
    Pycharm学习记录注释
    python之reload用法
    python之sorted用法
    android studio目录结构浅析
    纪念开通博客
  • 原文地址:https://www.cnblogs.com/wxc1/p/10237354.html
Copyright © 2020-2023  润新知