• 某MP3爬虫


    某MP3爬虫:预先爬取作品数据并保存,
    为后续下载脚本提供数据
    import requests
    import re
    from lxml import etree
    from openpyxl import Workbook
    
    
    """
    获取每个作品的链接、标题、CV、RJ号
    """
    
    import os

    # Matches a work identifier: the letters "RJ" followed by one or more digits.
    pattern = re.compile(r'RJ\d+')

    # Request headers sent with every page fetch.
    # The browser session cookie is no longer pasted into the source: the
    # pasted value was split across two physical lines (a syntax error) and it
    # is a credential that should not live in the file. Copy the "cookie"
    # request header from the browser's developer tools and export it as
    # MP3_SITE_COOKIE before running the script.
    hd = {
        'cookie': os.environ.get('MP3_SITE_COOKIE', ''),
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    
    def gethtml(url,title,cv,rj,href):
        """Fetch one listing page and collect the data of every work on it.

        For each ``<li>`` entry of the listing, the link, title, CV line and RJ
        code are printed and appended to the caller's lists. The four lists are
        mutated in place and always grow in lockstep; an entry that lacks a
        link, a title or an RJ code is reported and skipped.

        Args:
            url: Address of the listing page to download.
            title: List receiving each work's title.
            cv: List receiving the text of the second-to-last styled paragraph.
            rj: List receiving the ``RJ<digits>`` code found in the last styled
                paragraph.
            href: List receiving the ``href`` of each work's heading link.
        """
        # r.text reads the whole body anyway, so streaming is not needed; the
        # timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(url, headers=hd, timeout=30)
        print(r.status_code)
        html = etree.HTML(r.text)
        items = html.xpath('//*[@id="site-main"]/div/div/div[1]/div/ul/li')
        print("当前页作品数有{}".format(len(items)))
        for i, item in enumerate(items, start=1):
            # Query relative to the current <li> instead of re-running the
            # absolute path with a positional index for every field.
            links = item.xpath('./div/div/div/h2/a/@href')
            names = item.xpath('./div/div/div/h2/a/text()')
            cv_rj = item.xpath('./div/div/div//p[@style]')
            if len(cv_rj) == 3:
                # With three styled paragraphs the title is taken from the
                # <strong> of the third paragraph; keep the heading text when
                # that node is absent.
                strong_names = item.xpath('./div/div/div/p[3]/strong/text()')
                if strong_names:
                    names = strong_names
            rj_match = None
            if len(cv_rj) >= 2:
                rj_match = pattern.search(cv_rj[-1].text or '')
            if not links or not names or rj_match is None:
                print("第{}个作品信息不完整,已跳过".format(i))
                continue
            print("--------------------第{}个---------------------------".format(i))
            print(links[0])  # url
            print(names[0])  # title
            print(cv_rj[-2].text)  # CV
            print(rj_match[0])  # RJ
            # Store the record in the caller's lists
            href.append(links[0])
            title.append(str(names[0]))
            cv.append(cv_rj[-2].text)
            rj.append(rj_match[0])
    
    def touchfile(title,cv,rj,href):
        """Write the collected records to ``sample.xlsx`` in the current directory.

        One row is written per entry of ``href``; the columns are, in order,
        link, RJ code, title and CV.

        Args:
            title: Titles, indexed like ``href``.
            cv: CV lines, indexed like ``href``.
            rj: RJ codes, indexed like ``href``.
            href: Links; its length determines the number of rows.
        """
        workbook = Workbook()
        sheet = workbook.active
        for idx, link in enumerate(href):
            row = [link, rj[idx], title[idx], cv[idx]]
            sheet.append(row)
        # Saved relative to the current working directory
        workbook.save('sample.xlsx')
    def main():
        """Scrape every page of the tag listing and save the results to Excel.

        Reads the page count from the pagination links of the first page, calls
        ``gethtml`` for each page and finally writes all rows with
        ``touchfile``.

        Raises:
            requests.HTTPError: If the first listing page is not served
                successfully.
        """
        title = []
        cv = []
        rj = []
        href = []
        # r.text reads the whole body, so streaming is not needed; the timeout
        # prevents an unresponsive server from blocking the script forever.
        r = requests.get('https://xxxx.com/tag/xxxx/', headers=hd, timeout=30)
        print(r.status_code)
        # Surface a blocked or failed request as an HTTP error instead of an
        # IndexError on the missing pagination node below.
        r.raise_for_status()
        html = etree.HTML(r.text)
        num = html.xpath('//*[@id="site-main"]/div/div/div[1]/div/nav/div/a[2]/text()')
        # NOTE(review): a listing without this pagination link is assumed to
        # consist of a single page - confirm against the site.
        total_pages = int(num[0]) if num else 1
        print("总页数:{}页".format(total_pages))

        for i in range(1, total_pages+1):
            url = 'https://xxxx/tag/xxxx/page/{}/'.format(i)
            gethtml(url,title,cv,rj,href)
        print("开始保存excel文件")
        touchfile(title,cv,rj,href)
        print("文件保存成功-------------------完成")

    # Run the crawl only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
    单个mp3下载脚本
    import requests
     
    import os

    # Request headers for the audio download.
    # The browser session cookie is no longer pasted into the source: the
    # pasted value was split across two physical lines (a syntax error) and it
    # is a credential that should not live in the file. Copy the "cookie"
    # request header from the browser's developer tools and export it as
    # MP3_SITE_COOKIE before running the script.
    hd = {
        'sec-fetch-dest': 'audio',
        'cookie': os.environ.get('MP3_SITE_COOKIE', ''),
        'Referer':'https://xxxx/51938/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
     
    # Download one mp3 to XXX.mp3 in 1 MiB chunks.
    print("开始下载")
    url = 'https://xxxx/xx.mp3'
    # The context manager releases the streamed connection even if the
    # download fails part-way; the timeout guards against a stalled server.
    with requests.get(url, headers=hd, stream=True, timeout=30) as r:
        # Stop here on an HTTP error so an error page is never saved as audio.
        r.raise_for_status()
        # Content-Length is optional (e.g. chunked responses), so do not index
        # the header mapping directly.
        file_size_str = r.headers.get('Content-Length')
        if file_size_str is not None:
            file_size = int(file_size_str)/1024/1024  # bytes -> MiB
            print('文件大小为:'+str(file_size)+'M')
        with open('XXX.mp3', "wb") as mp3:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    mp3.write(chunk)
                    print("正在下载")

    print("下载结束")
    更改后的代码,功能更完善:从 sample.xlsx 读取数据并批量下载
    import pandas as pd
    import requests
    import time
    # Load the link (column 0) and the RJ code (column 1) of every record.
    # header=None is required: sample.xlsx has no header row, and the default
    # header=0 would consume the first record as column names and drop it.
    # Both columns are read in a single pass instead of parsing the file twice.
    # List index i corresponds to spreadsheet row i + 1.
    sheet_df = pd.read_excel("sample.xlsx", usecols=[0, 1], header=None)
    url = sheet_df.iloc[:, 0].tolist()
    rj = sheet_df.iloc[:, 1].tolist()
    
    import os

    # Request headers for the audio downloads; 'Referer' starts empty and is
    # filled in per download.
    # The browser session cookie is no longer pasted into the source: the
    # pasted value was split across two physical lines (a syntax error) and it
    # is a credential that should not live in the file. Copy the "cookie"
    # request header from the browser's developer tools and export it as
    # MP3_SITE_COOKIE before running the script.
    hd = {
        'sec-fetch-dest': 'audio',
        'cookie': os.environ.get('MP3_SITE_COOKIE', ''),
        'Referer':'',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    
    # Index of the first record to download.
    # NOTE(review): 68 looks like a resume offset left over from an interrupted
    # run - set it to 0 to download everything.
    START_INDEX = 68

    # Download the mp3 of every record from START_INDEX on, one file per RJ code.
    for i in range(START_INDEX, len(url)):
        print("{}开始下载".format(rj[i]))
        manurl = 'https://xxxxxxx/f/{}.mp3'.format(rj[i])
        # Each request carries the work's own page URL as its Referer.
        hd['Referer']=url[i]
        print(hd['Referer'])
        print("对应url:{}".format(url[i]))
        # The context manager releases the streamed connection after every
        # file; the timeout keeps one stalled download from blocking the batch.
        with requests.get(manurl, headers=hd, stream=True, timeout=30) as r:
            print("状态码:{}".format(r.status_code))
            if r.status_code != 200:
                # Do not write an error page into an .mp3 file; move on.
                print("{}下载失败,已跳过".format(rj[i]))
            else:
                # Content-Length is optional (e.g. chunked responses).
                file_size_str = r.headers.get('Content-Length')
                if file_size_str is not None:
                    file_size = int(file_size_str)/1024/1024  # bytes -> MiB
                    print('文件大小为:'+str(file_size)+'M')
                with open('{}.mp3'.format(rj[i]), "wb") as mp3:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            mp3.write(chunk)
                            print("{}正在下载".format(rj[i]))

                print("{}下载结束".format(rj[i]))
        # Pause between requests
        time.sleep(5)
     
  • 相关阅读:
    初谈面向对象
    java概述~至数组
    django一对多数据库模型
    Django url()函数详解
    python编码规范
    django的用户管理
    ubuntu下安装搜狗拼音
    乱七八糟的2013
    使用django进行微信公众平台开发
    我们要写的项目
  • 原文地址:https://www.cnblogs.com/JKding233/p/15947116.html
Copyright © 2020-2023  润新知