某MP3爬虫:爬取预备数据并保存到Excel文件(sample.xlsx),
为后续下载脚本提供数据
import re

import requests
from lxml import etree
from openpyxl import Workbook

# Matches a work id of the form "RJ" followed by digits.
pattern = re.compile(r'RJ\d+')

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would hang the crawl forever.
REQUEST_TIMEOUT = 30

# XPath selecting the <li> entries (one per work) on a listing page.
ITEM_XPATH = '//*[@id="site-main"]/div/div/div[1]/div/ul/li'

# Session cookie captured from a browser, split per cookie for readability.
# The last part is a raw string because its value contains literal "\/"
# sequences, which are not valid escapes in a normal string literal.
COOKIE = (
    '_ga=GA1.2.1877631639.1626354328; _gid=GA1.2.556826390.1626510667; '
    'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
    'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
    r'__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
)

# Headers sent with every request.
hd = {
    'cookie': COOKIE,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}


def gethtml(url, title, cv, rj, href):
    """Scrape one listing page and collect each work's link, title, CV and RJ id.

    The four list arguments are extended in place, one element per work, so
    that the same index in every list describes the same work.

    Args:
        url: Address of the listing page to fetch.
        title: Output list receiving the work titles.
        cv: Output list receiving the text of the second-to-last styled
            paragraph of each entry (printed as the CV).
        rj: Output list receiving the "RJ<digits>" id found in the last
            styled paragraph of each entry.
        href: Output list receiving the work page links.
    """
    r = requests.get(url, headers=hd, timeout=REQUEST_TIMEOUT)
    print(r.status_code)
    html = etree.HTML(r.text)
    works = html.xpath(ITEM_XPATH)
    # str.format is used because concatenating a str with an int raises TypeError.
    print("当前页作品数有{}".format(len(works)))
    for i in range(1, len(works) + 1):
        card = '{}[{}]/div/div/div'.format(ITEM_XPATH, i)
        links = html.xpath(card + '/h2/a/@href')
        names = html.xpath(card + '/h2/a/text()')
        cv_rj = html.xpath(card + '//p[@style]')
        if len(cv_rj) == 3:
            # Entries with three styled paragraphs take their title from the
            # third paragraph instead; use its first text node, not the list.
            alt_names = html.xpath(card + '/p[3]/strong/text()')
            if alt_names:
                names = alt_names
        rj_match = pattern.search(cv_rj[-1].text or '') if cv_rj else None
        if not links or not names or len(cv_rj) < 2 or rj_match is None:
            # Skip malformed entries rather than aborting the whole crawl.
            print("第{}个作品信息不完整,已跳过".format(i))
            continue
        print("--------------------第{}个---------------------------".format(i))
        print(links[0])  # url
        print(names[0])  # title
        print(cv_rj[-2].text)  # CV
        print(rj_match[0])  # RJ
        # Store the record in the caller's lists.
        href.append(links[0])
        title.append(str(names[0]))
        cv.append(cv_rj[-2].text)
        rj.append(rj_match[0])


def touchfile(title, cv, rj, href):
    """Write the collected records to ``sample.xlsx`` in the current directory.

    Each row is ``[href, rj, title, cv]``; no header row is written.
    """
    wb = Workbook()
    ws = wb.active
    for link, code, name, actor in zip(href, rj, title, cv):
        ws.append([link, code, name, actor])
    wb.save('sample.xlsx')  # saved to the current directory


def main():
    """Crawl every page of the tag listing and save the results to Excel."""
    title = []
    cv = []
    rj = []
    href = []
    r = requests.get('https://xxxx.com/tag/xxxx/', headers=hd, timeout=REQUEST_TIMEOUT)
    print(r.status_code)
    html = etree.HTML(r.text)
    num = html.xpath('//*[@id="site-main"]/div/div/div[1]/div/nav/div/a[2]/text()')
    # NOTE(review): when the pager link is absent this assumes the listing has
    # a single page - confirm against the site's markup.
    total_pages = int(num[0]) if num else 1
    print("总页数:{}页".format(total_pages))
    for i in range(1, total_pages + 1):
        url = 'https://xxxx/tag/xxxx/page/{}/'.format(i)
        gethtml(url, title, cv, rj, href)
    print("开始保存excel文件")
    touchfile(title, cv, rj, href)
    print("文件保存成功-------------------完成")


if __name__ == "__main__":
    main()
单个mp3下载脚本
import requests

# Seconds to wait on the server before the request is abandoned.
REQUEST_TIMEOUT = 30

# Size of each piece read from the response body while streaming to disk.
CHUNK_SIZE = 1024 * 1024

# Session cookie captured from a browser, split per cookie for readability.
# The last part is a raw string because its value contains literal "\/"
# sequences, which are not valid escapes in a normal string literal.
COOKIE = (
    '_ga=GA1.2.1877631639.1626354328; _gid=GA1.2.556826390.1626510667; '
    'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
    'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
    r'__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
)

# Headers sent with the audio request; Referer is the page hosting the file.
hd = {
    'sec-fetch-dest': 'audio',
    'cookie': COOKIE,
    'Referer': 'https://xxxx/51938/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}

# Address of the MP3 file to download.
url = 'https://xxxx/xx.mp3'


def main():
    """Stream the MP3 at ``url`` into ``XXX.mp3`` in the current directory.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never saved under the ``.mp3`` name.
    """
    print("开始下载")
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, headers=hd, stream=True, timeout=REQUEST_TIMEOUT) as r:
        r.raise_for_status()
        # Content-Length is a numeric string and is absent on chunked
        # responses, so the size is only reported when the header is present.
        file_size_str = r.headers.get('Content-Length')
        if file_size_str is not None:
            file_size = int(file_size_str) / 1024 / 1024
            print('文件大小为:' + str(file_size) + 'M')
        with open('XXX.mp3', "wb") as mp3:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    mp3.write(chunk)
                    print("正在下载")
    print("下载结束")


if __name__ == "__main__":
    main()
更改后的代码,功能更完善:从 sample.xlsx 读取链接和RJ号,批量下载mp3
import time

import pandas as pd
import requests

# Workbook holding one work per row: column 0 is the page url, column 1 the
# RJ id. It carries no header row.
SHEET_PATH = "sample.xlsx"

# Zero-based spreadsheet row at which downloading starts (resume point).
# The earlier version started at list index 68 of a list that had lost the
# first row to header parsing, i.e. at this same spreadsheet row.
START_INDEX = 69

# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Size of each piece read from the response body while streaming to disk.
CHUNK_SIZE = 1024 * 1024

# Pause between two downloads, in seconds.
PAUSE_SECONDS = 5

# Session cookie captured from a browser, split per cookie for readability.
# The last part is a raw string because its value contains literal "\/"
# sequences, which are not valid escapes in a normal string literal.
COOKIE = (
    '_ga=GA1.2.1877631639.1626354328; _gid=GA1.2.556826390.1626510667; '
    'aiBLOCKS={%221%22:{%22c%22:1%2C%22h%22:62193%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%222%22:{%22c%22:3%2C%22h%22:62302%2C%22cpt%22:1%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%223%22:{%22c%22:1%2C%22h%22:32840%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%224%22:{%22c%22:1%2C%22h%22:52649%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%225%22:{%22c%22:1%2C%22h%22:52655%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%226%22:{%22c%22:-1627116371%2C%22h%22:43456%2C%22cpt%22:0%2C%22ct%22:1626597077%2C%22x%22:1626597634}%2C%227%22:{%22c%22:1%2C%22h%22:52661%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%228%22:{%22c%22:1%2C%22h%22:52613%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%229%22:{%22c%22:1%2C%22h%22:52646%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2210%22:{%22c%22:1%2C%22h%22:52619%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2211%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2212%22:{%22c%22:1%2C%22h%22:52616%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2213%22:{%22c%22:1%2C%22h%22:52628%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2214%22:{%22c%22:1%2C%22h%22:52634%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2215%22:{%22c%22:1%2C%22h%22:52640%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2216%22:{%22c%22:1%2C%22h%22:52625%2C%22cpt%22:1%2C%22ct%22:1626597069}%2C%2217%22:{%22c%22:2%2C%22h%22:21793%2C%22cpt%22:2%2C%22ct%22:1626597069%2C%22x%22:1626597634}%2C%2219%22:{%22c%22:1%2C%22h%22:10267%2C%22cpt%22:1%2C%22ct%22:1626597069%2C%22x%22:1626597634}}; '
    'pvc_visits[0]=1626546673b51942a1626546785b51317a1626546803b37431a1626551293b50818a1626551308b49536a1626551325b49528a1626551373b46295a1626551396b45281a1626551410b41686a1626551655b40332a1626551671b39967a1626551704b39303a1626551720b38318a1626551735b38310a1626551762b38178a1626551779b36528a1626551801b36178; '
    r'__cf_bm=59c696b6a3b1617334bdb0a004f9eb5f58ea942b-1626517081-1800-AbUJDW1X\/BeCmR0+SQCFmzBW9EGU98T4cWhuG3bGsB3HeDggVQDWZq4ljeoE8EjdpFKxCDdDnvJuBrdh4oo0bYrJg7\/zcaIZUcc0gDqY3D3k6u7tLaaooNYTqBXwPLox3g=='
)

# Header template; Referer is filled in per download with the work's page url.
hd = {
    'sec-fetch-dest': 'audio',
    'cookie': COOKIE,
    'Referer': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}


def load_targets(path=SHEET_PATH):
    """Read the page urls and RJ ids from the workbook.

    ``header=None`` is required because the sheet has no header row; with the
    default header handling pandas would consume the first record as column
    names and silently drop it.

    Returns:
        A ``(urls, rj_ids)`` pair of equally long lists.
    """
    sheet = pd.read_excel(path, usecols=[0, 1], header=None)
    return sheet.iloc[:, 0].tolist(), sheet.iloc[:, 1].tolist()


def download_one(page_url, code):
    """Stream the MP3 of one work into ``<code>.mp3`` in the current directory.

    Args:
        page_url: Page of the work, sent as the Referer header.
        code: RJ id of the work; names both the remote and the local file.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx answer, so that an error page is never saved as an MP3.
    """
    print("{}开始下载".format(code))
    manurl = 'https://xxxxxxx/f/{}.mp3'.format(code)
    # Build per-request headers instead of mutating the shared template.
    headers = dict(hd, Referer=page_url)
    print("对应url:{}".format(page_url))
    # The context manager releases the streamed connection even on failure.
    with requests.get(manurl, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as r:
        print("状态码:{}".format(r.status_code))
        r.raise_for_status()
        # Content-Length is a numeric string and is absent on chunked
        # responses, so the size is only reported when the header is present.
        file_size_str = r.headers.get('Content-Length')
        if file_size_str is not None:
            file_size = int(file_size_str) / 1024 / 1024
            print('文件大小为:' + str(file_size) + 'M')
        with open('{}.mp3'.format(code), "wb") as mp3:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    mp3.write(chunk)
                    print("{}正在下载".format(code))
    print("{}下载结束".format(code))


def main():
    """Download every work listed in the workbook from START_INDEX onwards."""
    url, rj = load_targets()
    for page_url, code in zip(url[START_INDEX:], rj[START_INDEX:]):
        try:
            download_one(page_url, code)
        except requests.RequestException as err:
            # One failed item must not abort the rest of the batch.
            print("{}下载失败:{}".format(code, err))
        time.sleep(PAUSE_SECONDS)


if __name__ == "__main__":
    main()