爬取音乐资源
实现
# Crawl the "hot" chart on htqyy.com and download every listed song as MP3.
#
# URL patterns used below:
#   chart page: http://www.htqyy.com/top/musicList/hot?pageIndex=<n>&pageSize=20
#   song page:  http://www.htqyy.com/play/<sid>   (sid is listed on the chart page)
#   audio file: http://f2.htqyy.com/play8/<sid>/mp3/6
# A chart entry looks like:
#   <a href="/play/46" target="play" title="琵琶语" sid="46">琵琶语</a>
import os
import re
import time

import requests

# Browser-like User-Agent, sent with every request (chart pages and audio).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
PAGE_URL = "http://www.htqyy.com/top/musicList/hot?pageIndex={0}&pageSize=20"
SONG_URL = "http://f2.htqyy.com/play8/{0}/mp3/6"
# Raw string: "\m" is not a valid escape sequence in a normal string literal.
OUTPUT_DIR = r"E:\music"
# Number of chart pages to crawl; replace with int(input(...)) to ask the user.
PAGE_COUNT = 2
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Pause between downloads, in seconds.
DOWNLOAD_DELAY = 5

# Title and sid are captured from the same tag so they can never get out of
# step, which two independent findall() calls paired by index could.
SONG_PATTERN = re.compile(r'title="([^"]*)" sid="([^"]*)"')
# Characters that are not allowed in Windows file names.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def fetch_chart_page(page_index):
    """Return the HTML text of one chart page.

    Raises requests.HTTPError when the server answers with an error status.
    """
    response = requests.get(
        PAGE_URL.format(page_index), headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    # The page is served in GBK; decoding with anything else yields mojibake.
    response.encoding = "GBK"
    return response.text


def parse_songs(html_text):
    """Return a list of (title, sid) tuples found in a chart page."""
    return SONG_PATTERN.findall(html_text)


def safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced."""
    return UNSAFE_FILENAME_CHARS.sub("_", title)


def download_song(song_id, title):
    """Download one song and store it as <OUTPUT_DIR>/<title>.mp3.

    Raises requests.HTTPError instead of saving an error page as audio.
    """
    response = requests.get(
        SONG_URL.format(song_id), headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    target = os.path.join(OUTPUT_DIR, "{0}.mp3".format(safe_filename(title)))
    # The response body is the binary audio data, so write it in "wb" mode.
    with open(target, "wb") as f:
        f.write(response.content)


def main(page_count=PAGE_COUNT):
    """Collect the songs from *page_count* chart pages and download them all."""
    songs = []
    for page_index in range(page_count):
        songs.extend(parse_songs(fetch_chart_page(page_index)))

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for title, song_id in songs:
        print("正在下载...")
        download_song(song_id, title)
        time.sleep(DOWNLOAD_DELAY)


if __name__ == "__main__":
    main()
当资源无法访问时,可以试试下面的代码(下载请求同样带上 headers):
# Download a single song, sending a browser-like User-Agent with the request.
# Relies on `requests` having been imported by the script above.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
songurl = "http://f2.htqyy.com/play8/33/mp3/6"
songname = "清风"

response = requests.get(songurl, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of saving the error page as an .mp3.
response.raise_for_status()
# The response body is the binary audio data.
data = response.content
print("正在下载...")
# Raw string: "\P" is not a valid escape sequence in a normal string literal.
with open(r"D:\Python\{0}.mp3".format(songname), "wb") as f:
    f.write(data)
总结
当得到的网页信息是乱码:
print(requests.get(url).encoding)  # 打印 requests 对该网页推测出的编码
r = requests.get(url)
r.encoding = 'GBK'
print(r.text)  # 将编码设置为网页实际使用的 'GBK' 后,输出就不会出现乱码
字符串拼接:
+或者format()