MusiCode 批量下载指定歌手的所有专辑（已解除验证码限制）

一直想把喜欢的歌手的专辑全都归类并下载下来，由于那专辑数量实在太多了，再加上最近开始学习python，就想着何不用python写个脚本把下载过程自动化呢？所以就花了点时间写了这么个东西，分享给有需要的人。:)
写这个东西时，一开始并没有想到抓取过于频繁、时间过长会出现验证码。由于验证码的问题试了几种方式都无法很好地解决，于是加上了“生成下载清单”这一步。加这一步的时候，一开始是把最终下载地址存储起来，结果发现下载地址居然会过期，没办法，最后只好将下载页面地址存储下来，在使用下载命令的时候再去下载页面获取最终下载地址。
这段脚本使用了两个开源的模块，gevent和BeautifulSoup。

updated-------------------------------------------------------------------------------------------
已解除验证码限制，若出现验证码，则会从验证码页面中提取出所需cookie并重新发起请求。

   #coding=utf-8
    
    import urllib,urllib2,re,os,json,gevent,traceback
    from BeautifulSoup import BeautifulSoup
    from gevent import monkey
    
    # Apply gevent's monkey patches to the standard library before any
    # network call is made by the greenlets spawned further down.
    monkey.patch_all()

    rootUrl='http://music.baidu.com'
    # Baidu Music id of the artist whose albums should be downloaded and
    # sorted, e.g. the trailing number of http://music.baidu.com/artist/2825
    artistId=2825
    # Offset step between consecutive album-list requests (see crawlPage).
    pagesize=10
    # Folder that receives the downloads; change it to wherever you want the
    # music stored. Backslashes must be doubled inside a normal string literal
    # and the value must keep its trailing separator, because paths are built
    # by plain concatenation.
    savePath='G:\\crawl\\david bowie\\'
    # Sub-folder of savePath that holds the generated download lists.
    listDir='_____downlist\\'
    # Running count of songs handled by downMusic.
    handleCount=0
    # Value of the BAIDUVERIFY cookie; request() fills it in once a
    # verification page has been seen.
    BAIDUVERIFY=''
    
    def crawlList():
        """Generate the download lists for every album page of the artist.

        Fetches the artist home page, counts the numeric text nodes inside the
        second "page-inner" div to learn how many album pages exist, makes sure
        the list folder exists, then spawns one crawlPage greenlet per page and
        waits for all of them.

        If the pager cannot be parsed, the traceback and the fetched HTML are
        printed and the function returns without crawling anything.
        """
        artistUrl=rootUrl+'/artist/'+str(artistId)
        homeHtml=request(artistUrl)
        soup=BeautifulSoup(homeHtml)
        try:
            pager=soup.findAll("div",{"class":"page-inner"})[1]
            # Every numeric text node in the pager stands for one page.
            pagecount=len(pager.findAll(text=re.compile(r'\d+')))
        except Exception:
            traceback.print_exc()
            print(homeHtml)
            return
        listPath=savePath+listDir
        # makedirs also creates savePath itself when it does not exist yet.
        if not os.path.isdir(listPath):
            os.makedirs(listPath)
        jobs=[gevent.spawn(crawlPage,i) for i in range(pagecount)]
        gevent.joinall(jobs)
            
    def request(url,maxRetries=5):
        """Fetch *url* and return the response body as a string.

        When the response is a verification (captcha) page, the cookie value
        is extracted from it with getBaiduVerify, stored in the global
        BAIDUVERIFY and the request is sent again with that cookie.

        url        -- address to fetch.
        maxRetries -- how many times the request may be re-sent after a
                      verification page before giving up.

        Raises IOError when the verification page is still returned after
        maxRetries retries; errors from urllib2.urlopen propagate unchanged.
        """
        global BAIDUVERIFY
        retries=0
        while True:
            req=urllib2.Request(url)
            if BAIDUVERIFY!='':
                req.add_header('Cookie','BAIDUVERIFY='+BAIDUVERIFY+';')
            resp=urllib2.urlopen(req)
            try:
                html=resp.read()
            finally:
                # Release the connection even if reading fails.
                resp.close()
            verify=getBaiduVerify(html)
            if verify=='':
                return html
            if retries>=maxRetries:
                # Looping (or recursing) forever would only hammer the server.
                raise IOError('verification page still returned after %d retries: %s' % (maxRetries,url))
            retries+=1
            print(u'成功提取验证码并重新发起请求')
            BAIDUVERIFY=verify
        
    def getBaiduVerify(html):
        """Build the verification cookie value from a verification page.

        Looks up the values of the "vcode", "id" and "di" form fields in
        *html* (case-insensitively) and returns them joined as
        'vcode:id:di'. Returns '' when any of the three fields is missing.
        """
        fieldPatterns=(
            r'name="vcode" value="(.*?)"',
            r'name="id" value="(.*?)"',
            r'name="di" value="(.*?)"',
        )
        values=[]
        for pattern in fieldPatterns:
            found=re.search(pattern,html,re.I)
            if not found:
                return ''
            values.append(found.group(1))
        return ':'.join(values)
    
    def crawlPage(page):
        """Write the download lists for every album on one album-list page.

        page -- zero-based page index; the request offset is page*pagesize.

        Requests the album list as JSON, parses the HTML fragment stored under
        ["data"]["html"], and calls crawlAlbum for every <a class="cover">
        link. The list files go into a per-page sub-folder of the list
        directory. A failure in one album is printed and does not stop the
        remaining albums.
        """
        start=page*pagesize
        albumListUrl=(rootUrl+'/data/user/getalbums?start=%d&ting_uid=%d&order=time') % (start,artistId)
        print(albumListUrl)
        albumListHtml=json.loads(request(albumListUrl))["data"]["html"]
        albumListSoup=BeautifulSoup(albumListHtml)
        covers=albumListSoup.findAll('a',{'class':'cover'})
        pagePath=savePath+listDir+str(page)+'\\'
        if not os.path.isdir(pagePath):
            os.makedirs(pagePath)
        for cover in covers:
            try:
                crawlAlbum(pagePath,rootUrl+cover['href'],cover['title'])
            except Exception:
                traceback.print_exc()
    
    def crawlAlbum(pagePath,albumUrl,title):
        """Write the download list file for a single album.

        pagePath -- folder (with trailing separator) that receives the list file.
        albumUrl -- address of the album page.
        title    -- album title; used for the list file name and for the
                    folder the songs will later be saved to.

        Each line of the list file holds three tab-separated fields: the song
        page URL, the song title and the target folder. A song whose link
        cannot be read is reported and skipped.
        """
        print('%s %s' % (albumUrl,title))
        albumHtml=request(albumUrl)
        albumSoup=BeautifulSoup(albumHtml)
        musicWraps=albumSoup.findAll('span',{'class':'song-title '})
        # Drop the characters that are not allowed in Windows file names:
        # \ / : * ? " < > |
        title=re.sub(r'[\\/:*?"<>|]','',title)
        path=savePath+title+'\\'
        albumListPath=pagePath+title+'.txt'
        # "with" closes the list file even when something unexpected is raised.
        with open(albumListPath,'w') as albumFile:
            for wrap in musicWraps:
                link=wrap.find('a')
                try:
                    musicPage=rootUrl+link['href']
                    # The real download address expires, so the song page is
                    # stored here and resolved at download time.
                    albumFile.write('%s\t%s\t%s\n' % (musicPage,link['title'],path))
                except Exception:
                    traceback.print_exc()
    
    def crawlDownloadUrl(musicPage):
        """Return the real download address for a song.

        musicPage -- address of the song page; '/download' is appended to get
                     the page that contains the download link.

        Raises ValueError when no download link is found on that page.
        """
        downloadPageUrl=musicPage+'/download'
        downHtml=request(downloadPageUrl)
        found=re.search('http://[^ ]*xcode.[a-z0-9]*' , downHtml, re.M)
        if found is None:
            # Fail with a message that names the page instead of an
            # AttributeError on None.
            raise ValueError('no download URL found on '+downloadPageUrl)
        return found.group()
    
    def downList():
        """Download everything recorded in the generated download lists.

        Spawns one downPage greenlet for each entry of the list directory and
        blocks until all of them have finished.
        """
        listRoot=savePath+listDir
        workers=[gevent.spawn(downPage,listRoot+entry) for entry in os.listdir(listRoot)]
        gevent.joinall(workers)
    
    def downPage(pagePath):
        """Download every song listed in the album list files of one page folder.

        pagePath -- folder containing the album list files. Each line of a
                    list file holds three tab-separated fields: song page URL,
                    song title and target folder.

        A failure while handling one song is printed and the remaining songs
        are still processed.
        """
        for filename in os.listdir(pagePath):
            filePath=os.path.join(pagePath,filename)
            with open(filePath,'r') as albumFile:
                lines=albumFile.readlines()
            for line in lines:
                # Handle errors per song so that one bad entry or one failed
                # download does not skip the rest of the album.
                try:
                    arrArgs=line.rstrip('\n').split('\t')
                    downMusic(arrArgs[0],arrArgs[1],arrArgs[2])
                except Exception:
                    traceback.print_exc()
    
    
   def downMusic(musicPage,title,path):
        """Download one song into *path* unless it is already there.

        musicPage -- address of the song page.
        title     -- song title; characters not allowed in Windows file names
                     are removed before it is used as the file name.
        path      -- target folder, including the trailing separator.

        Increments the global handleCount and prints it with the arguments as
        a progress line. If the download fails, the traceback is printed and
        any partially written file is removed so that a later run retries it.
        """
        global handleCount
        if not os.path.isdir(path):
            os.makedirs(path)
        handleCount+=1
        print('%s %s %s %s' % (handleCount,musicPage,title,path))
        # Drop the characters that are not allowed in Windows file names:
        # \ / : * ? " < > |
        filename=path+re.sub(r'[\\/:*?"<>|]','',title)+'.mp3'
        if os.path.isfile(filename):
            return
        downUrl=crawlDownloadUrl(musicPage)
        try:
            urllib.urlretrieve(downUrl,filename)
        except Exception:
            traceback.print_exc()
            # The file may not exist if the download failed before anything
            # was written; removing it blindly would raise a second error.
            if os.path.isfile(filename):
                os.remove(filename)
    
    if __name__=='__main__':
        # Tiny command loop: "list" builds the download lists, "down" downloads
        # what the lists contain, "exit" leaves the program.
        print(u'命令：\n\tlist\t生成下载清单\n\tdown\t开始下载\n\texit\t退出')
        cmd=raw_input('>>>')
        while cmd!='exit':
            if cmd=='list':
                crawlList()
                print(u'已生成下载清单')
            elif cmd=='down':
                downList()
                print(u'下载完成')
            else:
                print('unknow cmd')
            # Ask for the next command after every handled one.
            cmd=raw_input('>>>')

相关阅读:
webLogic的安装与配置总结
 hibernate 中save()、update()、saveOrUpdate()的区别？
struts2+spring+hibernate+oracle整合，实现增删改查操作。（一）
配置struts时web.xml中<url-pattern>*.action</url-pattern>
java中，返回1000-10000中能被3整除，且个位数是6的个数
 kubernetes部署Fluentd+Elasticsearch+kibana 日志收集系统
 用Docker搭建WordPress
51建设Android版一些技术整理
 微信内置浏览器隐藏功能左上角功能选项
 vs2013修改默认的开发环境
原文地址：https://www.cnblogs.com/xuxiaoshuan/p/3628928.html