A Python script to download videos from Youku

A useful script for downloading videos from Youku.
I found a related script in a Google Code project (http://code.google.com/u/ldmiao), but it can only fetch a single video address. Youku, however, splits each video into about 10 sub-videos, so that script cannot retrieve the whole video. After studying the JSON that Youku returns, I improved the script as follows:

#youku.py
#download video from youku
import feedparser,urllib,urllib2
import re
class Youku(object):
    """Resolve a Youku video page into the download URLs of its segments.

    Written for Python 2: page fetching relies on the ``urllib2`` module
    imported at the top of the file.

    Attributes:
        regex: dict of compiled patterns. ``'url'`` extracts the video id from
            a page address; ``'ticket'`` is a 3-tuple matching, in order, the
            numeric ``VideoIDS`` value, the second ``sendVideoLink`` argument
            and the ``key1``/``key2`` pair.
        valid: match object for the address given to the constructor, or
            ``None`` when the address is not recognised.
        id: ``'yk:'`` followed by the video id, or ``None`` when ``valid`` is
            ``None``.
        link: page address rebuilt from the video id (without the ``.html``
            suffix), or ``None`` when ``valid`` is ``None``.
    """

    # Compiled once for the class; raw strings keep the regex escapes intact.
    _URL_RE = re.compile(
        r'youku\.com/(?:v_show/id_)?([-+_\w]+)=*\.html', re.IGNORECASE)
    _TICKET_RES = (
        re.compile(
            r'''addVariable\s*\(\s*['"]VideoIDS['"]\s*,\s*(\d+)\s*\)''',
            re.IGNORECASE),
        re.compile(
            r'''sendVideoLink\s*\(\s*['"][^'"]+['"]\s*,\s*['"]([^'"]+)['"]''',
            re.IGNORECASE),
        re.compile(
            r'''['"\s]+?key1['"\s]+?:\s*['"]([^'"]+)['"]\s*,\s*'''
            r'''['"\s]+?key2['"\s]+?:\s*['"]([^'"]+)['"]''',
            re.IGNORECASE),
    )
    # Each occurrence of "no" in the playlist response is counted as a segment.
    _SEGMENT_RE = re.compile(r'"no"')
    # Splits the long id into: 8 digits, a 2-digit segment number, the rest.
    _LONGID_RE = re.compile(r'(\d{8})(\d{2})(.*)')

    def __init__(self, uri):
        """Parse *uri* and remember the video id when it is a Youku page.

        Args:
            uri: address of a Youku video page.
        """
        self.regex = {'url': self._URL_RE, 'ticket': self._TICKET_RES}
        self.valid = self.regex['url'].search(uri)
        # Always define these so later calls never hit a missing attribute.
        self.id = None
        self.link = None
        if self.valid:
            video_id = self.valid.group(1)
            self.id = 'yk:' + video_id
            self.link = 'http://v.youku.com/v_show/id_' + video_id

    def download(self):
        """Fetch every segment returned by ``getRealURL`` with ``wget``.

        Raises:
            OSError: if the ``wget`` executable cannot be started.
        """
        # Local import: the file's import header does not provide it.
        import subprocess
        for segment_url in self.getRealURL():
            # Argument list instead of a shell string, so characters in the
            # URL (which comes from the remote response) are never
            # interpreted by a shell.
            subprocess.call(['wget', segment_url])

    def getRealURL(self):
        """Return the list of segment URLs for the video.

        Returns:
            A list of URL strings, one per segment. The list is empty when
            the address was not recognised, or when the ids or keys could not
            be found in the fetched pages.
        """
        if not self.valid:
            return []
        shortid, longid = self._find_ids()
        if not (shortid and longid):
            return []
        key1, key2, num = self._find_keys(shortid)
        if not (key1 and key2):
            return []
        print("num %d" % num)
        return self._build_urls(longid, key1, key2, num)

    def _find_ids(self):
        """Scan the video page and return ``(shortid, longid)``."""
        shortid = ''
        longid = ''
        short_re, long_re = self.regex['ticket'][0], self.regex['ticket'][1]
        page = urllib2.urlopen(self.link)
        try:
            for line in page:
                short_match = short_re.search(line)
                long_match = long_re.search(line)
                # Later matches overwrite earlier ones.
                if short_match:
                    shortid = short_match.group(1)
                if long_match:
                    longid = long_match.group(1)
        finally:
            page.close()
        return shortid, longid

    def _find_keys(self, shortid):
        """Scan the playlist response; return ``(key1, key2, segment_count)``."""
        key1 = ''
        key2 = ''
        num = 0
        playlist = urllib2.urlopen(
            'http://v.youku.com/player/getPlayList/VideoIDS/' + shortid)
        try:
            for line in playlist:
                line = line.strip()
                key_match = self.regex['ticket'][2].search(line)
                if key_match:
                    key1 = key_match.group(1)
                    key2 = key_match.group(2)
                num += len(self._SEGMENT_RE.findall(line))
        finally:
            playlist.close()
        return key1, key2, num

    @classmethod
    def _build_urls(cls, longid, key1, key2, num):
        """Build the *num* segment URLs from the long id and the two keys.

        Returns an empty list when *longid* does not contain the expected
        ten leading digits.
        """
        parts = cls._LONGID_RE.search(longid)
        if parts is None:
            return []
        prefix = parts.group(1)
        suffix = parts.group(3)
        # '%x' never appends the 'L' that hex() adds to Python 2 longs.
        token = key2 + '%x' % (int(key1, 16) ^ 0xA55AA5A5)
        # Format the index on its own so a '%' inside the id is left alone.
        return [
            'http://f.youku.com/player/getFlvPath/%s%02d%s?k=%s'
            % (prefix, index, suffix, token)
            for index in range(num)
        ]
if __name__ == '__main__':
    # Demo run against a sample page: show the resolved segment URLs first,
    # then download them (download() resolves the URLs again on its own).
    downloader = Youku('http://v.youku.com/v_show/id_XMTA0Nzk2OTY4.html')
    segment_urls = downloader.getRealURL()
    print(segment_urls)
    downloader.download()

相关阅读:
设计模式_2_简单工厂、工厂方法、抽象工厂比较
 SQL模拟padding函数
 MySqlHelper c#访问MySql的工具类
 常见数据库设计（1）——字典数据
 常见数据库设计(2)——历史数据问题之单记录变更
 设计模式_1_单例模式
 代码调用存储过程超时，SQL Server Management Studio里运行很快（改进）
转:Rowid和Rownum区别
 Oracle数据库中system和sys的区别
 转:Python之全局变量
原文地址：https://www.cnblogs.com/xueliangliu/p/2962181.html