• python爬虫脚本下载YouTube视频


    python爬虫脚本下载YouTube视频

    工作环境:

    • python 2.7.13

    • pip

    • lxml, 安装 pip install lxml,主要用xpath查找节点,可以使用re模块代替

    • pytube, 安装 pip install pytube

    • 科学上网工具

    参考:

    源码:

    1. # coding: utf-8 
    2. __author__ = "zwzhou" 
    3. __date__ = "2017-03-19" 
    4.  
    5. import urllib2 
    6. from pytube import YouTube 
    7. from pprint import pprint 
    8. from lxml import etree 
    9. import sys,getopt 
    10.  
    def getHtml(url):
        """Fetch *url* and return the raw response body.

        The request carries a desktop-browser User-Agent header.

        :param url: address of the page to download.
        :return: the body exactly as returned by ``response.read()``.
        :raises urllib2.URLError: propagated from ``urllib2.urlopen`` when the
            request fails.
        """
        user_agent = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.13 '
                      '(KHTML, like Gecko) Chrome/24.0.1284.0 Safari/537.13')
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # urllib2 responses are not context managers on Python 2, so the
            # connection is released explicitly even if read() raises.
            response.close()
    18.  
    def getUrl(html):
        """Download the short videos linked from one page, then follow on.

        Every ``div.thumb-wrapper`` link found in *html* is paired with the
        duration text found next to it.  A video is downloaded into
        ``savepath`` when its file name is not yet in ``videoLists`` and
        ``checktime`` accepts its duration.  The process exits through
        ``sys.exit()`` as soon as ``maxNumber`` videos have been downloaded.
        Afterwards the first link of the page is fetched and processed the
        same way.

        Reads the module globals ``savepath``, ``maxNumber`` and
        ``videoLists`` and updates ``cur_count``.

        :param html: markup of the page to scan.
        """
        # Only cur_count is rebound here; the other globals are merely read
        # (videoLists is mutated in place), so they need no declaration.
        global cur_count

        tree = etree.HTML(html)
        urllist = tree.xpath(u'//div[@class="thumb-wrapper"]/a/@href')
        urllist_time = tree.xpath(
            u'//div[@class="thumb-wrapper"]/a/span/span/text()')

        baseurl = r'https://www.youtube.com'
        for (item_name, item_length) in zip(urllist, urllist_time):
            try:
                yt = YouTube(baseurl + item_name)
            except Exception:
                print("Some thing wrong about the authority")
                # Without a YouTube object there is nothing to inspect or
                # download; move on instead of touching an unbound `yt`.
                continue

            print("video name:" + yt.filename)
            print("video time:" + item_length)

            if yt.filename in videoLists:  # the file already exists
                print("This video has been downloaded!")
                continue

            if not checktime(item_length):
                print('This video is too long and it will not be downloaded, just be ignored!')
                continue

            video = yt.filter('mp4')[-1]
            print("Now is loading %s------------>" % yt.filename)
            video.download(savepath)
            print("--------------->%sVideo is loaded!" % yt.filename)
            cur_count += 1
            videoLists.append(yt.filename)
            if cur_count >= maxNumber:  # requested number reached
                print('There are %d videos downloaded!This task is completed!' % maxNumber)
                # TODO: if necessary, the videoLists can be logged
                sys.exit()

        if urllist:
            # Next page: the link has to be fetched first, because this
            # function parses markup, not URLs.  One recursion level is used
            # per page followed.
            getUrl(getHtml(baseurl + urllist[0]))
    59.  
    60.  
    def checktime(timelength, threshold=None):
        """Tell whether a video duration is below the allowed length.

        :param timelength: duration text made of colon-separated integer
            fields, e.g. ``"3:59"`` or ``"1:02:03"``; every field counts 60
            times the one to its right, the last field being seconds.
        :param threshold: limit in seconds; when omitted, the module global
            ``timeThreshold`` is used.
        :return: ``True`` when the duration is strictly shorter than the
            limit, ``False`` otherwise, including when *timelength* cannot be
            parsed as a duration.
        """
        if threshold is None:
            threshold = timeThreshold

        try:
            fields = [int(field) for field in timelength.strip().split(':')]
        except ValueError:
            # Text that is not a duration cannot be shown to be short enough.
            return False

        # Fold every field in, so that hours are not mistaken for minutes.
        seconds = 0
        for field in fields:
            seconds = seconds * 60 + field
        return seconds < threshold
    69.  
    def usage():
        """Print the command-line help text to standard output.

        The text is framed by one empty line before and one after.
        """
        help_lines = [
            '',
            # The script is invoked as dl_youtube.py, so the help names it so.
            'usage: python dl_youtube.py [option] [arg]',
            'options and args:',
            '-s : download path',
            '-t : time threshold of the video to be loaded, in seconds',
            '-u : start url which to be crawled, it can be set more than one time',
            '-n : when downloading is stop, i.e. how many videos will be downloaded, default is 10000.',
            '-h : print this help message',
            '',
        ]
        print('\n'.join(help_lines))
    80.  
    if __name__ == "__main__":
        # Crawled only when no -u option is supplied.
        default_start_urls = ['https://www.youtube.com/watch?v=TThzH_sJo6o']
        start_urls = []
        videoLists = []  # names of downloaded files, to avoid duplicates
        # Defaults, overridable from the command line.
        savepath = r"D://MyDownloads"
        maxNumber = 10000
        timeThreshold = 240
        cur_count = 0

        try:
            opts, args = getopt.getopt(sys.argv[1:], 'hs:t:n:u:')
            for op, value in opts:
                if op == "-s":  # download directory
                    savepath = value
                elif op == '-t':  # maximum video length, in seconds
                    timeThreshold = int(value)
                elif op == "-h":  # help
                    usage()
                    sys.exit()
                elif op == '-n':  # number of videos to download
                    maxNumber = int(value)
                elif op == '-u':  # start URL; may be given several times
                    start_urls.append(value)
        except (getopt.GetoptError, ValueError) as err:
            # Unknown option, missing argument, or a non-integer -t / -n:
            # report it and show the help instead of a traceback.
            print(str(err))
            usage()
            sys.exit(2)

        if not start_urls:
            start_urls = default_start_urls

        for item in start_urls:
            html = getHtml(item)
            getUrl(html)
    107.  

    使用

    1. python dl_youtube.py -n 10 -s D://MyDownloads -t 600 -u https://www.youtube.com/watch?v=TThzH_sJo6o 

    将从页面 https://www.youtube.com/watch?v=TThzH_sJo6o 开始搜索,下载10段时长小于10分钟(-t 的单位为秒,600秒即10分钟)的video,保存到D://MyDownloads文件夹中。

  • 相关阅读:
    数码摄影入门之十 数码相片后期处理
    Easy CHM 2.10
    LeapFTP 3.0.0.43 汉化版(附带LeapFTP 3.0注册码)
    使用“淘宝助理”的常见错误
    “互联网浏览器”控件与webBrowser控件的区别
    易语言源代码毁来者来了!!
    易语言正则表达式的多行匹配替换
    Explorer.exe鲜为人知的参数
    原始套接字概述
    网络技术数据封装
  • 原文地址:https://www.cnblogs.com/YiXiaoZhou/p/6581031.html
Copyright © 2020-2023  润新知