• python下载网页视频


    不同网站的页面结构不同,使用前需要根据目标网站修改下面的代码(网址、CSS 选择器、编码等)。

    获取 mp4 链接(保存为 JSON 文件)

    from bs4 import BeautifulSoup
    import requests
    import urllib
    import re
    import json
    encodestyle = 'gbk'  # text encoding applied to the listing-page response in GetwVideoHtml()
    homepage='http://www.**.html'  # first listing page handed to GetNPage_html(); '**' looks like a redacted placeholder - fill in before running
    htmlhead='http://www.**'  # prefix prepended to every href collected by GetwVideoHtml()
    
    
    #GetNPage_html(homepage,n)
    #HtmlList2Mp4List(sumhtml)
    #Writelist2json(listname,lists)
    
    def GetwVideoHtml(furl):
        """Collect the video-page links found on one listing page.

        Fetches ``furl``, decodes the body with the module-level ``encodestyle``
        and returns ``htmlhead + href`` for every ``<a>`` located inside an
        element of class ``video_box``.

        Args:
            furl: URL of the listing page to scan.

        Returns:
            list of str: the prefixed links in document order (empty when
            nothing matches).

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within the timeout.
        """
        # A timeout keeps a single stalled server from hanging the whole crawl.
        res = requests.get(furl, timeout=30)
        res.encoding = encodestyle
        soup = BeautifulSoup(res.text, 'html.parser')
        retlist = []
        for box in soup.select('.video_box'):       # <div class="video_box">
            for tag_a in box.select('a'):           # <a href="..." target="_blank">
                href = tag_a.get('href')
                if href is None:
                    # Anchors without an href used to abort the scan with KeyError.
                    continue
                retlist.append(htmlhead + href)     # hrefs are site-relative, so prefix them
        return retlist
    def GetNPage_html(homepage,n):
        """Gather video-page links from the first ``n`` listing pages.

        Page 1 is ``homepage`` itself; page ``k`` (k >= 2) is ``homepage`` with
        everything after its last '.' replaced by ``_k.html``. Each page URL is
        printed before it is scanned with ``GetwVideoHtml``.

        Args:
            homepage: URL of the first listing page.
            n: number of listing pages to visit; values below 1 visit none.

        Returns:
            list: the links of all visited pages, concatenated in page order.
        """
        collected = []
        for page_no in range(1, n + 1):
            page_url = (
                homepage
                if page_no == 1
                else homepage.rsplit('.', 1)[0] + '_' + str(page_no) + '.html'
            )
            print(page_url)
            collected.extend(GetwVideoHtml(page_url))
        return collected
    
    def GetMp4SrcFromHtml(url):
        """Return the distinct ``.mp4`` URLs that appear in the page at ``url``.

        The page is requested with a desktop-browser User-Agent, decoded as GBK
        and scanned for http(s) URLs ending in ``.mp4`` (case-insensitive).

        Args:
            url: address of the page to scan.

        Returns:
            list of str: each URL once, in order of first appearance.

        Raises:
            urllib.error.URLError: if the page cannot be fetched.
        """
        # ``import urllib`` alone does not guarantee the ``request`` submodule
        # is loaded, so import it explicitly where it is used.
        import urllib.request

        headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        # Kept from the original: installs the opener process-wide, so later
        # urllib.request.urlopen()/urlretrieve() calls send this User-Agent too.
        urllib.request.install_opener(opener)
        # Close the response deterministically instead of leaking the socket.
        with urllib.request.urlopen(url) as resp:
            # A stray non-GBK byte should not abort the scan of the whole page.
            page = resp.read().decode('gbk', errors='replace')
        # The old pattern 'https?://.*.mp4' was greedy and left the dot
        # unescaped, so several links on one line collapsed into one bogus
        # match. Stop at whitespace, quotes and angle brackets instead.
        pattern = re.compile(r'https?://[^\s"\'<>]+?\.mp4', re.I)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(pattern.findall(page)))
    def HtmlList2Mp4List(sumhtml):
        """Flatten the mp4 links of several pages into one list.

        Args:
            sumhtml: iterable of page URLs, each scanned with
                ``GetMp4SrcFromHtml``.

        Returns:
            list: every link returned for every page, in page order
            (duplicates across pages are kept).
        """
        return [mp4 for page_url in sumhtml for mp4 in GetMp4SrcFromHtml(page_url)]
    def Writelist2json(listname,lists, folder='D:/ipynb/commfile/'):
        """Dump ``lists`` as JSON to ``<folder>/<listname>_len_<count>.json``.

        Args:
            listname: base name of the output file.
            lists: JSON-serialisable sequence; its length becomes part of the
                file name.
            folder: output directory. Defaults to the location that used to be
                hard-coded; it is created when missing.

        Returns:
            None
        """
        # Local import: the collecting script does not import os at file level.
        import os

        # Previously a missing directory made open() fail with FileNotFoundError.
        os.makedirs(folder, exist_ok=True)
        filename = listname + '_len_' + str(len(lists)) + '.json'
        with open(os.path.join(folder, filename), 'w', encoding='utf-8') as fw:
            json.dump(lists, fw)
            
    # Driver: scan the first 3 listing pages for video-page links, extract the
    # mp4 URLs from each of those pages, then save the result as
    # "mp4list_len_<count>.json".
    sumhtml = GetNPage_html(homepage,3)
    mp4list = HtmlList2Mp4List(sumhtml)
    Writelist2json("mp4list",mp4list)
    

    下载部分

    from bs4 import BeautifulSoup
    import requests
    import urllib
    import json
    import threading
    import datetime
    import os
    
    def mkdir(path):
        """Create ``path`` unless something already exists there.

        Prints a short status message either way.

        Args:
            path: directory to create.
        """
        if os.path.exists(path):
            print ("---  There is this folder!  ---")
            return
        # makedirs also creates any missing intermediate directories.
        os.makedirs(path)
        print ("---  new folder...  ---")
        print ("---  OK  ---")
    
    def Schedule(a,b,c):
        '''
        Progress callback: prints how much of a download has completed.

        The (a, b, c) signature matches the ``reporthook`` argument of
        ``urllib.request.urlretrieve``; nothing in the visible code passes this
        function as a hook yet.

        a: number of data blocks transferred so far
        b: size of one data block, in bytes
        c: total size of the remote file in bytes; urlretrieve reports -1 when
           the server does not announce a size

        Prints the percentage with two decimals (capped at 100), or the number
        of bytes received when the total size is unknown or zero.
        '''
        if c <= 0:
            # A percentage is undefined here: c == 0 used to raise
            # ZeroDivisionError and c == -1 produced a negative percentage.
            print ('%d bytes' % (a * b))
            return
        # The last block usually overshoots the file size, hence the cap.
        per = min(100.0 * a * b / c, 100)
        print ('%.2f%%' % per)
    def createdownloadlink(name,url):
        """Download ``url`` and store it at the local path ``name``.

        Args:
            name: destination file path.
            url: address of the resource to fetch.
        """
        urllib.request.urlretrieve(url, filename=name)
    
    class myThread(threading.Thread):
        """Thread that downloads a single URL into a single local file."""

        def __init__(self, name, url):
            """Remember the destination path ``name`` and the source ``url``."""
            super().__init__()
            self.url = url
            # ``name`` is also threading.Thread's own name attribute, so the
            # destination path doubles as the thread name.
            self.name = name

        def run(self):
            """Fetch ``self.url`` into ``self.name``, printing start/finish messages."""
            target, source = self.name, self.url
            print("开始下载:" + target)
            urllib.request.urlretrieve(source, target)
            print("完成下载:" + target)
    def DownMp4file(lists):
        """Prepare one download thread per URL (none of them is started here).

        Creates the folder ``D:/videos/<MM-DD>`` for today's date via ``mkdir``
        and assigns each URL the target file ``<folder>/<index>.mp4``, where
        ``index`` counts from 0 in input order.

        Args:
            lists: iterable of mp4 URLs.

        Returns:
            list: ``myThread`` objects, in the same order as ``lists``.
        """
        today = datetime.datetime.now().strftime('%m-%d')
        foldername = 'D:/videos/' + today
        mkdir(foldername)
        # The returned list serves as a simple pool; the caller starts/joins it.
        return [
            myThread(foldername + '/' + str(index) + '.mp4', url)
            for index, url in enumerate(lists)
        ]
    # Execution part: load the saved link list and download with threads.
    
    
    # NOTE(review): the collecting script calls Writelist2json("mp4list", ...),
    # which produces "mp4list_len_<n>.json"; this path names a different file -
    # confirm it exists or adjust the name.
    with open('D:/ipynb/commfile/srcmp4s_len_66.json', 'r') as fr:
        srcmp4s = json.load(fr)
    
    print(len(srcmp4s))
    # Bare expression: shows nothing when run as a script and raises IndexError
    # on an empty list - presumably a notebook leftover used to preview the
    # first link.
    srcmp4s[0]
    
    # One (not yet started) thread object per link.
    threads= DownMp4file(srcmp4s)
    
    # NOTE(review): the [:10] slices mean only the first 10 threads are ever
    # started and joined; the remaining links are not downloaded by this run.
    for t in threads[:10]:# start the selected threads
        t.start(); 
    for t in threads[:10]:
        t.join();# wait for the started threads to finish before continuing
    
  • 相关阅读:
    海量数据处理
    mysql数据导出
    手机归属地
    如何正确合理的建立MYSQL数据库索引
    Java 复杂excel报表导出
    NullpointerException真的一定要被预防?
    代码传递信息方式的探究
    ThreadLoacl的反思
    Codis分布式锁
    spring mvc:事务引起的try/catch失效
  • 原文地址:https://www.cnblogs.com/ims-/p/9736006.html
Copyright © 2020-2023  润新知