This needs to be adapted for each target site.
Part 1: collect the mp4 links
from bs4 import BeautifulSoup
import requests
import urllib.request
import re
import json
encodestyle = 'gbk'                 # page encoding of the target site
homepage = 'http://www.**.html'     # first listing page (URL elided)
htmlhead = 'http://www.**'          # site root, used by GetwVideoHtml() to build absolute links

# Pipeline: GetNPage_html(homepage, n) -> HtmlList2Mp4List(sumhtml) -> Writelist2json(listname, lists)
def GetwVideoHtml(furl):
    """Fetch one listing page and return absolute links to its video detail pages."""
    retlist = []
    res = requests.get(furl)
    res.encoding = encodestyle
    soup = BeautifulSoup(res.text, 'html.parser')
    for Tag_contentpage in soup.select('.video_box'):   # <div class="video_box">, site-specific selector
        for tag_a in Tag_contentpage.select('a'):        # <a href="..." target="_blank">
            httphtml = tag_a['href']
            retlist.append(htmlhead + httphtml)           # prepend the site root to the relative href
    return retlist
def GetNPage_html(homepage, n):
    """Collect video-page links from listing pages 1..n."""
    rethtml = []
    for num in range(1, n + 1):
        if num == 1:
            homewebpage = homepage
        else:
            # page k > 1 follows the pattern <listing>_k.html
            homewebpage = homepage.rsplit('.', 1)[0] + '_' + str(num) + '.html'
        print(homewebpage)
        htmllinks = GetwVideoHtml(homewebpage)
        rethtml = rethtml + htmllinks
    return rethtml
def GetMp4SrcFromHtml(url):
    """Download one video page and return the de-duplicated .mp4 URLs found in its source."""
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url).read()
    file = file.decode('gbk')
    pattern = re.compile(r'(https?://[^\s"\']*?\.mp4)', re.I)   # escape the dot, match non-greedily
    videolinks = pattern.findall(file)
    videolinks = list(set(videolinks))                           # de-duplicate
    return videolinks
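# Sanity check of the mp4 regex on a made-up snippet (the URL below is hypothetical,
# only to show what the pattern extracts).
_sample = '<video src="http://cdn.example.com/v/123.mp4?x=1"></video>'
print(re.findall(r'(https?://[^\s"\']*?\.mp4)', _sample, re.I))   # ['http://cdn.example.com/v/123.mp4']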
def HtmlList2Mp4List(sumhtml):
    """Flatten the mp4 links found on every video page into one list."""
    retmp4s = []
    for html in sumhtml:
        mp4s = GetMp4SrcFromHtml(html)
        for mp4 in mp4s:
            retmp4s.append(mp4)
    return retmp4s
def Writelist2json(listname, lists):
    """Dump the list to D:/ipynb/commfile/<listname>_len_<N>.json."""
    length = str(len(lists))
    with open('D:/ipynb/commfile/' + listname + '_len_' + length + '.json', 'w') as fw:
        json.dump(lists, fw)
# Crawl the first 3 listing pages, pull the mp4 links, and save them to JSON.
sumhtml = GetNPage_html(homepage, 3)
mp4list = HtmlList2Mp4List(sumhtml)
Writelist2json("mp4list", mp4list)
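Optional sanity check before downloading: a minimal sketch that probes a few of the extracted links with HEAD requests (the timeout and the number of links checked are assumptions, not part of the original script).
for link in mp4list[:5]:                   # spot-check the first few links
    try:
        r = requests.head(link, allow_redirects=True, timeout=10)
        print(r.status_code, link)
    except requests.RequestException as e:
        print('failed:', link, e)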
Part 2: download the files
import urllib.request
import json
import threading
import datetime
import os
def mkdir(path):
    """Create the folder if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)          # makedirs also creates any missing parent directories
        print("--- new folder... ---")
        print("--- OK ---")
    else:
        print("--- folder already exists ---")
def Schedule(a, b, c):
    """
    Progress callback for urlretrieve.
    a: number of blocks downloaded so far
    b: size of one block in bytes
    c: total size of the remote file in bytes
    """
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
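# Note: Schedule() is not actually hooked up below; to get progress output it would be
# passed as the third (reporthook) argument of urlretrieve, e.g.:
#   urllib.request.urlretrieve(url, filename, Schedule)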
def createdownloadlink(name, url):
    # plain, non-threaded download (kept as a reference; not used below)
    urllib.request.urlretrieve(url, name)
class myThread(threading.Thread):
    """Download one file per thread."""
    def __init__(self, name, url):
        threading.Thread.__init__(self)   # initialize the base Thread
        self.name = name                   # target file path
        self.url = url                     # source mp4 URL
    def run(self):
        print("start downloading: " + self.name)
        urllib.request.urlretrieve(self.url, self.name)
        # createdownloadlink(self.name, self.url)   # equivalent non-threaded helper
        print("finished downloading: " + self.name)
def DownMp4file(lists):
    """Build (but do not start) one download thread per mp4 URL; files go to D:/videos/<mm-dd>/."""
    dateASfolder = datetime.datetime.now().strftime('%m-%d')
    foldername = 'D:/videos/' + dateASfolder
    mkdir(foldername)
    threadlist = []                        # holds the thread objects, acting as a simple pool
    filenum = 0
    for url in lists:
        filename = foldername + '/' + str(filenum) + '.mp4'
        filenum = filenum + 1
        thread = myThread(filename, url)   # create the thread object
        threadlist.append(thread)          # add it to the pool
    return threadlist
# Execution: load the saved link list and start the first 10 downloads.
with open('D:/ipynb/commfile/srcmp4s_len_66.json', 'r') as fr:
    srcmp4s = json.load(fr)
print(len(srcmp4s))
print(srcmp4s[0])
threads = DownMp4file(srcmp4s)
for t in threads[:10]:    # start the first 10 threads in the pool
    t.start()
for t in threads[:10]:
    t.join()              # wait for these threads to finish before moving on
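As written, only the first 10 threads are ever started. A minimal sketch of a batched alternative (it would replace the two loops above; the batch size of 10 is an assumption about the intended behavior):
batchsize = 10
for i in range(0, len(threads), batchsize):
    batch = threads[i:i + batchsize]
    for t in batch:
        t.start()
    for t in batch:
        t.join()          # finish this batch before starting the next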