下载链接:http://www.pearvideo.com/category_1
import requests from lxml import etree import re from urllib.request import urlretrieve ''' 1 获取视频id 2 拼接完整url 3 获取完整视频播放地址 4 下载视频 ''' def download_video(url): # url = 'http://www.pearvideo.com/category_1' response = requests.get(url) # 返回状态码200,代表请求成功 html = response.text # 把文本文件处理成可解析的对象 html = etree.HTML(html) # 获取video id video_id = html.xpath("//div[@class='vervideo-bd']/a/@href") starturl = "http://www.pearvideo.com" # 视频播放url列表 video_urls = [] for i in video_id: newurl = starturl + '/' + i video_urls.append(newurl) for url in video_urls: html = requests.get(url).text # 获取视频播放真正地址 # 定义播放地址的正则表达式模式 # srcUrl = "http://video.pearvideo.com/mp4/adshort/20180720/cont-1393622-12502013_adpkg-ad_hd.mp4" req = 'srcUrl="(.*?)"' # 编译正则表达式字符串为对象,目的是提高效率, 可以不加 req = re.compile(req) play_url = re.findall(req, html) # 注意play_url的格式是list # 获取视频标题 # 定义视频标题正则表达式模式 # <h1 class="video-tt">闯大祸!金毛撞上兰博基尼,车损40万</h1> re_tag = '<h1 class="video-tt">(.*?)</h1>' tag = re.findall(re_tag, html) # 下载视频 print("正在下载 %s"%tag[0]) urlretrieve(play_url[0], "./video/{}.mp4".format(tag[0])) # print("下载完成 %s"%tag[0]) # download_video() # 动态加载链接变化 # http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=12&mrd=0.8960730781029713&hotContIds=1394290,1394224,1394233 # http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=24&mrd=0.45490116190392094&hotContIds=1394290,1394224,1394233http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=24&mrd=0.45490116190392094&hotContIds=1394290,1394224,1394233 # http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=36&mrd=0.895263612547242&hotContIds=1394290,1394224,1394233 # &mrd 后面的东西可以不加 # 动态加载下载 def download_more(): n = 12 while True: if n > 48: return # 或者break跳出循环 url = "http://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=%d"%n download_video(url) n += 12 # sleep(10) download_more()
F12打开开发者工具 -> NetWork -> 找到动态加载html项(category_loading开头)-> Headers -> Request URL