• Problematic Python code


    A few days ago a friend in a group chat said he wanted to scrape a movie/TV site, pull down all of its resources and host his own copy for fun, but he couldn't get at the key data, so he posted the URL. I had a look at the site, analyzed it, and wrote a multithreaded downloader in Python. Because the video files are large I haven't tested downloading very many of them, so please try it yourselves and report any problems you run into.
    The third-party modules used are listed below; install them if you don't already have them (e.g. `pip install requests beautifulsoup4 lxml`):
    1. requests
    2. BeautifulSoup (the bs4 package)
    3. lxml

    Step 1: collect the links to all of the videos on the site (a quick standalone sketch of this step follows the list of steps).

    Step 2: open each selected title's page and extract the video (episode) links.

    Step 3: build the parameters needed for the video download request.

    Step 4: download the video and save it locally.
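
    As a standalone preview of step 1, the snippet below fetches the index page and pulls out the series links with the same XPath expressions the full script uses (the site's markup may of course have changed since this was written, and the User-Agent string here is just a minimal placeholder):

    import requests
    from lxml import etree

    # Step 1 on its own: fetch the index page and extract every series link and title.
    headers = {"User-Agent": "Mozilla/5.0"}
    html = requests.get("https://91mjw.com", headers=headers).text
    tree = etree.HTML(html)
    urls = tree.xpath("//div[@class='m-movies clearfix']/article/a/@href")
    names = tree.xpath("//div[@class='m-movies clearfix']/article/h2/a/text()")
    print(len(urls), "series found")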

    Straight to the implementation.
    It uses multiple threads plus a semaphore: by default 5 threads are started, and each thread downloads one whole series (a whole series, mind you, not a single episode).
    You can also change how many threads run at the same time.
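
    The throttling itself is not site-specific: the main loop acquires a BoundedSemaphore before starting each thread, and every thread releases it in a finally block, so at most nMaxThread downloads run at once. Here is a minimal sketch of just that pattern, with a placeholder task instead of the real download:

    from threading import Thread, BoundedSemaphore

    nMaxThread = 5
    connectlock = BoundedSemaphore(nMaxThread)

    class Worker(Thread):
        def __init__(self, task):
            Thread.__init__(self)
            self.task = task

        def run(self):
            try:
                print("working on", self.task)   # placeholder for the real download
            finally:
                connectlock.release()            # free a slot for the next thread

    for task in range(20):
        connectlock.acquire()   # blocks while nMaxThread workers are already running
        Worker(task).start()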
    Implementation code:

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import os
    import re
    import requests
    from threading import Thread, BoundedSemaphore
    from bs4 import BeautifulSoup
    from lxml import etree
    from contextlib import closing
    
    nMaxThread = 5
    connectlock = BoundedSemaphore(nMaxThread)
    gHeads = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    
    
    class MovieThread(Thread):
        def __init__(self, url, movieName):
            Thread.__init__(self)
            self.url = url
            self.movieName = movieName
    
        def run(self):
            try:
                urlList = self.GetMovieUrl(self.url) or []  # GetMovieUrl may return None
                for i in range(len(urlList)):
                    type, vkey = self.GetVkeyParam(self.url, urlList[i])
                    if type is not None and vkey is not None:
                        payload, DownloadUrl = self.GetOtherParam(self.url, urlList[i], type, vkey)
                        if DownloadUrl:
                            videoUrl = self.GetDownloadUrl(payload, DownloadUrl)
                            if videoUrl:
                                self.DownloadVideo(videoUrl, self.movieName, i + 1)
            finally:
                connectlock.release()
    
        def GetMovieUrl(self, url):
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Host": "91mjw.com",
                "Referer": "https://91mjw.com/"
            }
            html = requests.get(url, headers=heads).text
            xmlContent = etree.HTML(html)
            UrlList = xmlContent.xpath("//div[@id='video_list_li']/a/@href")
            if len(UrlList) > 0:
                return UrlList
            else:
                return None
    
        def GetVkeyParam(self, firstUrl, secUrl):
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Host": "91mjw.com",
                "Referer": firstUrl
            }
            try:
                html = requests.get(firstUrl + secUrl, headers=heads).text
                bs = BeautifulSoup(html, "html.parser")
                content = bs.find("body").find("script")
                reContent = re.findall('"(.*?)"', content.text)
                return reContent[0], reContent[1]
            except:
                return None, None
    
        def GetOtherParam(self, firstUrl, SecUrl, type, vKey):
            url = "https://api.1suplayer.me/player/?userID=&type=%s&vkey=%s" % (type, vKey)
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Host": "api.1suplayer.me",
                "Referer": firstUrl + SecUrl
            }
            try:
                html = requests.get(url, headers=heads).text
                bs = BeautifulSoup(html, "html.parser")
                content = bs.find("body").find("script").text
                recontent = re.findall(" = '(.+?)'", content)
                payload = {
                    "type": recontent[3],
                    "vkey": recontent[4],
                    "ckey": recontent[2],
                    "userID": "",
                    "userIP": recontent[0],
                    "refres": 1,
                    "my_url": recontent[1]
                }
                return payload, url
            except:
                return None, None
    
        def GetDownloadUrl(self, payload, refereUrl):
            heads = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Host": "api.1suplayer.me",
                "Referer": refereUrl,
                "Origin": "https://api.1suplayer.me",
                "X-Requested-With": "XMLHttpRequest"
            }
            while True:
                retData = requests.post("https://api.1suplayer.me/player/api.php", data=payload, headers=heads).json()
                if retData["code"] == 200:
                    return retData["url"]
                elif retData["code"] == 404:
                    payload["refres"] += 1
                    continue
                else:
                    return None
    
        def DownloadVideo(self, url, videoName, videoNum):
            CurrentSize = 0
            heads = {
                "chrome-proxy": "frfr",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Host": "sh-yun-ftn.weiyun.com",
                "Range": "bytes=0-"
            }
            # stream=True so the video body is read in chunks instead of all at once
            with closing(requests.get(url, headers=heads, stream=True)) as response:
                retSize = int(response.headers['Content-Length'])
                chunkSize = 10240
                if response.status_code == 206:
                    print('[File Size]: %0.2f MB' % (retSize / 1024.0 / 1024.0))
                    # make sure ./video/<series name>/ exists before writing into it
                    videoDir = "./video/%s" % videoName
                    if not os.path.isdir(videoDir):
                        os.makedirs(videoDir)
                    with open("%s/%02d.mp4" % (videoDir, videoNum), "wb") as f:
                        for data in response.iter_content(chunk_size=chunkSize):
                            f.write(data)
                            CurrentSize += len(data)
                            f.flush()
                            print('[Progress]: %0.2f%%' % (CurrentSize * 100.0 / retSize), end='\r')
    
    
    def main():
        html = requests.get("https://91mjw.com", headers=gHeads).text
        xmlcontent = etree.HTML(html)
        UrlList = xmlcontent.xpath("//div[@class='m-movies clearfix']/article/a/@href")
        NameList = xmlcontent.xpath("//div[@class='m-movies clearfix']/article/h2/a/text()")
        for i in range(len(UrlList)):
            connectlock.acquire()
            url = UrlList[i]
            name = NameList[i]  # keep as str; the file path is built with %s later
            t = MovieThread(url, name)
            t.start()
    
    
    if __name__ == '__main__':
        main()
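
    The number of simultaneous downloads is controlled by nMaxThread at the top of the script. If you want to test a single series before letting it loose on the whole index page, you could replace the call to main() with something like the sketch below (the URL and folder name here are placeholders, not real links from the site):

    if __name__ == '__main__':
        # hypothetical single-series test; the URL and name below are placeholders
        connectlock.acquire()
        t = MovieThread("https://91mjw.com/xxxx.html", "test-series")
        t.start()
        t.join()   # wait for this one download thread to finish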