• yhdm动漫爬虫项目


    # -*- coding: utf-8 -*-
    """
    Created on Fri Aug 28 17:21:10 2020
    
    @author: Mto
    """
    """
    网址:http://www.yhdm.tv/
    目的
    获取视频文件
    8月28日,代码功能基本实现
    """
    import requests
    import re
    from bs4 import BeautifulSoup
    
    def getHTML(url, timeout=10):
        """Request a page from www.yhdm.tv and return the response.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, requests waits indefinitely on a stalled
                connection.

        Returns:
            The ``requests`` response object, with its encoding set to UTF-8
            so that ``r.text`` is decoded as UTF-8.

        Raises:
            requests.exceptions.Timeout: If the server does not respond
                within ``timeout`` seconds.
        """
        headers = {
            'Host': 'www.yhdm.tv',
            'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        }
        r = requests.get(url, headers=headers, timeout=timeout)
        r.encoding = 'utf-8'
        return r
    
    
    def GetMp4HTML(url, timeout=10):
        """Request a video URL and return the response.

        Unlike ``getHTML`` this sends no ``Host`` header, and it prints the
        HTTP status code of the response.

        Args:
            url: Address of the resource to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, requests waits indefinitely on a stalled
                connection.

        Returns:
            The ``requests`` response object, with its encoding set to UTF-8.

        Raises:
            requests.exceptions.Timeout: If the server does not respond
                within ``timeout`` seconds.
        """
        headers = {
            'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        }
        r = requests.get(url, headers=headers, timeout=timeout)
        r.encoding = 'utf-8'
        print(r.status_code)
        return r
    
          
    def processHTML(r):
        """Parse the text of response ``r`` with bs4 and return the parsed tree.

        The document is parsed with the 'html.parser' parser.
        """
        markup = r.text
        return BeautifulSoup(markup, 'html.parser')
    
    
    def GetLinksAndTitle(url1,title,links):
        """Collect episode names and episode page links from a show page.

        Fetches ``url1``, prints the show name taken from the page's <h1>,
        then reads every anchor inside the first element whose style is
        "display:block".

        Args:
            url1: URL of the show page.
            title: List extended in place with the text of each anchor.
            links: List extended in place with 'http://www.yhdm.tv' followed
                by each anchor's href.

        Returns:
            None. Results are delivered through ``title`` and ``links``; both
            are left untouched when the page has no matching element.
        """
        soup = processHTML(getHTML(url1))

        # .string is None when <h1> contains nested tags, which would make the
        # concatenation below fail; get_text() always yields a str.
        heading = soup.h1
        show_name = heading.get_text() if heading is not None else ''
        print('要下载的动漫为:' + show_name)

        containers = soup.find_all(style="display:block")
        if not containers:
            # No episode list on this page: nothing to append.
            return

        # href=True skips anchors that carry no href attribute at all.
        for anchor in containers[0].find_all('a', href=True):
            title.append(anchor.text)
            links.append('http://www.yhdm.tv' + anchor['href'])
        
            
        
    def getmp4(link):
        """Extract the video file link from an episode page.

        Fetches ``link``, takes the first ``div#playbox`` element and searches
        its ``data-vid`` attribute for text that starts with "https" and runs
        up to the last ".mp4".

        Args:
            link: URL of the episode page.

        Returns:
            The matched video URL as a string.

        Raises:
            ValueError: If the page has no ``div#playbox`` element, or its
                ``data-vid`` attribute holds no matching https .mp4 link.
        """
        soup = processHTML(getHTML(link))
        players = soup.select('div#playbox')
        if not players:
            raise ValueError('no div#playbox element found on %r' % (link,))

        data_vid = players[0].attrs.get('data-vid', '')
        # Raw string: '\.' in a normal string literal is an invalid escape.
        found = re.search(r'^https.*\.mp4', data_vid)
        if found is None:
            raise ValueError('no https .mp4 link in data-vid on %r' % (link,))
        return found.group()
    
    
    def download(mp4link,title):
        """Simulate downloading a video: print the link and a notice.

        Nothing is fetched or written. The real download (requesting the
        link via GetMp4HTML and writing the content to '<title>.mp4') is
        intentionally disabled to spare the server.

        Args:
            mp4link: URL of the video file; printed as-is.
            title: Episode name, prefixed to the printed notice.
        """
        print(mp4link)
        notice = title + '模拟访问成功,不下了,放过那个可怜的服务器吧'
        print(notice)
        
                
    def main(url='http://www.yhdm.tv/show/4790.html', max_episodes=3):
        """Process the first few episodes of a show page.

        Collects the episode names and links from ``url``, then for each of
        the first ``max_episodes`` episodes resolves the video link and hands
        it to ``download``.

        Args:
            url: URL of the show page to crawl.
            max_episodes: Upper bound on how many episodes are processed.
                Shows with fewer episodes are processed in full.
        """
        title = []
        links = []
        GetLinksAndTitle(url, title, links)
        # Slicing tolerates lists shorter than max_episodes, unlike indexing
        # a fixed range.
        for name, link in zip(title[:max_episodes], links[:max_episodes]):
            mp4link = getmp4(link)
            download(mp4link, name)


    # Run the crawl only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    文件内容排名算法,输入排名函数,返回排名后的文件名
    线段树做大数据排序
    给字符排序-基类排序二分查找-JavaScript
    后缀数组、名次数组-JavaScript
    二分查找法、二分去重排序法,返回最接近的位置和实际位置
    用四叉树对图像分类,获取tag和key
    Linux显示所在Git分支
    Linux中设置Git显示颜色
    屏蔽网页广告
    tf.add_to_collection,tf.get_collection简介
  • 原文地址:https://www.cnblogs.com/JKding233/p/15947091.html
Copyright © 2020-2023  润新知