• yhdm动漫爬虫项目


    # -*- coding: utf-8 -*-
    """
    Created on Fri Aug 28 17:21:10 2020
    
    @author: Mto
    """
    """
    网址:http://www.yhdm.tv/
    目的
    获取视频文件
    8月28日,代码功能基本实现
    """
    import requests
    import re
    from bs4 import BeautifulSoup
    
    def getHTML(url, timeout=10):
        """Request a page and return the response object.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait for the server. Without a timeout,
                requests waits indefinitely on a stalled connection.

        Returns:
            The requests response, with its text encoding set to UTF-8.

        Raises:
            requests.exceptions.Timeout: The server did not answer in time.
        """
        header = {
            # Host is hard-coded, so this helper only suits www.yhdm.tv URLs.
            'Host': 'www.yhdm.tv',
            'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'
        }
        r = requests.get(url, headers=header, timeout=timeout)
        r.encoding = 'utf-8'
        return r
    
    
    def GetMp4HTML(url, timeout=10):
        """Request a URL without a fixed Host header and return the response.

        The HTTP status code of the response is printed before returning.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server. Without a timeout,
                requests waits indefinitely on a stalled connection.

        Returns:
            The requests response, with its text encoding set to UTF-8.

        Raises:
            requests.exceptions.Timeout: The server did not answer in time.
        """
        header = {
            'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'
        }
        r = requests.get(url, headers=header, timeout=timeout)
        r.encoding = 'utf-8'
        print(r.status_code)
        return r
    
          
    def processHTML(r):
        """Parse the text of response ``r`` with bs4's 'html.parser' backend."""
        return BeautifulSoup(r.text, 'html.parser')
    
    
    def GetLinksAndTitle(url1,title,links):
        """Collect episode names and episode page URLs from a show page.

        The show name taken from the page's <h1> is printed. Every <a> inside
        the first element whose style attribute is "display:block" is then
        recorded.

        Args:
            url1: URL of the show page to scan.
            title: List extended in place with the text of each episode link.
            links: List extended in place with 'http://www.yhdm.tv' prepended
                to each episode link's href.

        Raises:
            ValueError: No element with style "display:block" was found.
        """
        r = getHTML(url1)
        soup = processHTML(r)
        heading = soup.h1
        # .string is None when <h1> holds nested tags, which would break the
        # concatenation below; get_text() always returns a str.
        name = heading.get_text() if heading is not None else ''
        print('要下载的动漫为:'+name)
        blocks = soup.find_all(style="display:block")
        if not blocks:
            raise ValueError('no element with style "display:block" in ' + str(url1))
        for anchor in blocks[0].find_all('a'):
            title.append(anchor.text)
            links.append('http://www.yhdm.tv' + anchor['href'])
        
            
        
    def getmp4(link):
        """Extract the video file URL embedded in an episode page.

        Reads the data-vid attribute of the first <div id="playbox"> element
        and returns the portion that starts with "https" and ends with ".mp4".

        Args:
            link: URL of the episode page.

        Returns:
            The matched .mp4 URL as a string.

        Raises:
            ValueError: The page has no playbox div, or its data-vid value
                does not contain an https ... .mp4 URL at its start.
        """
        r = getHTML(link)
        soup = processHTML(r)
        players = soup.select('div#playbox')
        if not players:
            raise ValueError('no div#playbox element in ' + str(link))
        vid = players[0].attrs.get('data-vid', '')
        # Raw string: '\.' in a normal literal is an invalid escape sequence.
        mo = re.search(r'^https.*\.mp4', vid)
        if mo is None:
            raise ValueError('no https .mp4 URL in data-vid of ' + str(link))
        return mo.group()
    
    
    def download(mp4link,title):
        """Pretend to download a video: print its URL and a notice.

        Nothing is fetched or written. The actual download was switched off
        on purpose to spare the server, so this only reports what would have
        been downloaded.

        Args:
            mp4link: URL of the video file.
            title: Episode name used in the printed notice.
        """
        notice = title+'模拟访问成功,不下了,放过那个可怜的服务器吧'
        print(mp4link)
        print(notice)
        # NOTE: a real download would fetch mp4link with GetMp4HTML and write
        # the response content to "<title>.mp4" in binary mode.
        
                
    def main(url='http://www.yhdm.tv/show/4790.html', limit=3):
        """Scan a show page and process its first few episodes.

        Args:
            url: Show page whose episode list is scanned.
            limit: Maximum number of episodes to process.
        """
        title = []
        links = []
        GetLinksAndTitle(url, title, links)
        # Slicing caps the run at `limit` episodes and, unlike indexing with
        # a fixed range, does not fail when the show has fewer episodes.
        for name, link in zip(title[:limit], links[:limit]):
            mp4link = getmp4(link)
            download(mp4link, name)
    # Run the scraper only when executed as a script, so that importing this
    # module does not trigger network requests.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    敏感性分析与风险分析
    深入理解PHP之foreach
    PHP上传文件到七牛(Qiniu)
    Swoft 新手向教程
    PHP下kafka的实践
    关于BOOTSTRAP的整理和理解
    win10 ubuntu 子系统安装php
    CentOS7 安装 PHP7.2
    PHP 锁机制
    深入理解PHP之strpos
  • 原文地址:https://www.cnblogs.com/JKding233/p/15947091.html
Copyright © 2020-2023  润新知