• 爬虫之爬取梨视频


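    # 1. Fetch the home page and parse out the link to each video's detail page.

    # 2. Iterate over the detail links and request each one.

    # 3. Parse the wanted data from the detail page (video URL, title, summary, date, favorite count).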

    import requests
    
    import re
    
    import os
    
    from threading import Thread
    
    from concurrent.futures import  ThreadPoolExecutor
    
    
    
    
    
    base_url = "https://www.pearvideo.com/"
    
    
    
    def get_index():
        """Fetch the site home page and return its HTML.

        The request is sent to ``base_url`` with a desktop-browser
        ``user-agent`` and a Baidu search-result ``referer`` header.

        Returns:
            str: The decoded response body (``res.text``).

        Raises:
            requests.exceptions.RequestException: On network failure,
                including a timeout.
        """
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            # The original value had spaces injected around every punctuation
            # mark, which is not a valid URL; they are removed here.
            "referer": "https://www.baidu.com/link?url=fUq54ztdrrLaIUXa-p6B9tuWXC3byFJCyBKuvuJ_qsPw8QLrWIfekFKGgmhqITyF&wd=&eqid=c5366da10000199a000000025c45768a",
        }
        # Without a timeout, requests waits indefinitely on a stalled connection.
        res = requests.get(base_url, headers=headers, timeout=10)
        return res.text
    
    
    
    def parser_index(text):
        """Extract the detail-page links from the home page HTML.

        Every ``<a href="...">`` whose class attribute is exactly
        ``vervideo-lilink actplay`` contributes one entry; the captured
        href is appended to ``base_url``.

        Args:
            text: HTML source of the home page.

        Returns:
            list: URLs in document order (duplicates are kept).
        """
        link_pattern = '<a href="(.*?)" class="vervideo-lilink actplay">'
        return [base_url + match.group(1) for match in re.finditer(link_pattern, text)]
    
    
    
    def get_details(url):
        """Request a video detail page and return its HTML.

        The HTTP status code of the response is printed as progress output.

        Args:
            url: Address of the detail page.

        Returns:
            str: The decoded response body (``res.text``).

        Raises:
            requests.exceptions.RequestException: On network failure,
                including a timeout.
        """
        # Without a timeout, one stalled page would block the whole crawl.
        res = requests.get(url, timeout=10)
        print(res.status_code)
        return res.text
    
    
    
    def parser_details(text):
        """Parse the video metadata out of a detail page.

        Args:
            text: HTML source of a video detail page.

        Returns:
            dict: Keys ``video_url``, ``title``, ``content``, ``date`` and
            ``count``, each holding the captured text of its element.

        Raises:
            AttributeError: If any of the five patterns does not match,
                because ``re.search`` then returns ``None``.
        """
        # Video address. The dot before "mp4" is escaped and the capture may
        # not cross a double quote, so only a real quoted ".mp4" value matches.
        video_url = re.search(r'srcUrl="([^"]*?\.mp4)"', text).group(1)
        # Title
        title = re.search(r'<h1 class="video-tt">(.*?)</h1>', text).group(1)
        # Summary
        content = re.search(r'<div class="summary">(.*?)</div>', text).group(1)
        # Publication date
        date = re.search(r'<div class="date">(.*?)</div>', text).group(1)
        # Counter shown in the "fav" element
        count = re.search(r'<div class="fav" data-id=".*?">(.*?)</div>', text).group(1)

        return {
            "video_url": video_url,
            "title": title,
            "content": content,
            "date": date,
            "count": count,
        }
    
    
    
    
    
    def download_video(url,title):
        """Download one video into the local ``videos`` directory.

        The file is written to ``videos/<title>.mp4``. Characters that are
        not allowed in file names are replaced with ``_`` in the title first.
        A completion message is printed when the file has been written.

        Args:
            url: Direct address of the video file.
            title: Video title, used as the file name stem.

        Raises:
            requests.exceptions.RequestException: On network failure,
                including a timeout.
            OSError: If the directory or file cannot be written.
        """
        # exist_ok avoids the check-then-create race between pool workers
        # that all start before the directory exists.
        os.makedirs("videos", exist_ok=True)

        # ":" was already mapped to "_"; the other characters are also invalid
        # in file names on common platforms, or would change the target path.
        unsafe = '\\/:*?"<>|'
        safe_title = title.translate(str.maketrans(unsafe, "_" * len(unsafe)))
        filename = os.path.join("videos", safe_title) + ".mp4"

        # Stream the body so a large video is never held in memory as a whole.
        data = requests.get(url, stream=True, timeout=30)
        try:
            with open(filename, "wb") as f:
                for chunk in data.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        finally:
            data.close()
        print("%s download finished!" % title)
    
    
    
    
    
    if __name__ == '__main__':
        # Crawl the home page, then queue one download per detail page on a
        # pool of five worker threads.
        data = get_index()
        urls = parser_index(data)

        futures = []
        # Leaving the with-block waits for every queued download to finish.
        with ThreadPoolExecutor(5) as pool:
            for i in urls:
                t = get_details(i)
                dic = parser_details(t)
                future = pool.submit(download_video, dic["video_url"], dic["title"])
                futures.append((dic["title"], future))
                print("submit task", dic["title"])

            print("submit finished")

        # submit() stores a worker's exception on its future; if nobody reads
        # it, a failed download goes unnoticed.
        for title, future in futures:
            error = future.exception()
            if error is not None:
                print("%s download failed: %r" % (title, error))
    
    
    
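    # NOTE(review): the notes below presumably describe query parameters of a
    # category-listing request that this script does not use yet; confirm.

    # reqType=5   fixed value

    # categoryId  ID of the category

    # start       index of the item to start from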

    # 1.爬取首页数据,解析获取视频的详情链接# 2.遍历每一个详情链接,并访问# 3.从详情页面解析得到需要的数据 (视频链接,标题,详情,时间,收藏次数)
    import requestsimport reimport osfrom threading import Threadfrom concurrent.futures import  ThreadPoolExecutor

    base_url = "https://www.pearvideo.com/"
    def get_index():    res = requests.get(base_url,headers={        "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",        "referer": "https: // www.baidu.com / link?url = fUq54ztdrrLaIUXa - p6B9tuWXC3byFJCyBKuvuJ_qsPw8QLrWIfekFKGgmhqITyF & wd = & eqid = c5366da10000199a000000025c45768a"    })    return res.text
    def parser_index(text):    urls = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">',text)    urls = [base_url + i for i in  urls]    # print(urls)    return urls
    def get_details(url):    res = requests.get(url)    print(res.status_code)    return  res.text
    def parser_details(text):    # 视频的地址    video_url = re.search(r'srcUrl="(.*?.mp4)"',text).group(1)    # 标题    title = re.search('<h1 class="video-tt">(.*?)</h1>',text).group(1)    # 详情    content = re.search('<div class="summary">(.*?)</div>',text).group(1)    # 时间    date = re.search('<div class="date">(.*?)</div>', text).group(1)    # 点赞数量    count = re.search('<div class="fav" data-id=".*?">(.*?)</div>', text).group(1)
        return  {"video_url":video_url,"title":title,"content":content,"date":date,"count":count}

    def download_video(url,title):    data = requests.get(url)
        if not os.path.exists("videos"):        os.makedirs("videos")    filename = os.path.join("videos",title)+".mp4"    filename = filename.replace(":","_")
        with open(filename,"wb") as f:        f.write(data.content)    print("%s download finished!" % title)

    if __name__ == '__main__':    pool = ThreadPoolExecutor(5)    data = get_index()    urls = parser_index(data)    for i in urls:        t = get_details(i)        dic = parser_details(t)        # Thread(target=download_video,args=(dic["video_url"],dic["title"])).start()        pool.submit(download_video,dic["video_url"],dic["title"])        print("submit task",dic["title"])
        print("submit finished")
    # reqType=5 固定# categoryId 分类id# start 从第几个开始

  • 相关阅读:
    你的服务器和网站为什么会被反复入侵
    MAC 查看当前安装的JDK位置
    Jmeter高并发测试
    解密AndroidManifest.xml、AXMLPrinter2.jar源码下载
    Win10家庭版如何启用本地组策略
    SQLFlow使用中的注意事项--设置篇
    Sqlflow 之隐私政策(Privacy plolicy)介绍
    血缘关系分析工具SQLFLOW--实践指南
    Oracle SQL 性能优化利器
    SQLFlow数据流分析工具的job功能介绍
  • 原文地址:https://www.cnblogs.com/xuecaichang/p/10486447.html
Copyright © 2020-2023  润新知