• 抖音网页版高清视频抓取教程(Selenium)


    废话不多说,直接上代码

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    import time
    import re
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    import uuid
    import os
    import requests
    
    
    # Chrome options: drop the switch and extension that mark the session as
    # automated, and present a regular desktop Chrome user agent.
    option = ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"'
    )

    # Shared driver instance used by the crawler below.
    browser = webdriver.Chrome(options=option)

    # Register a script for every new document that makes
    # navigator.webdriver read as undefined.
    browser.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'},
    )

    # Maximize the browser window.
    browser.maximize_window()
    
    def _close_extra_windows(main_handle):
        """Close every window except *main_handle* and focus *main_handle* again."""
        try:
            for handle in browser.window_handles:
                if handle != main_handle:
                    browser.switch_to.window(handle)
                    browser.close()
            browser.switch_to.window(main_handle)
        except Exception as e:
            # Cleanup is best effort; report and let the crawl loop continue.
            print(e)


    def douyincrawler(keyword):
        """Search Douyin's web site for *keyword* and download each video found.

        Opens the video search page, clicks the login button and waits for the
        user to scan the QR code with the Douyin mobile app, scrolls down five
        times to load more results, then opens every result in a new tab,
        extracts the video source URL and passes it to ``savevideo``.

        Args:
            keyword: Search term; it is percent-encoded before being placed in
                the URL path.

        Failures on an individual video are printed and skipped. Errors during
        the search/login phase propagate to the caller.
        """
        # Function-scope imports keep this block self-contained.
        from html import unescape
        from urllib.parse import quote

        url = (
            'https://www.douyin.com/search/' + quote(keyword, safe='')
            + '?publish_time=0&sort_type=0&source=switch_tab&type=video'
        )
        browser.get(url)
        # Click "log in"; the user then scans the QR code with the mobile app.
        browser.find_element(By.XPATH, '//*[@id="qdblhsHs"]/button').click()
        time.sleep(15)  # time allowed for the QR-code login

        for _ in range(5):  # scroll down repeatedly to load more results
            time.sleep(5)
            browser.execute_script("var q=document.documentElement.scrollTop=10000")
            if '服务出现异常' in browser.page_source:  # error page: reload
                browser.refresh()
            if '服务异常,重新' in browser.page_source:  # error banner: click "retry"
                browser.find_element(
                    By.XPATH, '//*[@id="dark"]/div[2]/div/div[3]/div[2]/div/div/span'
                ).click()

        # Anchor elements that link to each video's detail page.
        detail_links = browser.find_elements(
            By.XPATH, '//*[@id="dark"]/div[2]/div/div[3]/div[2]/ul/li/div/div/a[1]'
        )
        print('共计侦查到{}个视频数据'.format(len(detail_links)))

        main_handle = browser.current_window_handle
        for link in detail_links:
            try:
                # A JS click works even when the element cannot be clicked natively.
                browser.execute_script("arguments[0].click();", link)
                # Select the newly opened tab by excluding the main window rather
                # than relying on handle ordering.
                new_handles = [h for h in browser.window_handles if h != main_handle]
                browser.switch_to.window(new_handles[0])
                # Explicit wait for the <video> tag to appear.
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((
                    By.XPATH, '//*[@id="root"]/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div[1]/div/div[2]/div[2]/xg-video-container/video'
                )))
                # page_source is serialized HTML, so "&" inside the attribute is
                # written as "&amp;" and must be unescaped to get the real URL.
                src = unescape(
                    re.findall(r'<source class="" src="(.*?)"', browser.page_source)[0]
                )
                # Only protocol-relative URLs ("//host/...") need the scheme added.
                video_url = src if src.startswith('http') else 'https:' + src
                savevideo(video_url)
            except Exception as e:
                print(e)
            finally:
                # Always close the detail tab and return to the results page;
                # otherwise one failure leaves the driver on the wrong window
                # for every remaining video.
                _close_extra_windows(main_handle)
    
    
    
    def savevideo(video_url, video_dir=r'C:\Users\lvye\Desktop\dou_yin\video', timeout=30):
        """Download *video_url* into *video_dir* under a random ``<uuid4>.mp4`` name.

        Args:
            video_url: Direct URL of the video file.
            video_dir: Destination directory; created if it does not exist.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot hang the crawl.

        Returns:
            The full path of the file that was written.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-2xx HTTP status (an error page is never saved as a video).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
        }
        os.makedirs(video_dir, exist_ok=True)
        video_full_path = os.path.join(video_dir, str(uuid.uuid4()) + '.mp4')
        # Stream the body in chunks so a large video is not held in memory.
        with requests.get(url=video_url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(video_full_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        print('已下载:{}'.format(video_url))
        return video_full_path
    
    
    
    
    if __name__ == '__main__':
        try:
            douyincrawler('街拍美女')
        finally:
            # Always shut down Chrome and the chromedriver process, even when
            # the crawl raises, so no orphaned browser is left running.
            browser.quit()

    成果展示:

    注:该代码仅供技术交流与学习,请勿用于任何违法犯罪活动。

  • 相关阅读:
    多表连接查询
    从0开始独立完成企业级Java电商网站开发(服务端)
    ThreadLocal小试牛刀
    svn版本控制
    策略模式和工厂模式搭配使用
    MyBatis-Plus
    github骚操作
    【测试】 Java如何优雅的生成测试数据
    【随笔】开源之殇
    【Maven】maven 插件开发实战
  • 原文地址:https://www.cnblogs.com/lvye001/p/16054931.html
Copyright © 2020-2023  润新知