• [Crawler] Get the real file links of user-shared BaiduYun files with Chrome


    Get the real file link behind a BaiduYun file shared by a user.

    with Python 2.7 + Selenium + Chrome driver

    We finally arrived at a viable approach after several unsatisfactory attempts; one of those attempts is described here:

    http://www.cnblogs.com/ghostr/p/5823191.html

    • There is still plenty of room to improve performance, for example by adding threading.

    • History is a SQLite database file that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite, or take a quick look from Python as in the sketch below.
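
    A quick way to confirm that History really is a plain SQLite file is to open it with the standard sqlite3 module. This is only a minimal sketch: the path is an example, and you should work on a copy of the file (or close Chrome first), since Chrome keeps the database locked while it is running.

    import sqlite3

    history_path = "d:/userData/Default/History"  # example path, adjust to your setup
    conn = sqlite3.connect(history_path)
    cursor = conn.cursor()
    # List the tables Chrome keeps in this database (downloads, downloads_url_chains, urls, ...)
    cursor.execute("select name from sqlite_master where type='table'")
    print([name for (name,) in cursor.fetchall()])
    conn.close()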

    # -*- coding: utf-8 -*-
    #----------------------------
    # Author: Kun Liu         
    # Start date: 2017-03-10 
    # Latest edit: 2017-03-13
    # Email: lancelotdev@163.com
    #=============================
    # Read baiduyun file links from chrome history file
    
    """
    ### 解决方案:
        1. 制定user data目录,通过 selenium 模拟 chrome 浏览器创建下载任务,但并不完成下载。
        2. 解析 userdata 中的 History 获取真实资源链接。
    
    ### Note:
    1. 未做资源链接去重处理。
    2. 存在多次访问后出现的验证问题,待研究。
    """
    
    from __future__ import absolute_import
    from __future__ import division
    from __future__ import print_function
    from __future__ import unicode_literals
    
    import time
    import os
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.alert import Alert
    
    from FileItem import FileItem
    
    # Dedicated Chrome profile directory; the History file parsed later lives under Default/ inside it.
    user_data_dir_path = "d:/userData"
    
    options = webdriver.ChromeOptions()
    options.add_argument("user-data-dir=%s" % user_data_dir_path)
    
    
    # Visit every share URL so that Chrome records a download entry in its History.
    def baiduyun_url_travel(share_url_list=None):
        if not share_url_list:
            return
        driver = webdriver.Chrome(chrome_options=options)
        # Warm up the user data (cookies etc.) so the share pages load consistently afterwards.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            # Spoof navigator.platform, presumably so the share page serves a direct
            # browser download instead of pushing the BaiduYun desktop client.
            js_str = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"
            driver.execute_script(js_str)
            # XPath of the download button on the share page.
            download_btn_xpath = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_btn_xpath))
                )
            except Exception:
                # Fall back to a direct lookup; this raises if the button really is missing.
                element = driver.find_element_by_xpath(download_btn_xpath)
            element.click()
            time.sleep(5)
        driver.quit()
    
    
    # 2017-03-13  Liu Kun
    # The 'History' file is a sqlite database.
    # Some download links redirect to other URLs, and Chrome records the whole redirect
    # chain; here I take the direct (first) link of the chain rather than the final target.
    def get_source_link_from_history(History_path):
        import sqlite3 as db
        conn = db.connect(History_path)
        cursor = conn.cursor()
        sql = "select id, chain_index, url from downloads_url_chains where chain_index=0"
        rows = cursor.execute(sql).fetchall()
        items = []
        for row in rows:
            id, _, file_link = row
            sql = "select current_path, start_time from downloads where id=%d"%int(id)
            file_info = cursor.execute(sql).fetchone()
            if file_info:
                current_path, time_stamp = file_info
                time_stamp = str(time_stamp)
                # e.g. C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
                file_name = current_path.split('\\')[-1].replace('.crdownload', '')
                # Chrome stores start_time as microseconds since 1601-01-01 UTC (WebKit time),
                # so shift it to the Unix epoch before formatting.
                x = time.localtime(int(time_stamp) // 1000000 - 11644473600)
                start_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
                item = FileItem(file_name, file_link, start_time)
                items.append(item.make_dic())
        return items
    
    if __name__ == "__main__":
        # Movie:https://pan.baidu.com/s/1sl8litZ #App:https://pan.baidu.com/s/1o8K255K
        share_url = ["https://pan.baidu.com/s/1sl8litZ", "https://pan.baidu.com/s/1dFBr37F", "https://pan.baidu.com/s/1o8K255K"]
        baiduyun_url_travel(share_url)
        History_path = os.path.join(user_data_dir_path, "Default", "History")
        items = get_source_link_from_history(History_path)
        import pprint
        pprint.pprint(items)
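
    One detail worth calling out: Chrome's History file stores downloads.start_time as WebKit time, i.e. microseconds since 1601-01-01 UTC, not as a Unix timestamp, which is why the value is shifted by 11644473600 seconds before being formatted. A standalone sketch of that conversion, with a made-up sample value:

    import time

    WEBKIT_EPOCH_OFFSET = 11644473600  # seconds between 1601-01-01 and 1970-01-01

    def webkit_to_unix(webkit_microseconds):
        # Convert Chrome's WebKit timestamp (microseconds since 1601) to Unix seconds.
        return webkit_microseconds // 1000000 - WEBKIT_EPOCH_OFFSET

    sample = 13131648000000000  # made-up WebKit timestamp, roughly early 2017
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(webkit_to_unix(sample))))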
        
        
    

    FileItem.py:

    # -*- coding: utf-8 -*-
    #----------------------------
    # Author: Kun Liu         
    # Start date: 2017-03-13  
    # Latest edit: 2017-03-13
    #=============================
    
    from __future__ import absolute_import
    from __future__ import division
    from __future__ import print_function
    from __future__ import unicode_literals
    import pprint
    
    class FileItem:
    	def __init__(self, file_name="", file_link="", catch_time= ""):
    		self.file_name = file_name
    		self.file_link = file_link
    		self.file_time = catch_time
    
    	def make_dic(self):
    		info_dic = {"file_name":self.file_name, "link":self.file_link, "time":self.file_time}
    		return info_dic
    		
    if __name__ == "__main__":
    	pass
    
    

  • Original post: https://www.cnblogs.com/learn-to-rock/p/6542515.html