• [Crawler] Get the real file links of user-shared BaiduYun files with Chrome


    Get the real file link behind a BaiduYun file shared by a user.

    with Python 2.7 + Selenium + Chrome driver

    We finally arrived at a viable approach after several unsatisfactory attempts; one of those attempts is described here:

    http://www.cnblogs.com/ghostr/p/5823191.html

    • There is still plenty of room to improve performance, for example by adding threading.

    • History is a SQLite database file that can easily be parsed with the sqlite3 module. You can browse the data with DB Browser for SQLite, or take a quick look from Python as in the sketch below.
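
    A quick way to confirm that History really is a plain SQLite file is to open it with the standard sqlite3 module. This is only a minimal sketch: the path is an example, and you should work on a copy of the file (or close Chrome first), since Chrome keeps the database locked while it is running.

    import sqlite3

    history_path = "d:/userData/Default/History"  # example path, adjust to your setup
    conn = sqlite3.connect(history_path)
    cursor = conn.cursor()
    # List the tables Chrome keeps in this database (downloads, downloads_url_chains, urls, ...)
    cursor.execute("select name from sqlite_master where type='table'")
    print([name for (name,) in cursor.fetchall()])
    conn.close()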

    # -*- coding: utf-8 -*-
    #----------------------------
    # Author: Kun Liu         
    # Start date: 2017-03-10 
    # Latest edit: 2017-03-13
    # Email: lancelotdev@163.com
    #=============================
    # Read baiduyun file links from chrome history file
    
    """
    ### 解决方案:
        1. 制定user data目录,通过 selenium 模拟 chrome 浏览器创建下载任务,但并不完成下载。
        2. 解析 userdata 中的 History 获取真实资源链接。
    
    ### Note:
    1. 未做资源链接去重处理。
    2. 存在多次访问后出现的验证问题,待研究。
    """
    
    from __future__ import absolute_import
    from __future__ import division
    from __future__ import print_function
    from __future__ import unicode_literals
    
    import time
    import os
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.alert import Alert
    
    from FileItem import FileItem
    
    # Dedicated Chrome profile directory; the History file parsed later lives under Default/ inside it.
    user_data_dir_path = "d:/userData"
    
    options = webdriver.ChromeOptions()
    options.add_argument("user-data-dir=%s" % user_data_dir_path)
    
    
    # Visit every share URL so that Chrome records a download entry in its History.
    def baiduyun_url_travel(share_url_list=None):
        if not share_url_list:
            return
        driver = webdriver.Chrome(chrome_options=options)
        # Warm up the user data (cookies etc.) so the share pages load consistently afterwards.
        driver.get(share_url_list[0])
        for url in share_url_list:
            driver.get(url)
            time.sleep(3)
            # Spoof navigator.platform, presumably so the share page serves a direct
            # browser download instead of pushing the BaiduYun desktop client.
            js_str = "Object.defineProperty(Object.getPrototypeOf(navigator),'platform',{get:function(){return 'sb_baidu';}})"
            driver.execute_script(js_str)
            # XPath of the download button on the share page.
            download_btn_xpath = '//*[@id="layoutMain"]/div[1]/div[1]/div/div[2]/div/div/div[2]/a[2]'
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_btn_xpath))
                )
            except Exception:
                # Fall back to a direct lookup; this raises if the button really is missing.
                element = driver.find_element_by_xpath(download_btn_xpath)
            element.click()
            time.sleep(5)
        driver.quit()
    
    
    # 2017-03-13  Liu Kun
    # The 'History' file is a sqlite database.
    # Some download links redirect to other URLs, and Chrome records the whole redirect
    # chain; here I take the direct (first) link of the chain rather than the final target.
    def get_source_link_from_history(History_path):
        import sqlite3 as db
        conn = db.connect(History_path)
        cursor = conn.cursor()
        sql = "select id, chain_index, url from downloads_url_chains where chain_index=0"
        rows = cursor.execute(sql).fetchall()
        items = []
        for row in rows:
            id, _, file_link = row
            sql = "select current_path, start_time from downloads where id=%d"%int(id)
            file_info = cursor.execute(sql).fetchone()
            if file_info:
                current_path, time_stamp = file_info
                time_stamp = str(time_stamp)
                # e.g. C:\Users\kun_liu\Downloads\shadowsocks-nightly-3.2.7.apk.crdownload
                file_name = current_path.split('\\')[-1].replace('.crdownload', '')
                # Chrome stores start_time as microseconds since 1601-01-01 UTC (WebKit time),
                # so shift it to the Unix epoch before formatting.
                x = time.localtime(int(time_stamp) // 1000000 - 11644473600)
                start_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
                item = FileItem(file_name, file_link, start_time)
                items.append(item.make_dic())
        return items
    
    if __name__ == "__main__":
        # Movie:https://pan.baidu.com/s/1sl8litZ #App:https://pan.baidu.com/s/1o8K255K
        share_url = ["https://pan.baidu.com/s/1sl8litZ", "https://pan.baidu.com/s/1dFBr37F", "https://pan.baidu.com/s/1o8K255K"]
        baiduyun_url_travel(share_url)
        History_path = os.path.join(user_data_dir_path, "Default", "History")
        items = get_source_link_from_history(History_path)
        import pprint
        pprint.pprint(items)
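
    One detail worth calling out: Chrome's History file stores downloads.start_time as WebKit time, i.e. microseconds since 1601-01-01 UTC, not as a Unix timestamp, which is why the value is shifted by 11644473600 seconds before being formatted. A standalone sketch of that conversion, with a made-up sample value:

    import time

    WEBKIT_EPOCH_OFFSET = 11644473600  # seconds between 1601-01-01 and 1970-01-01

    def webkit_to_unix(webkit_microseconds):
        # Convert Chrome's WebKit timestamp (microseconds since 1601) to Unix seconds.
        return webkit_microseconds // 1000000 - WEBKIT_EPOCH_OFFSET

    sample = 13131648000000000  # made-up WebKit timestamp, roughly early 2017
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(webkit_to_unix(sample))))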
        
        
    

    FileItem.py:

    # -*- coding: utf-8 -*-
    #----------------------------
    # Author: Kun Liu         
    # Start date: 2017-03-13  
    # Latest edit: 2017-03-13
    #=============================
    
    from __future__ import absolute_import
    from __future__ import division
    from __future__ import print_function
    from __future__ import unicode_literals
    import pprint
    
    class FileItem:
    	def __init__(self, file_name="", file_link="", catch_time= ""):
    		self.file_name = file_name
    		self.file_link = file_link
    		self.file_time = catch_time
    
    	def make_dic(self):
    		info_dic = {"file_name":self.file_name, "link":self.file_link, "time":self.file_time}
    		return info_dic
    		
    if __name__ == "__main__":
    	pass
    
    

  • Original post: https://www.cnblogs.com/learn-to-rock/p/6542515.html