python爬虫:登录百度账户,并上传文件到百度云盘


    /* Created by resolvewang on 2017/4/15. */
    /**
     * Build a random identifier shaped like a version-4 UUID whose first group
     * has seven hex digits instead of eight.
     *
     * Every "x" in the template is replaced by a random hex digit and the "y"
     * by one of 8, 9, a or b.
     *
     * NOTE(review): the lines closing this function were missing from the
     * listing and are restored here as plain closers. Confirm against the real
     * login.js whether the result is post-processed (e.g. upper-cased).
     */
    function getGid() {
        return "xxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, function (e) {
            var t = 16 * Math.random() | 0, n = "x" == e ? t : 3 & t | 8;
            return n.toString(16);
        });
    }
    /**
     * Return a JSONP callback name: the prefix "bd__cbs__" followed by a random
     * integer below 2^31 written in base 36.
     */
    function getCallback() {
        return "bd__cbs__" + Math.floor(2147483648 * Math.random()).toString(36);
    }


    #-*- coding:utf-8 -*-
    __author__ = 'Administrator'
    import time
    import json
    import re
    import requests
    import execjs
    import base64
    from urllib.parse import urlencode
    from requests_toolbelt import MultipartEncoder
    from Crypto.Cipher import PKCS1_v1_5
    from Crypto.PublicKey import RSA
    from hashlib import md5
    from zlib import crc32
    # Default request headers; get_token() later adds Referer/Accept/Connection/Host.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    }
    # Module-wide session shared by every request in this script.
    session = requests.session()
    # Load the home page once up front (presumably to seed the session cookies).
    session.get('https://pan.baidu.com', headers=headers)
    class BufferReader(MultipartEncoder):
        """Multipart encoder that reports progress each time it is read.

        :param fields: form fields, passed through to ``MultipartEncoder``
        :param boundary: multipart boundary, passed through to ``MultipartEncoder``
        :param callback: optional callable invoked after every ``read`` as
            ``callback(*cb_args, **cb_kwargs)``; ``cb_kwargs`` always carries
            ``size`` (the encoder's ``_len``) and ``progress`` (bytes read so far)
        :param cb_args: extra positional arguments for *callback*
        :param cb_kwargs: extra keyword arguments for *callback*
        """

        def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
            self._callback = callback
            self._progress = 0
            self._cb_args = cb_args
            self._cb_kwargs = cb_kwargs or {}
            super(BufferReader, self).__init__(fields, boundary)

        def read(self, size=None):
            """Read up to *size* bytes of the encoded body and report progress."""
            chunk = super(BufferReader, self).read(size)
            self._progress += len(chunk)
            self._cb_kwargs.update({
                'size': self._len,
                'progress': self._progress,
            })
            if self._callback:
                try:
                    self._callback(*self._cb_args, **self._cb_kwargs)
                except Exception:
                    # Progress reporting is best effort: a failing callback
                    # must not abort the upload.
                    pass
            return chunk
    def _get_runntime():
        """Compile ``login.js`` with the default PyExecJS runtime.

        ``login.js`` is opened relative to the current working directory. Keep
        the script free of Chinese text: per the original author, PyExecJS seems
        to have trouble with it.

        :return: the compiled JS context (see the PyExecJS documentation for its API)
        """
        # PhantomJS must be reachable through the environment; otherwise
        # configure its concrete path for PyExecJS.
        runtime = execjs.get()
        with open('login.js', 'r') as f:
            source = f.read()
        return runtime.compile(source)
    def get_gid():
        """Return a new ``gid`` produced by ``getGid`` in the compiled login script."""
        js_context = _get_runntime()
        return js_context.call('getGid')
    def get_callback():
        """Return a new JSONP callback name produced by ``getCallback`` in the login script."""
        js_context = _get_runntime()
        return js_context.call('getCallback')
    def _get_curtime():
        return int(time.time()*1000)
    # 抓包也不是百分百可靠啊,这里?getapi一定要挨着https://passport.baidu.com/v2/api/写,才会到正确的路由
    def get_token(gid, callback):
        """Request the login token from the passport ``getapi`` endpoint.

        Also adds Referer/Accept/Connection/Host entries to the module-level
        ``headers`` dict, which later requests reuse.

        :param gid: identifier sent as the ``gid`` query parameter
        :param callback: JSONP callback name; it must appear in the reply
        :return: the token string, or ``None`` when the request fails or the
            reply cannot be unwrapped
        """
        cur_time = _get_curtime()
        get_data = {
            'tpl': 'netdisk',
            'subpro': 'netdisk_web',
            'apiver': 'v3',
            'tt': cur_time,
            'class': 'login',
            'gid': gid,
            'logintype': 'basicLogin',
            'callback': callback,
        }
        headers.update(dict(Referer='http://pan.baidu.com/', Accept='*/*', Connection='keep-alive', Host='passport.baidu.com'))
        resp = session.get(url='https://passport.baidu.com/v2/api/?getapi', params=get_data, headers=headers)
        if resp.status_code == 200 and callback in resp.text:
            # The reply is JSONP: callback(<payload>). The parentheses must be
            # escaped in the pattern, otherwise the whole text is captured and
            # json.loads fails on the callback name.
            matched = re.search(r'.*?\((.*)\)', resp.text)
            if matched:
                # The payload uses single quotes; JSON only accepts double quotes.
                data = json.loads(matched.group(1).replace("'", '"'))
                return data.get('data').get('token')
        print('获取token失败')
        return None
    def get_rsa_key(token, gid, callback):
        """Fetch the RSA public key used to encrypt the password.

        :param token: login token returned by ``get_token``
        :param gid: identifier sent as the ``gid`` query parameter
        :param callback: JSONP callback name; it must appear in the reply
        :return: ``(pubkey, key)`` taken from the reply, or ``None`` when the
            request fails or the reply cannot be unwrapped
        """
        cur_time = _get_curtime()
        get_data = {
            'token': token,
            'tpl': 'netdisk',
            'subpro': 'netdisk_web',
            'apiver': 'v3',
            'tt': cur_time,
            'gid': gid,
            'callback': callback,
        }
        resp = session.get(url='https://passport.baidu.com/v2/getpublickey', headers=headers, params=get_data)
        if resp.status_code == 200 and callback in resp.text:
            # JSONP reply: capture only what is between the parentheses (they
            # must be escaped, or the callback name ends up in the capture).
            matched = re.search(r'.*?\((.*)\)', resp.text)
            if matched:
                # The payload uses single quotes; JSON only accepts double quotes.
                data = json.loads(matched.group(1).replace("'", '"'))
                return data.get('pubkey'), data.get('key')
        print('获取rsa key失败')
        return None
    def encript_password(password, pubkey):
        """RSA-encrypt *password* with *pubkey* and return it base64-encoded.

        The listing also described an equivalent approach based on the ``rsa``
        package (``rsa.PublicKey.load_pkcs1_openssl_pem`` + ``rsa.encrypt``);
        this implementation uses the ``Crypto`` imports already present at the
        top of the file.

        :param password: plain-text password (str)
        :param pubkey: PEM public key (str) as returned by ``get_rsa_key``
        :return: base64 text of the PKCS#1 v1.5 ciphertext
        """
        # importKey expects the key as bytes.
        pub = RSA.importKey(pubkey.encode('utf-8'))
        encryptor = PKCS1_v1_5.new(pub)
        # The plaintext must be bytes as well.
        encript_passwd = encryptor.encrypt(password.encode('utf-8'))
        return base64.b64encode(encript_passwd).decode('utf-8')
    def login(token, gid, callback, rsakey, username, password):
        """Post the login form to the passport ``?login`` endpoint.

        :param token: login token returned by ``get_token``
        :param gid: identifier sent as the ``gid`` form field
        :param callback: JSONP callback name (sent prefixed with ``parent.``)
        :param rsakey: the ``key`` value returned by ``get_rsa_key``
        :param username: account name
        :param password: password already encrypted with ``encript_password``
        :return: ``True`` when the reply contains ``err_no=0``, else ``False``
        """
        post_data = {
            'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
            'charset': 'utf-8',
            'token': token,
            'tpl': 'netdisk',
            'subpro': 'netdisk_web',
            'apiver': 'v3',
            'tt': _get_curtime(),
            'codestring': '',
            'safeflg': 0,
            'u': 'http://pan.baidu.com/disk/home',
            'isPhone': '',
            'detect': 1,
            'gid': gid,
            'quick_user': 0,
            'logintype': 'basicLogin',
            'logLoginType': 'pc_loginBasic',
            'idc': '',
            'loginmerge': 'true',
            'foreignusername': '',
            'username': username,
            'password': password,
            'mem_pass': 'on',
            # key returned together with the public key
            'rsakey': rsakey,
            'crypttype': 12,
            'ppui_logintime': 33554,
            'countrycode': '',
            'callback': 'parent.' + callback,
        }
        resp = session.post(url='https://passport.baidu.com/v2/api/?login', data=post_data, headers=headers)
        # NOTE(review): the body of this branch was missing from the listing; it
        # is restored with the status messages used by the class-based variant
        # later in this file.
        if 'err_no=0' in resp.text:
            print('登录成功')
            return True
        print('登录失败')
        return False
    def upload(dest_path, file_handle, token):
        """Upload a file to the PCS ``file`` endpoint as multipart form data.

        :param dest_path: destination path on the server, including the file name
        :param file_handle: file content or binary file object to send
        :param token: value sent as the ``bdstoken`` query parameter
        :return: requests.Response
        """
        params = {
            'method': 'upload',
            'app_id': "250528",
            'BDUSS': session.cookies['BDUSS'],
            't': str(int(time.time())),
            'bdstoken': token,
            'path': dest_path,
            'ondup': "newcopy",
        }
        # The multipart file name is just the current timestamp.
        files = {'file': (str(int(time.time())), file_handle)}
        url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
        api = '%s?%s' % (url, urlencode(params))
        body = BufferReader(files)
        header = {
            "Referer": "http://pan.baidu.com/disk/home",
            "User-Agent": "netdisk;;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
            # The encoder chooses the boundary, so it also dictates the content type.
            "Content-Type": body.content_type,
        }
        # NOTE(review): verify=False disables TLS certificate verification.
        response = session.post(api, data=body, verify=False, headers=header)
        return response
    def rapidupload(dest_path, file_handler, token):
        """Attempt a "rapid upload" by sending only the file's checksums.

        :param dest_path: destination path on the server, including the file name
        :type dest_path: str
        :param file_handler: seekable binary file object, e.g. ``open('file', 'rb')``
        :type file_handler: file
        :param token: value sent as the ``bdstoken`` query parameter
        :return: requests.Response

        .. note::
            Per the original notes, when the content already exists on the
            server nothing is uploaded and the reply carries the file's
            metadata (path, size, ctime, mtime, md5, fs_id, isdir, request_id);
            otherwise the file has to be uploaded normally. The notes also list
            files under 256 KB (slice-md5 == content-md5) and an already
            existing remote file as special cases.
        """
        _BLOCK_SIZE = 2 ** 20
        # Measure the file, then rewind so the checksums cover it from the start.
        file_handler.seek(0, 2)
        content_length = file_handler.tell()
        file_handler.seek(0)

        # The verification slice is the first 256 KB.
        first_256bytes = file_handler.read(256 * 1024)
        slice_md5 = md5(first_256bytes).hexdigest()
        content_crc32 = crc32(first_256bytes)
        content_md5 = md5(first_256bytes)
        while True:
            block = file_handler.read(_BLOCK_SIZE)
            if not block:
                break
            # Fold the block into the running crc32 and md5.
            content_crc32 = crc32(block, content_crc32)
            content_md5.update(block)

        params = {
            'method': 'rapidupload',
            'app_id': "250528",
            'BDUSS': session.cookies['BDUSS'],
            't': str(int(time.time())),
            'bdstoken': token,
            'path': dest_path,
            'ondup': "newcopy",
        }
        data = {
            'content-length': content_length,
            'content-md5': content_md5.hexdigest(),
            'slice-md5': slice_md5,
            'content-crc32': '%d' % (content_crc32 & 0xFFFFFFFF),
        }
        header = {
            "Referer": "http://pan.baidu.com/disk/home",
            "User-Agent": "netdisk;;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
        }
        url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
        api = '%s?%s' % (url, urlencode(params))
        # NOTE(review): verify=False disables TLS certificate verification.
        response = session.post(api, data=data, verify=False, headers=header)
        return response
    if __name__ == '__main__':
        # Script entry: log in with the credentials below.
        user = 'xxx'  # account name
        password = 'xxx'  # account password
        cur_gid = get_gid()
        cur_callback = get_callback()
        cur_token = get_token(cur_gid, cur_callback)
        rsa_info = get_rsa_key(cur_token, cur_gid, cur_callback)
        if rsa_info is None:
            # get_rsa_key already printed the reason; stop instead of failing
            # on the tuple unpacking below.
            raise SystemExit(1)
        cur_pubkey, cur_key = rsa_info
        # Use a separate name so the encript_password function is not rebound.
        encrypted_password = encript_password(password, cur_pubkey)
        login(cur_token, cur_gid, cur_callback, cur_key, user, encrypted_password)
        # Usage example:
        # res = upload("/hello/temp.txt", open("temp.txt", 'rb'), cur_token)
        # print(res.content.decode('utf-8'))
      1 #-*- coding:utf-8 -*-
      2 __author__ = 'Administrator'
      4 import time
      5 import json
      6 import re
      7 import requests
      8 import execjs
      9 import base64
     10 from urllib.parse import urlencode
     11 from requests_toolbelt import MultipartEncoder
     12 from Crypto.Cipher import PKCS1_v1_5
     13 from Crypto.PublicKey import RSA
     14 from hashlib import md5
     15 from zlib import crc32
     16 # import progressbar
     17 import sys
     18 from contextlib import closing
     19 import time
     20 import os
     21 from io import BytesIO
     # Best effort: silence urllib3 warnings (the requests in this listing are
     # sent with verify=False). Failure to do so is not fatal.
     try:
         requests.packages.urllib3.disable_warnings()
     except Exception:
         pass
     28 # class BufferReader(MultipartEncoder):
     29 #     """将multipart-formdata转化为stream形式的Proxy类
     30 #     """
     31 #     def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
     32 #         self._callback = callback
     33 #         self._progress = 0
     34 #         self._cb_args = cb_args
     35 #         self._cb_kwargs = cb_kwargs or {}
     36 #         super(BufferReader, self).__init__(fields, boundary)
     37 #
     38 #     def read(self, size=None):
     39 #         chunk = super(BufferReader, self).read(size)
     40 #         self._progress += int(len(chunk))
     41 #         self._cb_kwargs.update({
     42 #             'size': self._len,
     43 #             'progress': self._progress
     44 #         })
     45 #         if self._callback:
     46 #             try:
     47 #                 self._callback(*self._cb_args, **self._cb_kwargs)
     48 #             except:  # catches exception from the callback
     49 #                 # raise CancelledError('The upload was cancelled.')
     50 #                 pass
     51 #         return chunk
     class BufferReader(BytesIO):
         """In-memory byte stream that reports how much of it has been read.

         :param filebytes: the complete payload as bytes
         :param callback: optional callable invoked after every ``read`` as
             ``callback(total_size, bytes_read_so_far)``
         """

         def __init__(self, filebytes, callback=None):
             self._callback = callback
             self._progress = 0
             self._size = len(filebytes)
             super(BufferReader, self).__init__(filebytes)

         def read(self, size=-1):
             """Read up to *size* bytes (everything left when negative or ``None``).

             The requested size is honoured, so the object follows the normal
             file-object contract; HTTP clients that read in fixed-size blocks
             still trigger one progress callback per block.
             """
             chunk = super(BufferReader, self).read(size)
             self._progress += len(chunk)
             if self._callback:
                 self._callback(self._size, self._progress)
             return chunk
     class PCSBase():
         """Logged-in Baidu PCS session shared by the transfer helpers.

         Constructing an instance performs network I/O: it loads the
         pan.baidu.com home page, requests a login token and posts the
         credentials. Afterwards ``self.user`` holds ``'token'`` and ``'BDUSS'``
         (``None`` when the login failed).
         """

         def __init__(self, username, password):
             self.session = requests.session()
             self.headers = {
                 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
                               '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
             }
             self.session.get('https://pan.baidu.com', headers=self.headers)
             self.username = username
             self.password = password
             self.user = {}
             self.cur_gid = self.get_gid()
             self.cur_callback = self.get_callback()
             self.cur_time = self._get_curtime()
             self._initiate()  # log in and collect the session cookies

         def _initiate(self):
             """Fetch the login token, then log in."""
             self.user['token'] = self.get_token()
             self.login()

         def _get_runntime(self):
             """Compile ``login.js`` (read from the working directory) with PyExecJS.

             Keep the script free of Chinese text: per the original author,
             PyExecJS seems to have trouble with it.

             :return: the compiled JS context
             """
             # PhantomJS must be reachable through the environment; otherwise
             # configure its concrete path for PyExecJS.
             runtime = execjs.get()
             with open('login.js', 'r') as f:
                 source = f.read()
             return runtime.compile(source)

         def get_gid(self):
             """Return a new ``gid`` produced by ``getGid`` in login.js."""
             return self._get_runntime().call('getGid')

         def get_callback(self):
             """Return a new JSONP callback name produced by ``getCallback`` in login.js."""
             return self._get_runntime().call('getCallback')

         def _get_curtime(self):
             """Return the current Unix time in whole milliseconds."""
             return int(time.time() * 1000)

         @staticmethod
         def _parse_jsonp(text):
             """Return the decoded payload of a ``callback(<payload>)`` reply.

             The payload's single quotes are turned into double quotes first,
             because JSON only accepts the latter. Returns ``None`` when *text*
             contains no parenthesised payload.
             """
             # The parentheses must be escaped; unescaped they are just groups
             # and the callback name ends up inside the captured text.
             matched = re.search(r'\((.*)\)', text, re.S)
             if matched is None:
                 return None
             return json.loads(matched.group(1).replace("'", '"'))

         # Packet captures are not fully reliable: "?getapi" has to directly
         # follow https://passport.baidu.com/v2/api/ to reach the right route.
         def get_token(self):
             """Request the login token; return it, or ``None`` on failure.

             Also adds Referer/Accept/Connection/Host to ``self.headers``.
             """
             get_data = {
                 'tpl': 'netdisk',
                 'subpro': 'netdisk_web',
                 'apiver': 'v3',
                 'tt': self.cur_time,
                 'class': 'login',
                 'gid': self.cur_gid,
                 'logintype': 'basicLogin',
                 'callback': self.cur_callback,
             }
             self.headers.update(dict(Referer='http://pan.baidu.com/', Accept='*/*', Connection='keep-alive', Host='passport.baidu.com'))
             resp = self.session.get(url='https://passport.baidu.com/v2/api/?getapi', params=get_data, headers=self.headers)
             if resp.status_code == 200 and self.cur_callback in resp.text:
                 data = self._parse_jsonp(resp.text)
                 if data is not None:
                     return (data.get('data') or {}).get('token')
             print('获取token失败')
             return None

         def get_rsa_key(self):
             """Fetch the RSA key; return ``(pubkey, key)``, or ``None`` on failure."""
             get_data = {
                 'token': self.user['token'],
                 'tpl': 'netdisk',
                 'subpro': 'netdisk_web',
                 'apiver': 'v3',
                 'tt': self.cur_time,
                 'gid': self.cur_gid,
                 'callback': self.cur_callback,
             }
             resp = self.session.get(url='https://passport.baidu.com/v2/getpublickey', headers=self.headers, params=get_data)
             if resp.status_code == 200 and self.cur_callback in resp.text:
                 data = self._parse_jsonp(resp.text)
                 if data is not None:
                     return data.get('pubkey'), data.get('key')
             print('获取rsa key失败')
             return None

         def encript_password(self, pubkey):
             """RSA-encrypt ``self.password`` with *pubkey*; return base64 text.

             An equivalent alternative noted by the original author uses the
             ``rsa`` package (``rsa.PublicKey.load_pkcs1_openssl_pem`` followed
             by ``rsa.encrypt``).
             """
             # importKey expects the key as bytes.
             pub = RSA.importKey(pubkey.encode('utf-8'))
             encryptor = PKCS1_v1_5.new(pub)
             # The plaintext must be bytes as well.
             encript_passwd = encryptor.encrypt(self.password.encode('utf-8'))
             return base64.b64encode(encript_passwd).decode('utf-8')

         def login(self):
             """Post the login form and store the ``BDUSS`` cookie in ``self.user``.

             ``self.user['BDUSS']`` is set to ``None`` when the RSA key cannot
             be fetched or the reply does not contain ``err_no=0``.
             """
             rsa_info = self.get_rsa_key()
             if rsa_info is None:
                 # Without a key there is nothing to post; report a failed
                 # login instead of crashing on the tuple unpacking.
                 print('登录失败')
                 self.user['BDUSS'] = None
                 return
             cur_pubkey, cur_key = rsa_info
             encript_password = self.encript_password(cur_pubkey)
             post_data = {
                 'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
                 'charset': 'utf-8',
                 'token': self.user['token'],
                 'tpl': 'netdisk',
                 'subpro': 'netdisk_web',
                 'apiver': 'v3',
                 'tt': self.cur_time,
                 'codestring': '',
                 'safeflg': 0,
                 'u': 'http://pan.baidu.com/disk/home',
                 'isPhone': '',
                 'detect': 1,
                 'gid': self.cur_gid,
                 'quick_user': 0,
                 'logintype': 'basicLogin',
                 'logLoginType': 'pc_loginBasic',
                 'idc': '',
                 'loginmerge': 'true',
                 'foreignusername': '',
                 'username': self.username,
                 'password': encript_password,
                 'mem_pass': 'on',
                 # key returned together with the public key
                 'rsakey': cur_key,
                 'crypttype': 12,
                 'ppui_logintime': 33554,
                 'countrycode': '',
                 'callback': 'parent.' + self.cur_callback,
             }
             resp = self.session.post(url='https://passport.baidu.com/v2/api/?login', data=post_data, headers=self.headers)
             if 'err_no=0' in resp.text:
                 print('登录成功')
                 self.user['BDUSS'] = self.session.cookies['BDUSS']
             else:
                 print('登录失败')
                 self.user['BDUSS'] = None

         def _request(self, url, data=None, files=None, extra_params=None, callback=None):
             """Send an authenticated request to a PCS endpoint.

             POSTs *data* as a form when given, otherwise POSTs *files* as
             multipart form data; with neither, issues a streaming GET.

             :param url: endpoint URL
             :param data: form fields for a plain POST
             :param files: multipart fields, as accepted by urllib3's
                 ``encode_multipart_formdata``
             :param extra_params: query parameters added to the common ones
             :param callback: progress callback for multipart uploads, invoked
                 as ``callback(total_size, bytes_sent_so_far)``
             :return: requests.Response
             """
             params = {
                 'app_id': "250528",
                 'BDUSS': self.user['BDUSS'],
                 't': str(int(time.time())),
                 'bdstoken': self.user['token'],
             }
             if extra_params:
                 params.update(extra_params)
             header = {
                 "Referer": "http://pan.baidu.com/disk/home",
                 "User-Agent": "netdisk;;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
             }
             # NOTE(review): verify=False disables TLS certificate verification.
             if not (data or files):
                 return self.session.get(url, params=params, verify=False, headers=header, stream=True)
             api = '%s?%s' % (url, urlencode(params))
             if data:
                 return self.session.post(api, data=data, verify=False, headers=header)
             filedata, contenttype = requests.packages.urllib3.filepost.encode_multipart_formdata(files)
             # Wrap the encoded bytes so the upload is streamed and reports
             # progress; the listing posted an undefined name here.
             body = BufferReader(filedata, callback=callback)
             header.update({"Content-Type": contenttype})
             return self.session.post(api, data=body, verify=False, headers=header)


     class PCS(PCSBase):
         """Upload, rapid-upload and download operations on top of ``PCSBase``."""

         def __init__(self, username, password):
             self.username = username
             self.password = password
             super(PCS, self).__init__(self.username, self.password)

         def upload(self, remote_path, file_handler, callback=None):
             """Upload content to *remote_path* as multipart form data.

             :param remote_path: destination path, including the file name
             :param file_handler: file content (bytes) to send
             :param callback: optional progress callback, see ``_request``
             :return: requests.Response
             """
             params = {
                 'method': 'upload',
                 'path': remote_path,
                 'ondup': "newcopy",
             }
             # The multipart file name is just the current timestamp.
             files = {'file': (str(int(time.time())), file_handler)}
             url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
             return self._request(url, files=files, extra_params=params, callback=callback)

         @staticmethod
         def _rapid_digests(file_handler, callback=None):
             """Compute the checksum fields that a rapid upload submits.

             :param file_handler: seekable binary file object
             :param callback: optional callable invoked once per 1 MB read
                 (including the final empty read) as
                 ``callback(size=<file size>, progress=<reads so far> * 1 MB)``
             :return: dict with ``content-length``, ``content-md5``,
                 ``slice-md5`` (md5 of the first 256 KB) and ``content-crc32``
             """
             block_size = 2 ** 20  # 1 MB
             # Measure the file, then rewind before hashing.
             file_handler.seek(0, 2)
             content_length = file_handler.tell()
             file_handler.seek(0)

             # The verification slice is the first 256 KB.
             head = file_handler.read(256 * 1024)
             slice_md5 = md5(head).hexdigest()
             content_crc32 = crc32(head)
             content_md5 = md5(head)

             count = 1
             while True:
                 block = file_handler.read(block_size)
                 if callback:
                     callback(size=content_length, progress=count * block_size)
                 count += 1
                 if not block:
                     break
                 # Fold the block into the running crc32 and md5.
                 content_crc32 = crc32(block, content_crc32)
                 content_md5.update(block)
             return {
                 'content-length': content_length,
                 'content-md5': content_md5.hexdigest(),
                 'slice-md5': slice_md5,
                 'content-crc32': '%d' % (content_crc32 & 0xFFFFFFFF),
             }

         def rapid_upload(self, remote_path, file_handler, callback=None):
             """Attempt a rapid upload by sending only the file's checksums.

             :param remote_path: destination path, including the file name
             :param file_handler: seekable binary file object
             :param callback: optional progress callback, see ``_rapid_digests``
             :return: requests.Response
             """
             params = {
                 'method': "rapidupload",
                 'path': remote_path,
                 'ondup': "newcopy",
             }
             url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
             data = self._rapid_digests(file_handler, callback)
             return self._request(url, data=data, extra_params=params, callback=callback)

         def download(self, remote_path, local_path, callback=None):
             """Stream *remote_path* into the local file *local_path*.

             Progress is printed through ``progressbar``; *callback* is accepted
             but not used.
             """
             params = {
                 'method': "download",
                 'path': remote_path,
             }
             # pcs.baidu.com remains supported; per the original note the newer
             # d.pcs.baidu.com domain gives faster, more stable downloads.
             url = 'https://{0}/rest/2.0/pcs/file'.format('d.pcs.baidu.com')
             with closing(self._request(url, extra_params=params)) as response:
                 chunk_size = 1024  # bytes requested per iteration
                 # The header may be absent; 0 then means "size unknown".
                 total_size = int(response.headers.get('content-length', 0))
                 written = 0
                 with open(local_path, 'wb') as file:
                     for data in response.iter_content(chunk_size=chunk_size):
                         file.write(data)
                         written += len(data)
                         self.progressbar(size=total_size, progress=written, progress_title="正在下载", finish_title="下载完成")

         def progressbar(self, size=None, progress=None, progress_title="正在上传", finish_title="上传完成"):
             """Write a percentage line to stdout.

             :param size: total number of bytes
             :param progress: number of bytes transferred so far
             :param progress_title: prefix used while ``progress < size``
             :param finish_title: prefix used once the transfer is complete
                 (also used when *size* is zero or missing)
             """
             if size and progress < size:
                 percent = int((progress / size) * 100)
                 sys.stdout.write(progress_title + "" + str(percent) + ' % ' + " ")
                 sys.stdout.flush()
             else:
                 # Finished, or the size is unknown: report 100 without dividing.
                 sys.stdout.write(finish_title + "" + str(100) + ' % ' + " ")


     if __name__ == '__main__':
         username = "xxx"
         password = "xxx"
         pcs = PCS(username, password)
         with open("login.js", 'rb') as fh:
             res = pcs.upload("/hello/word.js", fh.read(), callback=pcs.progressbar)
         print(res.content.decode('utf-8'))
         with open("login.js", 'rb') as fh:
             res = pcs.rapid_upload("/hello/word.js", fh, callback=pcs.progressbar)
         print(res.content.decode('utf-8'))
         pcs.download("/hello/word.js", "temp.js")
