• 登录专用_ +提取


    import rsa
    import re
    import json
    import time
    import requests
    import base64
    import urllib3
    import binascii
    import xlwt
    from lxml import etree
    urllib3.disable_warnings()


    class Weibo_login():
    def __init__(self, username, password):
    self.user_name = username
    self.pass_word = password
    self.session = requests.session()

    def login(self):
    json_data = self.get_json_data() # 第一次请求, 获取服务端返回的参数
    if not json_data:
    return False
    pass_word = self.get_password(json_data['servertime'], json_data['nonce'], json_data['pubkey'])

    post_data = {
    'entry': 'weibo',
    'gateway': '1',
    'from': '',
    'savestate': '7',
    'qrcode_flag': 'false',
    'useticket': '1',
    # 'pagerefer': 'https://www.baidu.com/',
    'vsnf': '1',
    'su': self.username, # 通过bs64加密后的得到的
    'service': 'miniblog',
    'servertime': json_data['servertime'],
    'nonce': json_data['nonce'],
    'pwencode': 'rsa2',
    'rsakv': json_data['rsakv'],
    'sp': pass_word, # 通过rsa加密过的pass_word
    'sr': '1920*1080',
    'encoding': 'UTF-8',
    'prelt': 28,
    'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
    'returntype': 'META'
    }
    login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
    response_data = self.session.post(login_url, data=post_data, verify=False)

    print(response_data.content.decode('gbk')) # 先转成二进制, 然后转成dbk

    # 第一次跳转
    url1 = re.findall(r'location.replace("(.*?)")', response_data.content.decode('gbk'))[0] # 加上[0]就会变的 只剩下想要的数据
    print("要打印的url:", url1)

    # 打印,获取第二次跳转的url
    result = self.session.get(url1).content.decode('gbk')
    print("result", result)

    # 第三次跳转
    url1 = re.findall(r"location.replace('(.*?)')", result)[0]
    print("打印url1******************:", url1)

    # 访问url1
    self.session.get(url1, verify=False)
    # print(self.session.get("https://weibo.com/6779766492", verify=False).content.decode()) # 可以随意访问已经登录的网页
    # print("cookies",self.session.cookies)
    # print("hreader:",self.session.headers)
    return self.session

    #帐号加密
    def get_password(self, servertime, nonce, pubkey):
    s = str(servertime) + ' ' + str(nonce) + ' ' + str(self.pass_word)
    public_key = rsa.PublicKey(int(pubkey, 16), int('10001', 16))
    password = rsa.encrypt(s.encode(), public_key)
    # print(binascii.b2a_hex(password))
    return binascii.b2a_hex(password).decode()

    #密码加密
    def get_username(self):
    self.username = base64.b64encode(self.user_name.encode()) # 对传进去的进行编码

    def get_json_data(self, ):
    '''
    #这个就是按下tab以后, 产生的get请求
    前面的请求就是为了给后面的post请求铺路
    '''
    params = {
    'entry': 'weibo',
    'callback': 'sinaSSOController.preloginCallBack',
    'su': self.username,
    'rsakt': 'mod',
    'checkpin': '1',
    'client': 'ssologin.js(v1.4.19)',
    '_': int(time.time() * 1000),
    }
    response = self.session.get('https://login.sina.com.cn/sso/prelogin.php?', params=params,
    verify=False) # ?号后面的都是一些参数

    try:
    json_loads = json.loads(re.findall(r'preloginCallBack((.*?))', response.text)[0]) # ****重点来了(的意思是从preloginCallBack(开始 ,到)结束的所有都要 (.*?)匹配全部
    except:
    json_loads = {}
    # print(json_loads)
    return json_loads


    class get_location():

    def __init__(self):
    self.headers = {

    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'SINAGLOBAL=686853348606.9484.1541139120819; SCF=AkhFCrKCUq9wsyl3f_fDwgGgFRpQSCclo3gd_NQHuXjdyEEjbRsY9PC-MgFG6xjCDbPu4IOAIOCXyPLirzjFWdo.; SUHB=00xZwj0czv30Gl; Ugrow-G0=56862bac2f6bf97368b95873bc687eef; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; YF-Page-G0=86b4280420ced6d22f1c1e4dc25fe846; YF-V5-G0=a9b587b1791ab233f24db4e09dad383c; wb_view_log_6382564064=1920*10801; _s_tentry=-; Apache=2872156489358.7036.1542848804714; ULV=1542848804736:6:6:3:2872156489358.7036.1542848804714:1542706013869; _T_WM=3e8b91cb0fc1afc6f6b197111618f23a; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9Wh3SFjwsRGoBE8ZBHKOX.cV5JpVF02RehnESKe41h24; SUB=_2AkMsqrpQdcPxrAVSkPoQym_iaolH-jyff9OmAn7uJhMyAxgv7nNeqSVutBF-XJVarBSeGUU5IPLleV-08g-rwQpV; login_sid_t=a2b0e92dbac53ded77a3b3a4328efdaa; cross_origin_proto=SSL; WBStorage=f44cc46b96043278|undefined; UOR=,,login.sina.com.cn; wb_view_log=1920*10801',
    'Host': 'weibo.com',
    'Referer': 'https://weibo.com/2889942201/GFo7omJqz?type=comment',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'

    }


    self.f = xlwt.Workbook(encoding='utf-8')
    self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)

    def get_html(self, url):
    try:
    response = requests.get(url, headers=self.headers)

    if response.status_code == 200:
    return response.text
    else:
    print(response.status_code)
    time.sleep(0.5)
    return self.get_html(url)
    except requests.ConnectionError as e:
    print('Error', e.args)


    def parse(self, html): # 解析
    print(type(html))
    id = set(re.findall('usercard="id=(d{10})"',html.replace('\','')))
    return id




    def location_parse(self,response): #红点的坐标拼接url.
    print(response.content.decode())
    res = response.content.decode()

    selector = etree.HTML(res)

    scripts = selector.xpath('.//script')
    for script in scripts:
    try:
    text = script.xpath('.//text()')
    text = ''.join(text)
    text = text.replace('//', '/')
    text = re.findall(r'<div.*/div>', text)[0]
    text = text.replace(r' ', ' ').replace(r' ', ' ').replace(r'"', '"').replace(r'/', '/')

    se = etree.HTML(text)

    span = se.xpath('.//span[@class = "item_text W_fl"]/text()')
    for s in span:
    s = s.replace(' ', '').replace(' ', '')
    print("*"*50,s)
    except:
    pass



    def excel_write(self, text_ip):

    self.sheet1.write(self.i, 0, text_ip)
    self.i += 1
    self.f.save(r'C:Users85740Desktopget_ip.xls') # 保存


    if __name__ == '__main__':
    login = Weibo_login('18082539819', '25257758') # 先设置账号密码
    login.get_username() # 给传过去的账号密码进行加密
    login.get_json_data() # 开始请求
    session = login.login()


    a = get_location()
    url1 = 'https://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=4298641625417843&max_id=4309096804900746&page={}'
    oneself = 'https://weibo.com/{}'

    for i in range(1,10):
    page_url = url1.format(i)
    id_list = a.parse(a.get_html(page_url))
    for id in id_list:
    private_url = oneself.format(id)
    print("详细页面的url", private_url)
    a.location_parse(session.get(private_url, headers= a.headers, verify=False))

    #session.get("https://weibo.com/6779766492", verify=False).content.decode()
  • 相关阅读:
    Pills
    项链
    IIS7.5 关于ASP连接ACCESS数据库超时问题
    清明
    The New Beginning
    新的开始
    Mysql中文乱码以及导出为sql语句和Excel问题解决
    Spring学习笔记01. 入门知识,IoC/DI
    selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PAT
    Django框架 python3.7+django2.2 报错:AttributeError: ‘str’ object has no attribute ‘decode’解决办法
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/10005513.html
Copyright © 2020-2023  润新知