• 模拟知乎登录(requests和scrapy)


    1. requests

      登录知乎需要向服务器提交的信息有:

        ①headers

        ②_xsrf

        ③captcha

      需要通过解析页面获得_xsrf和captcha(验证码)

      而有关captcha的获取则必须要用session的方式获得, 目的是为了使_xsrf和验证码信息一致

      (因为session中可以保存cookie, 保证数据的一致性)代码如下:

      1 import re
      2 import time
      3 import os.path
      4 import requests
      5 
      6 try:
      7     import cookielib
      8 except:
      9     import http.cookiejar as cookielib
     10 
     11 from PIL import Image
     12 
     # Shared HTTP session. Its cookie jar is backed by the file "cookies" so a
     # successful login can be reused on later runs without re-entering the
     # account and password.
     session = requests.session()
     session.cookies = cookielib.LWPCookieJar(filename="cookies")
     try:
         session.cookies.load(ignore_discard=True)
     except IOError:
         # Missing or unreadable cookie file (cookielib.LoadError is an IOError
         # subclass). Not fatal: the script falls back to a fresh login.
         print("cookies未能加载")

     agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
     # Alternative user agents kept for experimentation:
     # agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
     # agent = "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/57.0"

     # Request headers sent with every call made through ``session``.
     headers = {
         "Host": "www.zhihu.com",
         "Referer": "https://www.zhihu.com/",
         "User-Agent": agent,
     }
     29 
     30 
     def get_xsrf():
         """Fetch the Zhihu home page and extract the ``_xsrf`` form token.

         The request goes through the module-level ``session`` so the cookies
         issued with the page stay associated with the returned token.

         Returns:
             The token string, or ``None`` (after printing "error") when the
             page contains no ``name="_xsrf" value="..."`` attribute pair.
         """
         response = session.get("https://www.zhihu.com/", headers=headers)
         # Non-greedy capture: stop at the first closing quote so trailing
         # attributes on the same line are not swallowed into the token.
         match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
         if match_obj:
             return match_obj.group(1)
         print("error")
         return None
     39 
     40 
     def get_captcha():
         """Download the login captcha, show it, and return what the user types.

         The image is fetched through the module-level ``session`` and written
         to ``captcha.jpg`` in the current directory. Opening it in an image
         viewer is best-effort: if that fails, the absolute path of the file is
         printed so the user can open it manually.

         Returns:
             The captcha text entered by the user.
         """
         # Millisecond timestamp used as the ``r`` query parameter.
         t = str(int(time.time() * 1000))
         captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
         r = session.get(captcha_url, headers=headers)
         # The ``with`` block closes the file; no explicit close() is needed.
         with open('captcha.jpg', 'wb') as f:
             f.write(r.content)
         try:
             im = Image.open('captcha.jpg')
             im.show()
             im.close()
         except Exception:
             # Fall back to telling the user where the image was saved.
             print('请到 {0} 找到captcha.jpg手动输入'.format(os.path.abspath('captcha.jpg')))
         captcha = input("please input the captcha\n")
         return captcha
     56 
     57 
     def is_login():
         """Report whether the current session is already logged in.

         Requests the profile settings page with redirects disabled and treats
         any status other than 200 as "not logged in".

         Returns:
             ``True`` if the page answered with HTTP 200, otherwise ``False``.
         """
         profile_url = "https://www.zhihu.com/settings/profile"
         reply = session.get(profile_url, headers=headers, allow_redirects=False)
         return reply.status_code == 200
     66 
     67 
     def login(account, password):
         """Log in to Zhihu with an e-mail address or a phone number.

         An account containing ``@`` is posted to the e-mail endpoint; anything
         else is posted to the phone-number endpoint. The first attempt is made
         without a captcha; if the JSON reply has ``r == 1`` the captcha is
         fetched via ``get_captcha()`` and the login is retried once. The
         session cookies are then saved to disk.

         Args:
             account: E-mail address or phone number.
             password: Account password.
         """
         # Fetch the token once and reuse it, so the submitted token is the one
         # tied to this session's cookies.
         xsrf = get_xsrf()
         if '@' in account:
             print("邮箱登陆")
             post_url = "https://www.zhihu.com/login/email"
             post_data = {
                 "_xsrf": xsrf,
                 "password": password,
                 "email": account,
             }
         else:
             # Looks like a phone number: a "1" followed by ten digits.
             if re.match(r'^1\d{10}', account):
                 print("手机登陆")
             post_url = "https://www.zhihu.com/login/phone_num"
             post_data = {
                 "_xsrf": xsrf,
                 "password": password,
                 "phone_num": account,
             }
         # First attempt: no captcha.
         response = session.post(post_url, data=post_data, headers=headers)
         login_code = response.json()

         if login_code['r'] == 1:
             print("不输入验证码登陆失败")
             # The captcha-less attempt failed: fetch a captcha and retry.
             post_data["captcha"] = get_captcha()
             response = session.post(post_url, data=post_data, headers=headers)
             login_code = response.json()
             print(login_code['msg'])

         session.cookies.save()
    101 
    # Script entry point: reuse the saved cookies when they are still valid,
    # otherwise ask for credentials and log in.
    if __name__ == '__main__':
        if is_login():
            print("已经登陆!")
        else:
            # Local import: getpass is only needed on the interactive path.
            import getpass
            account = input("account: ")
            password = getpass.getpass("password: ")
            login(account, password)

    2. scrapy

      如果在scrapy中直接调用上文中的get_captcha()函数来获得验证码, 然后提交是无法登陆成功的, 原因是数据不一致,也就是说获取的_xsrf和验证码一起提交到服务器是不匹配的.

      scrapy机制是默认保存cookie的,所以可以通过两个request请求来将得到的信息保存在默认的cookie中,代码如下:

      

     1 # -*- coding: utf-8 -*-
     2 import re
     3 import json
     4 import datetime
     5 
     6 try:
     7     import urlparse as parse
     8 except:
     9     from urllib import parse
    10 
    11 import scrapy
    12 
    13 
    class ZhihuSpider(scrapy.Spider):
        """Spider that logs in to Zhihu before crawling ``start_urls``.

        The login is a chain of requests: fetch the sign-in page to read the
        ``_xsrf`` token, fetch the captcha image, then post the login form.
        Each step is scheduled from the previous step's callback.
        """

        name = "zhihu"
        allowed_domains = ["www.zhihu.com"]
        start_urls = ['https://www.zhihu.com/']

        # Headers sent with every request issued by this spider.
        headers = {
            "HOST": "www.zhihu.com",
            "Referer": "https://www.zhihu.com",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
        }

        def start_requests(self):
            """Start the crawl by requesting the sign-in page."""
            return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

        def login(self, response):
            """Extract ``_xsrf`` from the sign-in page and request the captcha.

            Yields nothing when no token is found, which ends the login chain.
            The partially filled form is passed to the next callback through
            ``meta["post_data"]``.
            """
            response_text = response.text
            match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
            xsrf = ''
            if match_obj:
                xsrf = match_obj.group(1)

            if xsrf:
                # Credentials are left blank here; fill them in before running.
                post_data = {
                    "_xsrf": xsrf,
                    "phone_num": "",
                    "password": "",
                    "captcha": ""
                }

                import time
                # Millisecond timestamp used as the ``r`` query parameter.
                t = str(int(time.time() * 1000))
                captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
                yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha)

        def login_after_captcha(self, response):
            """Save the captcha image, ask the user for its text, post the form.

            Returns:
                A one-element list with the ``FormRequest`` that submits the
                login form; its response is handled by ``check_login``.
            """
            # The ``with`` block closes the file; no explicit close() is needed.
            with open("captcha.jpg", "wb") as f:
                f.write(response.body)

            from PIL import Image
            try:
                im = Image.open('captcha.jpg')
                im.show()
                im.close()
            except Exception:
                # Showing the image is best-effort; the file is already on disk.
                self.logger.warning("could not display captcha.jpg; open it manually")

            captcha = input("输入验证码\n>")

            post_data = response.meta.get("post_data", {})
            post_url = "https://www.zhihu.com/login/phone_num"
            post_data["captcha"] = captcha
            return [scrapy.FormRequest(
                url=post_url,
                formdata=post_data,
                headers=self.headers,
                callback=self.check_login
            )]

        def check_login(self, response):
            """Check the login reply and, on success, start crawling.

            The reply is parsed as JSON; when its ``msg`` field equals
            "登录成功", one request per entry in ``start_urls`` is yielded.
            """
            text_json = json.loads(response.text)
            if "msg" in text_json and text_json["msg"] == "登录成功":
                for url in self.start_urls:
                    yield scrapy.Request(url, dont_filter=True, headers=self.headers)

      

  • 相关阅读:
    ping-tool
    yum 安装 5.6
    音视频编辑
    图表
    VC2013设置输出文件目录
    hdu 4679 Terrorist’s destroy 树形DP
    poj 3580 SuperMemo splay tree(重口味)
    hdu 1890 Robotic Sort splaytree+懒惰标记
    bzoj 1588 [HNOI2002]营业额统计 splay tree
    bzoj 1503 [NOI2004]郁闷的出纳员 splay tree
  • 原文地址:https://www.cnblogs.com/fenglj/p/7891500.html
Copyright © 2020-2023  润新知