• Scrapy 知乎验证码


    # -*- coding: utf-8 -*-
    #scrapy genspider zhihu www.zhihu.com
    import json
    import re
    import time

    import scrapy
    from scrapy.http import FormRequest, Request

    try:
        from PIL import Image
    except ImportError:
        # Pillow is optional: without it the captcha image cannot be displayed
        # automatically and has to be opened by hand.
        Image = None


    class ZhihuSpider(scrapy.Spider):
        """Log in to Zhihu by phone number, solving the captcha manually.

        Request flow: download the captcha image -> ask the user to type it ->
        fetch the home page to read the ``_xsrf`` token -> POST the login form
        -> report the outcome.
        """

        name = 'zhihu'
        allowed_domains = ['www.zhihu.com']
        start_urls = ['http://www.zhihu.com/']
        agent = ('Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                 'Chrome/56.0.2924.87 Mobile Safari/537.36')
        headers = {
            "Host": "www.zhihu.com",
            "Referer": "https://www.zhihu.com/",
            'User-Agent': agent
        }
        # Placeholder credentials kept as class attributes so they are not
        # buried in the request code. Replace them with a real account;
        # presumably they can also be overridden with Scrapy spider arguments
        # (``-a phone_num=... -a password=...``) -- verify against Scrapy docs.
        phone_num = '1303922'
        password = '123456'

        def parse(self, response):
            """Default callback; intentionally does nothing in this example."""
            pass

        def start_requests(self):
            """Start the crawl by requesting a login captcha image.

            Login has to happen first, so Scrapy's default ``start_requests``
            is overridden instead of crawling ``start_urls``.
            """
            # Millisecond timestamp passed as the ``r`` query parameter.
            t = str(int(time.time() * 1000))
            captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login'
            return [Request(captcha_url, callback=self.parser_captcha,
                            headers=self.headers)]

        def parser_captcha(self, response):
            """Save the captcha image, show it, and ask the user to type it.

            Returns a request for the home page that carries the typed captcha
            in ``meta`` so that ``login`` can submit it.
            """
            # The ``with`` block closes the file; no explicit close() needed.
            with open('captcha.jpg', 'wb') as f:
                f.write(response.body)

            if Image is not None:
                with Image.open('captcha.jpg') as im:
                    im.show()
            else:
                print('未安装 PIL,请手动打开 captcha.jpg 查看验证码')

            captcha = input("输入验证码: >")
            return Request(url='https://www.zhihu.com/', callback=self.login,
                           meta={'captcha': captcha}, headers=self.headers)

        def login(self, response):
            """Extract the ``_xsrf`` token from the page and submit the login form."""
            match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text,
                                 re.DOTALL)
            # An empty token is sent when the page does not contain one.
            xsrf = ''
            if match_obj:
                xsrf = match_obj.group(1)
                print('xsrf:' + xsrf)
            print(response.meta['captcha'])
            return [FormRequest(
                'https://www.zhihu.com/login/phone_num',
                method='POST',
                formdata={
                    'phone_num': self.phone_num,
                    'password': self.password,
                    '_xsrf': xsrf,
                    'captcha_type': 'en',
                    'captcha': response.meta['captcha'],
                },
                callback=self.after_login,
                headers=self.headers,
            )]

        def after_login(self, response):
            """Print whether the login succeeded.

            The response is treated as a success only when it is a JSON object
            whose ``r`` field equals 0; a non-JSON body or a missing field is
            reported as a failed login instead of raising.
            """
            try:
                result = json.loads(response.text)
            except ValueError:
                result = {}
            if isinstance(result, dict) and result.get('r') == 0:
                print('success........登录成功')
            else:
                print('登录失败!')

    scrapy shell 环境下:

      scrapy shell https://www.zhihu.com/question/52117226/answer/257823105

      response   <500 https://www.zhihu.com/question/52117226/answer/257823105>

    出现状态码 500 错误,是因为请求没有带上 header(User-Agent)导致的。

    解决方法:

      >scrapy shell -s USER_AGENT="Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36" https://www.zhihu.com/question/52117226/answer/257823105

      状态变为:response   <200 https://www.zhihu.com/question/52117226/answer/257823105>

  • 相关阅读:
    Java线程死锁模拟
    Arrays Multi
    PHP Forms
    simple grammer
    有意义的命名 Meaningful names
    整洁代码
    XPath
    多态
    复用类
    访问控制权限
  • 原文地址:https://www.cnblogs.com/wlc297984368/p/7819998.html
Copyright © 2020-2023  润新知