# -*- coding: utf-8 -*-
#scrapy genspider zhihu www.zhihu.com
import json
import re
import time

import scrapy
from scrapy.http import FormRequest, Request

try:
    from PIL import Image
except ImportError:
    # Pillow is optional: without it the captcha is still saved to disk,
    # it just cannot be displayed automatically.
    Image = None


class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to www.zhihu.com using a manually typed captcha.

    Flow: ``start_requests`` downloads the captcha image, ``parser_captcha``
    saves/shows it and asks the operator for the text, ``login`` extracts the
    ``_xsrf`` token from the home page and posts the login form, and
    ``after_login`` reports the outcome.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']
    agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
    headers = {
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
        'User-Agent': agent,
    }

    # Login credentials. These are placeholder defaults; override them with
    # spider arguments (`scrapy crawl zhihu -a phone_num=... -a password=...`)
    # instead of committing real credentials to source control.
    phone_num = '1303922'
    password = '123456'

    def parse(self, response):
        """Default callback; intentionally does nothing."""
        pass

    def start_requests(self):
        """Return the initial request, which fetches the login captcha.

        Login must happen first, so Scrapy's ``start_requests`` is overridden
        instead of relying on ``start_urls``.
        """
        # Millisecond timestamp used as the `r` query parameter.
        t = str(int(time.time() * 1000))
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + '&type=login'
        return [Request(captcha_url, callback=self.parser_captcha, headers=self.headers)]

    def parser_captcha(self, response):
        """Save the captcha image, show it if possible, and ask for its text.

        Returns a request for the home page whose ``meta`` carries the typed
        captcha on to ``login``.
        """
        with open('captcha.jpg', 'wb') as f:
            f.write(response.body)

        # Displaying the image is best-effort; the operator can always open
        # captcha.jpg manually.
        if Image is None:
            self.logger.warning(
                'Pillow is not installed; open captcha.jpg manually to read the captcha.')
        else:
            try:
                with Image.open('captcha.jpg') as im:
                    im.show()
            except OSError as exc:
                self.logger.warning('Could not display captcha.jpg: %s', exc)

        captcha = input("输入验证码: >")
        return Request(
            url='https://www.zhihu.com/',
            callback=self.login,
            meta={'captcha': captcha},
            headers=self.headers,
        )

    def login(self, response):
        """Extract the ``_xsrf`` token from the page and submit the login form.

        If no token is found, an empty string is sent, as before.
        """
        # re.search finds the first `_xsrf` field without the greedy `.*`
        # prefix (and its backtracking over the whole page) that re.match needed.
        match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
        xsrf = match_obj.group(1) if match_obj else ''
        if match_obj:
            print('xsrf:' + xsrf)
        captcha = response.meta['captcha']
        print(captcha)
        return [FormRequest(
            'https://www.zhihu.com/login/phone_num',
            method='POST',
            formdata={
                'phone_num': self.phone_num,
                'password': self.password,
                '_xsrf': xsrf,
                'captcha_type': 'en',
                'captcha': captcha,
            },
            callback=self.after_login,
            headers=self.headers,
        )]

    def after_login(self, response):
        """Report whether the login succeeded.

        Success is a JSON object whose ``r`` field equals 0. A body that is
        not JSON, or that lacks ``r``, is treated as a failed login instead
        of raising.
        """
        try:
            result = json.loads(response.text)
        except ValueError:
            self.logger.error('Login response is not valid JSON (HTTP %s)', response.status)
            result = None

        if isinstance(result, dict) and result.get('r') == 0:
            print('success........登录成功')
        else:
            print('登录失败!')
scrapy shell 环境下:
scrapy shell https://www.zhihu.com/question/52117226/answer/257823105
response <500 https://www.zhihu.com/question/52117226/answer/257823105>
出现状态码 500 错误，是因为请求没有带上 User-Agent 请求头（header）。
解决方法:
scrapy shell -s USER_AGENT="Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36" https://www.zhihu.com/question/52117226/answer/257823105
状态变为:response <200 https://www.zhihu.com/question/52117226/answer/257823105>