用到的知识点
1、cookie
a、from scrapy.http.cookies import CookieJar #用这个包获取第一次访问时的cookie
b、cookie_odj = CookieJar() #实例化对象
cookie_odj.extract_cookies(response,response.request) #从响应中提取cookie并存入对象
cookie_dict = cookie_odj._cookies #获取cookie(内部结构是 域名→路径→名称 的嵌套字典,传给Request的cookies参数前需要整理成 {名称: 值} 的字典)
2、Request 传参
yield scrapy.Request(
url="https://dig.chouti.com/login", #地址
method="POST", #请求方式
body="phone=8619923803579&password=140709ben&oneMonth=1", #请求体
headers={"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8"},#请求头
cookies=cookies_dict, #cookie
callback=self.check_log, #回调函数
)
示例:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.cookies import CookieJar  # captures the cookies set by the first response


class CtiSpider(scrapy.Spider):
    """Log in to dig.chouti.com, then up-vote every link on every listing page.

    Callback chain:
        parse       -> save the first response's cookies and POST the login form
        check_log   -> re-request the front page after the login response
        good        -> send a vote request per link and follow pagination links
        check_good  -> print the body of each vote response
    """

    name = 'cti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']

    # Login form values. Scrapy copies ``-a key=value`` spider arguments onto
    # the instance, so these can be overridden without editing the file:
    #   scrapy crawl cti -a login_phone=... -a login_password=...
    # NOTE(review): real credentials should not be kept in source control.
    login_phone = "8619923803579"
    login_password = "0709ben"

    # Flat {cookie name: cookie value} mapping saved by parse() so the other
    # callbacks can attach the same cookies to their requests.
    cookies_dict = None

    def parse(self, response):
        """Save the cookies from the first response and submit the login form.

        Yields:
            A POST request to the login URL whose callback is ``check_log``.
        """
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Request(cookies=...) takes a flat {name: value} dict (or a list of
        # cookie dicts). The jar's private ``_cookies`` attribute is nested as
        # domain -> path -> name -> Cookie, so it must not be passed directly;
        # iterating the jar yields the individual Cookie objects instead.
        self.cookies_dict = {
            cookie.name: cookie.value for cookie in cookie_jar
        }

        # FormRequest URL-encodes the fields into the request body. A list of
        # pairs (rather than a dict) keeps the field order fixed.
        yield scrapy.FormRequest(
            url="https://dig.chouti.com/login",
            method="POST",
            formdata=[
                ("phone", self.login_phone),
                ("password", self.login_password),
                ("oneMonth", "1"),
            ],
            headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"},
            cookies=self.cookies_dict,
            callback=self.check_log,
        )

    def check_log(self, response):
        """Re-request the front page once the login response has arrived.

        The login response itself is not inspected here.
        """
        yield scrapy.Request(url="https://dig.chouti.com/", callback=self.good)

    def good(self, response):
        """Send a vote request for every link on the page, then follow pagination.

        Yields:
            One POST vote request per ``share-linkid`` attribute found, followed
            by one request per pagination link, handled again by ``good``.
        """
        link_ids = response.xpath("//div[@share-linkid]/@share-linkid").extract()
        for link_id in link_ids:
            yield scrapy.Request(
                url="https://dig.chouti.com/link/vote?linksId=%s" % link_id,
                method="POST",
                cookies=self.cookies_dict,
                callback=self.check_good,
            )

        page_hrefs = response.xpath("//div[@id='page-area']//a/@href").extract()
        for href in page_hrefs:
            # urljoin resolves the href against the current page URL, so both
            # relative and absolute links produce a valid request URL.
            yield scrapy.Request(url=response.urljoin(href), callback=self.good)

    def check_good(self, response):
        """Print the body of a vote response."""
        print(response.text)