一、dupefilter去重
1、编写类
dupefilters.py
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


# Custom deduplication rule
class XdbDupeFilter(BaseDupeFilter):
    """In-memory dedupe filter: remembers the fingerprint of every request seen.

    NOTE(review): the set lives only in memory, so the dedupe state is lost
    between crawler runs — confirm that is acceptable for this project.
    """

    def __init__(self):
        # Fingerprints of all requests seen so far in this run.
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        # Scrapy hook: construct the filter from crawler settings.
        return cls()

    def request_seen(self, request):
        """Return True to drop *request* as a duplicate, False to let it pass."""
        fd = request_fingerprint(request=request)  # fd is the unique id of the URL
        if fd in self.visited_fd:
            return True
        self.visited_fd.add(fd)
        # Fix: was an implicit ``None`` fall-through; return False explicitly
        # to match the documented request_seen contract.
        return False

    def open(self):  # can return deferred
        print('爬虫开始')

    def close(self, reason):  # can return a deferred
        print('爬虫结束')

    # def log(self, request, spider):
    #     # log that a request has been filtered
    #     print('日志')
2、配置
settings.py
# 修改默认的去重规则 # DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter' #默认 DUPEFILTER_CLASS = 'xdb.dupefilters.XdbDupeFilter'
3、爬虫使用
函数中可以通过dont_filter设置是否遵循去重规则
chouti.py
class ChoutiSpider(scrapy.Spider):
    """Follow the pagination links of dig.chouti.com, demonstrating dont_filter."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        print(response.request.url)
        # item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        # for item in item_list:
        #     text = item.xpath('.//a/text()').extract_first()
        #     href = item.xpath('.//a/@href').extract_first()

        # Fix: the import used to sit inside the for loop and was re-executed
        # on every iteration; hoisted so it runs once per parse call.
        from scrapy.http import Request

        # Pagination hrefs are relative ("/all/hot/recent/2"), so prefix the host.
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Honor the dedupe rule:
            # yield Request(url=page,callback=self.parse,dont_filter=False)  # https://dig.chouti.com/all/hot/recent/2
            # Bypass the dedupe rule:
            yield Request(url=page,callback=self.parse,dont_filter=True)  # https://dig.chouti.com/all/hot/recent/2
二、深度查询
配置文件settings进行如下设置即可:
# 限制深度 DEPTH_LIMIT = 3
三、cookie
方式一、
import scrapy
from scrapy.http.cookies import CookieJar
from scrapy.http import Request
from urllib.parse import urlencode


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']
    # Flat name -> value store of cookies harvested from the first response.
    cookie_dict = {}

    def parse(self, response):
        """
        Handle the response of the first visit to chouti: harvest its cookies,
        then replay them on a POST login request.
        :param response: the response for the initial GET of start_urls
        :return: yields the login Request (handled by self.check_login,
                 defined elsewhere in this class — not visible in this chunk)
        """
        # Extract cookies from the response headers into a CookieJar object.
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # 1. Flatten the jar into a plain dict.
        # NOTE(review): iterates scrapy's private ``_cookies`` mapping
        # (domain -> path -> name -> Cookie) — fragile across scrapy versions.
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value

        # 2. Send the login request carrying the harvested cookies.
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body="phone=8613122765216&password=dsjfoejfih&oneMonth=1",# body=urlencode({phone:8613122765216,password:dsjfoejfih,oneMonth:1})
            cookies=self.cookie_dict,
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            },
            callback=self.check_login
        )
方式二:meta