Scrapy middleware
Spider middleware flow (the default template Scrapy generates):
from scrapy import signals

class WxappSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response before it is handed to the spider.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results the spider returns, before they are passed on.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or process_spider_input) raises an exception.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the spider's start requests, before they are scheduled.
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        # Called when the spider is opened.
        spider.logger.info('Spider opened: %s' % spider.name)
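To take effect, a spider middleware must be registered in settings.py, just like the downloader middlewares below. A minimal sketch, assuming the project is named wxapp and the class lives in wxapp/middlewares.py (the value is the priority):

SPIDER_MIDDLEWARES = {
    'wxapp.middlewares.WxappSpiderMiddleware': 543,
}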
Random request headers:
http://httpbin.org/user-agent echoes back the User-Agent your request carried, so you can use it to check your own.
import random

# User-Agent pool -- more strings at http://useragentstring.com/pages/useragentstring.php?name=Chrome
class UserAgentDownloaderMiddleware(object):
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request.
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
settings configuration:
DOWNLOADER_MIDDLEWARES = {
    'wxapp.middlewares.UserAgentDownloaderMiddleware': 543,
}
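The value (543 here) is the middleware's order: lower numbers sit closer to the engine, so process_request() runs in ascending order on the way out and process_response() in descending order on the way back.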
spider configuration:
import json
import scrapy

def parse(self, response):
    user_agent = json.loads(response.text)['user-agent']
    # Re-request the same URL; dont_filter=True bypasses the duplicate-request filter.
    yield scrapy.Request(self.start_urls[0], dont_filter=True)
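For context, a minimal runnable spider around that parse() method; the spider name is hypothetical, and it assumes UserAgentDownloaderMiddleware is enabled as above:

import json
import scrapy

class HttpbinUaSpider(scrapy.Spider):
    name = 'httpbin_ua'                            # hypothetical name
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        # httpbin echoes the User-Agent header it received as JSON.
        user_agent = json.loads(response.text)['user-agent']
        self.logger.info('Sent User-Agent: %s', user_agent)
        # Loops forever (stop with Ctrl+C); dont_filter=True bypasses the dupe filter.
        yield scrapy.Request(self.start_urls[0], dont_filter=True)

Run it with scrapy crawl httpbin_ua and the logged User-Agent should change between requests.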
IP proxy middleware:
When the site starts serving captchas: 1. solve/recognize them, or 2. switch to another proxy.
Proxy providers: Kuaidaili (快代理), etc.
http://httpbin.org/ip echoes the IP your request arrived from, i.e. the current proxy IP.
import random

class IPDownloaderMiddleware(object):
    # Prefer proxies that are elite (high-anonymity), support HTTPS, and are stable.
    PROXIES = [
        "http://ip:port",   # placeholders -- fill in real proxy addresses, scheme included
    ]

    def process_request(self, request, spider):
        # Route each outgoing request through a randomly chosen proxy.
        proxy = random.choice(self.PROXIES)
        request.meta['proxy'] = proxy
settings configuration:
DOWNLOADER_MIDDLEWARES = {
    'wxapp.middlewares.IPDownloaderMiddleware': 543,
}
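Tying back to the captcha note above, here is a hedged sketch of "switch proxies when a captcha appears" as a downloader middleware. The detection (looking for "captcha" in the body), the class name, and the retry limit are all assumptions, not a fixed recipe:

import random

class ProxyRetryDownloaderMiddleware(object):
    # Hypothetical middleware: retry through a different proxy when a
    # response looks like a captcha page.
    PROXIES = [
        "http://ip1:port",  # placeholders -- fill in real proxies
        "http://ip2:port",
    ]

    def process_response(self, request, response, spider):
        retries = request.meta.get('proxy_retries', 0)
        # Assumption: the target site's captcha page contains the word "captcha".
        if retries < 3 and b'captcha' in response.body.lower():
            retry_req = request.copy()
            retry_req.meta['proxy'] = random.choice(self.PROXIES)
            retry_req.meta['proxy_retries'] = retries + 1
            retry_req.dont_filter = True   # the URL was already seen; skip the dupe filter
            return retry_req               # returning a Request re-schedules it
        return response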
Dedicated-IP (exclusive proxy) mode:
import base64

class IPDownloaderMiddleware(object):
    # Prefer proxies that are elite (high-anonymity), support HTTPS, and are stable.
    def process_request(self, request, spider):
        proxy = "http://ip:port"          # placeholder -- the dedicated proxy address
        user_password = "name:password"   # placeholder -- the proxy credentials
        b64_user = base64.b64encode(user_password.encode('utf-8'))
        request.meta['proxy'] = proxy
        # Note the space after "Basic" -- without it the header is invalid.
        request.headers['Proxy-Authorization'] = 'Basic ' + b64_user.decode('utf-8')
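A quick sanity check of the header value, using the same placeholder credentials:

import base64

print('Basic ' + base64.b64encode(b'name:password').decode('utf-8'))
# -> Basic bmFtZTpwYXNzd29yZA==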
Note:
For text content use extract(); it returns a list of strings, so join and strip it:
content = "".join(content).strip()