随机替换请求头中的User-Agent
基于GitHub开源项目，实现User-Agent的动态切换和管理
https://github.com/hellysmile/fake-useragent
fake-useragent维护不同User-Agent的字段值
https://fake-useragent.herokuapp.com/browsers/0.1.8
middlewares.py
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    """Downloader middleware that fills in the User-Agent request header.

    The header value is read from a ``fake_useragent.UserAgent`` instance.
    Which attribute of that instance is read is controlled by the
    ``RANDOM_UA_TYPE`` setting, falling back to ``"random"`` when the
    setting is absent.
    """

    def __init__(self, crawler):
        """Create the User-Agent source and read ``RANDOM_UA_TYPE``.

        Args:
            crawler: Object exposing ``settings.get(name, default)``.
        """
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler`` (alternate constructor)."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Give ``request`` a User-Agent header if it does not have one.

        Args:
            request: Request whose ``headers`` mapping is updated in place.
            spider: Unused here.

        Returns:
            None.
        """
        # The attribute is looked up on every call, so each request gets a
        # freshly read value. setdefault leaves a User-Agent that is already
        # on the request untouched.
        request.headers.setdefault('User-Agent', getattr(self.ua, self.ua_type))
配置settings中downloader middleware的优先级
# Downloader middleware order for the ArticleSpider project.
DOWNLOADER_MIDDLEWARES = {
    'ArticleSpider.middlewares.JSPageMiddleware': 1,
    'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543,
    # None switches off Scrapy's built-in User-Agent middleware, leaving
    # RandomUserAgentMiddlware in charge of the header.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Name of the fake_useragent.UserAgent attribute that
# RandomUserAgentMiddlware reads for each request.
RANDOM_UA_TYPE = "random"