• Common Scrapy crawler settings


    1. settings.py

      #!/usr/bin/python
      # -*- coding: utf-8 -*-
      """
      @author: yugengde
      @contact: yugengde@163.com
      @file : settings.py
      @time: 2017/11/22 15:41
      """

      BOT_NAME = 'pro'

      SPIDER_MODULES = ['pro.spiders']
      NEWSPIDER_MODULE = 'pro.spiders'

      # Ignore robots.txt
      ROBOTSTXT_OBEY = False

      # Throttle requests and disable cookies to look less like a bot
      DOWNLOAD_DELAY = 3
      COOKIES_ENABLED = False

      DOWNLOADER_MIDDLEWARES = {
          'pro.middlewares.PhantomJSMiddleware': 301,
          'pro.middlewares.UserAgentMiddleware': 300,
      }

      ITEM_PIPELINES = {
          'scrapy_redis.pipelines.RedisPipeline': 301,
          'pro.pipelines.DuplicatesPipeline': 300,
      }

      # Logging
      LOG_ENABLED = True
      LOG_ENCODING = 'utf-8'
      LOG_FILE = 'pro.log'
      LOG_LEVEL = 'DEBUG'
      # LOG_STDOUT =

      # scrapy-redis: Redis-backed scheduling and request deduplication
      SCHEDULER = "scrapy_redis.scheduler.Scheduler"
      DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
      REDIS_URL = 'redis://root:password@localhost:6379'
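
    With the scrapy-redis scheduler and dupefilter enabled above, spiders can read
    their start URLs from a shared Redis queue instead of hard-coding them. A minimal
    sketch, assuming a RedisSpider subclass; the spider name, redis_key, and parse
    logic are illustrative, not from the original project:

      # spiders/example.py -- minimal scrapy-redis spider (illustrative sketch)
      from scrapy_redis.spiders import RedisSpider


      class ExampleSpider(RedisSpider):
          name = 'example'
          # Requests are seeded by pushing URLs onto this Redis list
          redis_key = 'example:start_urls'

          def parse(self, response):
              # Assumed fields matching the DuplicatesPipeline below
              yield {
                  'title': response.css('title::text').extract_first(),
                  'item_id': response.url,
              }

    Seeding then happens from the Redis CLI, e.g. lpush example:start_urls
    https://example.com; every worker sharing the same REDIS_URL consumes from
    the same queue.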

    2. middlewares.py

      from selenium import webdriver
      from scrapy.http import HtmlResponse
      import random


      # Render the page with PhantomJS so JavaScript-generated content is available
      class PhantomJSMiddleware(object):
          @classmethod
          def process_request(cls, request, spider):
              driver = webdriver.PhantomJS(r'C:\InstallFile\Phantomjs\bin\phantomjs.exe')
              driver.get(request.url)
              content = driver.page_source.encode('utf-8')
              driver.quit()

              # Returning a response here short-circuits the normal downloader
              return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)


      # Rotate the User-Agent header on every outgoing request
      class UserAgentMiddleware(object):
          @classmethod
          def process_request(cls, request, spider):
              user_agents = [
                  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
                  "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
                  # entries can also be generated with fake_useragent's UserAgent()
              ]
              request.headers.setdefault('User-Agent', random.choice(user_agents))

     3. pipelines.py

      #!/usr/bin/python
      # -*- coding: utf-8 -*-

      from scrapy.exceptions import DropItem


      # Deduplicate scraped items and drop items that lack a title
      class DuplicatesPipeline(object):
          def __init__(self):
              self.ids_seen = set()

          def process_item(self, item, spider):
              if not item['title']:
                  raise DropItem("Missing title in %s" % item)

              if item['item_id'] in self.ids_seen:
                  raise DropItem("Duplicate item found: %s" % item)
              else:
                  self.ids_seen.add(item['item_id'])
                  # Must return (not yield) the item so Scrapy passes it on
                  return item
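
    The pipeline assumes every item carries title and item_id fields. A minimal
    matching Item definition (field names come from the pipeline; the class name
    ProItem is an assumption):

      # items.py -- fields referenced by DuplicatesPipeline
      import scrapy


      class ProItem(scrapy.Item):
          title = scrapy.Field()    # required; items without it are dropped
          item_id = scrapy.Field()  # used as the deduplication key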
  • Original post: https://www.cnblogs.com/liyugeng/p/7879544.html