• Spark源码分析 – SparkEnv


    SparkEnv在两个地方会被创建, 由于SparkEnv中包含了很多重要的模块, 比如BlockManager, 所以SparkEnv很重要
    Driver端, 在SparkContext初始化的时候, SparkEnv会被创建

      // Build the driver-side Spark execution environment (cache, map output tracker, etc.)
      // and publish it through SparkEnv.set.
      private[spark] val env = {
        // Driver host/port are read from system properties, host first, then port.
        val driverHost = System.getProperty("spark.driver.host")
        val driverPort = System.getProperty("spark.driver.port").toInt
        // "<driver>" marks this environment as the driver's; executors pass their executor id.
        SparkEnv.createFromSystemProperties("<driver>", driverHost, driverPort, true, isLocal)
      }
      SparkEnv.set(env)

    Executor端, 在executor初始化时被创建

      // Build the executor-side Spark environment (system properties were read earlier)
      // and publish it through SparkEnv.set.
      val env = SparkEnv.createFromSystemProperties(
        executorId,
        slaveHostname,
        port = 0,
        isDriver = false,
        isLocal = false)
      SparkEnv.set(env)

     

    SparkEnv Class

    用于持有(hold)Spark运行时的所有环境对象, 包括serializer, Akka actor system, block manager和map output tracker等

    /**
     * Container for the runtime environment objects of a running Spark instance (either master
     * or worker): the serializers, the Akka actor system, the block manager, the map output
     * tracker and the other services listed in the constructor.
     *
     * Spark code currently locates the SparkEnv through a thread-local variable, so every thread
     * that touches these objects must have the right SparkEnv set. Read the current environment
     * with SparkEnv.get (e.g. after creating a SparkContext) and install one with SparkEnv.set.
     */
    class SparkEnv(
        val executorId: String,
        val actorSystem: ActorSystem,
        val serializerManager: SerializerManager,
        val serializer: Serializer,
        val closureSerializer: Serializer,
        val cacheManager: CacheManager,
        val mapOutputTracker: MapOutputTracker,
        val shuffleFetcher: ShuffleFetcher,
        val broadcastManager: BroadcastManager,
        val blockManager: BlockManager,
        val connectionManager: ConnectionManager,
        val httpFileServer: HttpFileServer,
        val sparkFilesDir: String,
        val metricsSystem: MetricsSystem)

    SparkEnv Object

    Scala用伴生对象(companion object)提供类级别的接口(相当于Java中的static方法)
    除了基本的get和set之外,
    主要就是在createFromSystemProperties中创建了一批很关键的对象

    /**
     * Companion of the SparkEnv class: stores the current SparkEnv (per thread, with a
     * last-set fallback) and builds new environments from Java system properties.
     */
    object SparkEnv extends Logging {
      // Thread-local slot, so each thread reads back only the SparkEnv it set itself.
      private val env = new ThreadLocal[SparkEnv]
      // Most recently set SparkEnv from any thread; @volatile so other threads can observe it.
      @volatile private var lastSetSparkEnv: SparkEnv = _

      /**
       * Records `e` as the most recently set environment and binds it to the calling thread.
       */
      def set(e: SparkEnv): Unit = {
        lastSetSparkEnv = e
        env.set(e)
      }

      /**
       * Returns the ThreadLocal SparkEnv, if non-null. Else returns the SparkEnv
       * previously set in any thread.
       */
      def get: SparkEnv = {
        // Fall back to the last environment set anywhere when this thread has none.
        Option(env.get()).getOrElse(lastSetSparkEnv)
      }

      /**
       * Returns the ThreadLocal SparkEnv only (no fallback to the last-set environment).
       */
      def getThreadLocal: SparkEnv = {
        env.get()
      }

      /**
       * Builds a SparkEnv, reading configurable class names from Java system properties.
       *
       * @param executorId identifier stored in the environment; the driver passes "<driver>"
       * @param hostname   host handed to AkkaUtils.createActorSystem
       * @param port       port handed to AkkaUtils.createActorSystem
       * @param isDriver   passed to BroadcastManager and selects the "driver" or "executor"
       *                   metrics system
       * @param isLocal    passed to BlockManagerMasterActor
       * @return the newly assembled SparkEnv (not yet registered via `set`)
       */
      def createFromSystemProperties(
          executorId: String,
          hostname: String,
          port: Int,
          isDriver: Boolean,
          isLocal: Boolean): SparkEnv = {

        val (actorSystem, boundPort) = AkkaUtils.createActorSystem("spark", hostname, port)

        val classLoader = Thread.currentThread.getContextClassLoader

        // Create an instance of the class named by the given Java system property, or by
        // defaultClassName if the property is not set, and return it as a T
        def instantiateClass[T](propertyName: String, defaultClassName: String): T = {
          val name = System.getProperty(propertyName, defaultClassName)
          Class.forName(name, true, classLoader).newInstance().asInstanceOf[T]
        }

        val serializerManager = new SerializerManager

        val serializer = serializerManager.setDefault(
          System.getProperty("spark.serializer", "org.apache.spark.serializer.JavaSerializer"))

        val closureSerializer = serializerManager.get(
          System.getProperty("spark.closure.serializer", "org.apache.spark.serializer.JavaSerializer"))

        // BlockManager. It must be created before connectionManager and cacheManager below,
        // because both are derived from it (a local val cannot be referenced before its
        // definition).
        // NOTE(review): registerOrLookup is not defined in this excerpt. Per the original
        // author's note, it creates the actor only on the master, while a slave merely obtains
        // a reference to it -- confirm against the full source.
        val blockManagerMaster = new BlockManagerMaster(registerOrLookup(
          "BlockManagerMaster",
          new BlockManagerMasterActor(isLocal)))
        val blockManager = new BlockManager(executorId, actorSystem, blockManagerMaster, serializer)

        val connectionManager = blockManager.connectionManager

        val broadcastManager = new BroadcastManager(isDriver)

        val cacheManager = new CacheManager(blockManager)

        // MapOutputTracker. The tracker actor is assigned after construction because
        // MapOutputTrackerActor takes the tracker itself as an argument. As above, the
        // actor is reportedly created only on the master.
        val mapOutputTracker = new MapOutputTracker()
        mapOutputTracker.trackerActor = registerOrLookup(
          "MapOutputTracker",
          new MapOutputTrackerActor(mapOutputTracker))

        // ShuffleFetcher implementation is pluggable via the spark.shuffle.fetcher property.
        val shuffleFetcher = instantiateClass[ShuffleFetcher](
          "spark.shuffle.fetcher", "org.apache.spark.BlockStoreShuffleFetcher")

        // Start the HTTP file server and publish its URI as a system property.
        val httpFileServer = new HttpFileServer()
        httpFileServer.initialize()
        System.setProperty("spark.fileserver.uri", httpFileServer.serverUri)

        val metricsInstance = if (isDriver) "driver" else "executor"
        val metricsSystem = MetricsSystem.createMetricsSystem(metricsInstance)
        metricsSystem.start()

        // NOTE(review): sparkFilesDir is not defined anywhere in this excerpt -- presumably
        // elided from the original source; it must be supplied before this compiles.
        new SparkEnv(
          executorId,
          actorSystem,
          serializerManager,
          serializer,
          closureSerializer,
          cacheManager,
          mapOutputTracker,
          shuffleFetcher,
          broadcastManager,
          blockManager,
          connectionManager,
          httpFileServer,
          sparkFilesDir,
          metricsSystem)
      }
    }
  • 相关阅读:
    模拟算法(八)
    迭代算法(七)
    试探法是一种委婉的做法(六)
    贪心算法并不贪婪(五)
    各个击破的分治算法(四)
    充分利用自己的递归算法(三)
    一起学Spring之Web基础篇
    C# 利用AForge进行摄像头信息采集
    一起学Spring之注解和Schema方式实现AOP
    一起学Spring之AOP
  • 原文地址:https://www.cnblogs.com/fxjwind/p/3517051.html
Copyright © 2020-2023  润新知