• Spark 2.1 Source Code Analysis 2: Tracing the Execution of a Job Starting from SparkPi


    Starting from one of SparkPi's action operations, choose Run–Debug SparkPi to step through the code:
    F8: Step Over
    F7: Step Into
    Right-click: Run to Cursor
    Ctrl+B: go to definition
    Navigate: Back and Forward

    SparkPi:
    
    val count = spark.sparkContext.parallelize(1 until n, slices).map { i =>
          val x = random * 2 - 1
          val y = random * 2 - 1
          if (x*x + y*y < 1) 1 else 0
    }.reduce(_ + _)
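
    To step through the same job without the full example project, the computation can be reproduced with a small standalone program. This is a minimal sketch, assuming a local master and the names n and slices; it is not the exact SparkPi source:

    import org.apache.spark.sql.SparkSession
    import scala.math.random

    object MiniPi {
      def main(args: Array[String]): Unit = {
        // Local master so the job can be stepped through in the IDE (assumption, not part of the original post)
        val spark = SparkSession.builder.appName("MiniPi").master("local[2]").getOrCreate()
        val slices = 2
        val n = math.min(100000L * slices, Int.MaxValue).toInt
        val count = spark.sparkContext.parallelize(1 until n, slices).map { _ =>
          val x = random * 2 - 1
          val y = random * 2 - 1
          if (x * x + y * y < 1) 1 else 0
        }.reduce(_ + _)   // the action that triggers the job analysed below
        println(s"Pi is roughly ${4.0 * count / (n - 1)}")
        spark.stop()
      }
    }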
    
    RDD:
    /**
       * Reduces the elements of this RDD using the specified commutative and
       * associative binary operator.
       */
      def reduce(f: (T, T) => T): T = withScope {
        val cleanF = sc.clean(f)
    //    Apply the cleaned function to a single partition
        val reducePartition: Iterator[T] => Option[T] = iter => {
          if (iter.hasNext) {
            Some(iter.reduceLeft(cleanF))
          } else {
            None
          }
        }
        var jobResult: Option[T] = None
    //    Merge the per-partition results on the driver
        val mergeResult = (index: Int, taskResult: Option[T]) => {
          if (taskResult.isDefined) {
            jobResult = jobResult match {
              case Some(value) => Some(f(value, taskResult.get))
              case None => taskResult
            }
          }
        }
    sc.runJob(this, reducePartition, mergeResult)
        // Get the final result out of our Option, or throw an exception if the RDD was empty
        jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
      }
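
    The two closures above can be exercised without Spark at all. The sketch below simulates reducePartition and mergeResult over an in-memory Seq of "partitions"; it only illustrates the merging logic, not the real task scheduling:

    object ReduceSimulation {
      def main(args: Array[String]): Unit = {
        val f: (Int, Int) => Int = _ + _
        // Per-partition reduction, mirroring reducePartition in RDD.reduce
        val reducePartition: Iterator[Int] => Option[Int] = iter =>
          if (iter.hasNext) Some(iter.reduceLeft(f)) else None

        var jobResult: Option[Int] = None
        // Driver-side merge, mirroring mergeResult; note that it captures jobResult as a closure
        val mergeResult = (index: Int, taskResult: Option[Int]) => {
          if (taskResult.isDefined) {
            jobResult = jobResult match {
              case Some(value) => Some(f(value, taskResult.get))
              case None => taskResult
            }
          }
        }

        // Three fake "partitions", one of them empty
        val partitions = Seq(Seq(1, 2, 3), Seq.empty[Int], Seq(4, 5))
        partitions.zipWithIndex.foreach { case (p, i) => mergeResult(i, reducePartition(p.iterator)) }
        // Prints 15, i.e. 1+2+3+4+5
        println(jobResult.getOrElse(throw new UnsupportedOperationException("empty collection")))
      }
    }
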
    SparkContext:
      /**
       * Run a job on all partitions in an RDD and pass the results to a handler function.
       */
      def runJob[T, U: ClassTag](
          rdd: RDD[T],
          processPartition: Iterator[T] => U,
          resultHandler: (Int, U) => Unit)
      {
    // Wrap the per-partition processing function so that it also accepts a TaskContext
        val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter)
    runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler)
      }
    
    SparkContext:
     /**
       * Run a function on a given set of partitions in an RDD and pass the results to the given
       * handler function. This is the main entry point for all actions in Spark.
       */
      def runJob[T, U: ClassTag](
          rdd: RDD[T],
          func: (TaskContext, Iterator[T]) => U,
          partitions: Seq[Int],
          resultHandler: (Int, U) => Unit): Unit = {
    // Abort if the driver has already called sc.stop()
        if (stopped.get()) {
          throw new IllegalStateException("SparkContext has been shutdown")
        }
        val callSite = getCallSite
        val cleanedFunc = clean(func)
        logInfo("Starting job: " + callSite.shortForm)
        if (conf.getBoolean("spark.logLineage", false)) {
          logInfo("RDD's recursive dependencies:
    " + rdd.toDebugString)
        }
    // cleanedFunc: the function run on each partition
    // partitions: the partition ids to compute
    // resultHandler: callback invoked with each partition's result
    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
        progressBar.foreach(_.finishAll())
    // Note that checkpointing, if requested, is performed here
        rdd.doCheckpoint()
      }
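
    The same entry point can also be called directly from user code to run a job on only some partitions, which is how actions such as first() avoid touching the whole RDD (see the scaladoc below). A minimal sketch, assuming an existing SparkContext named sc:

    // Run a function on just the first partition, using the public runJob overload
    // that takes an explicit list of partition ids (assumes an existing SparkContext `sc`)
    val rdd = sc.parallelize(1 to 100, 4)
    val sums: Array[Int] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.sum, Seq(0))
    println(sums.mkString(","))   // sum of the elements that landed in partition 0 only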
    
    DAGScheduler:
     /**
       * Run an action job on the given RDD and pass all the results to the resultHandler function as
       * they arrive.
       *
       * @param rdd target RDD to run tasks on
       * @param func a function to run on each partition of the RDD
       * @param partitions set of partitions to run on; some jobs may not want to compute on all
       *   partitions of the target RDD, e.g. for operations like first()
       * @param callSite where in the user program this job was called
       * @param resultHandler callback to pass each result to
       * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name
       *
       * @throws Exception when the job fails
       */
      def runJob[T, U](
          rdd: RDD[T],
          func: (TaskContext, Iterator[T]) => U,
          partitions: Seq[Int],
          callSite: CallSite,
          resultHandler: (Int, U) => Unit,
          properties: Properties): Unit = {
        val start = System.nanoTime
    val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
        // Note: Do not call Await.ready(future) because that calls `scala.concurrent.blocking`,
        // which causes concurrent SQL executions to fail if a fork-join pool is used. Note that
        // due to idiosyncrasies in Scala, `awaitPermission` is not actually used anywhere so it's
        // safe to pass in null here. For more detail, see SPARK-13747.
        val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
        waiter.completionFuture.ready(Duration.Inf)(awaitPermission)
        waiter.completionFuture.value.get match {
          case scala.util.Success(_) =>
            logInfo("Job %d finished: %s, took %f s".format
              (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
          case scala.util.Failure(exception) =>
            logInfo("Job %d failed: %s, took %f s".format
              (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
            // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
            val callerStackTrace = Thread.currentThread().getStackTrace.tail
            exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
            throw exception
        }
      }
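
    The waiting logic above boils down to "block on a future that the scheduler completes". A simplified sketch of the same pattern with a plain Promise (the real code avoids Await.ready for the SPARK-13747 reason noted in the comment, so this is only an illustration):

    import scala.concurrent.{Await, Promise}
    import scala.concurrent.duration.Duration

    // Simplified stand-in for JobWaiter.completionFuture: the scheduler side completes
    // the promise when the last task result has been handled, the caller blocks on the future.
    val jobPromise = Promise[Unit]()

    // ... on the scheduler side, once all tasks have finished:
    jobPromise.success(())

    // ... on the calling side, equivalent to waiter.completionFuture.ready(Duration.Inf):
    Await.ready(jobPromise.future, Duration.Inf)
    jobPromise.future.value.get match {
      case scala.util.Success(_)  => println("job finished")
      case scala.util.Failure(ex) => throw ex
    }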
    
    DAGScheduler:
      /**
       * Submit an action job to the scheduler.
       *
       * @param rdd target RDD to run tasks on
       * @param func a function to run on each partition of the RDD
       * @param partitions set of partitions to run on; some jobs may not want to compute on all
       *   partitions of the target RDD, e.g. for operations like first()
       * @param callSite where in the user program this job was called
       * @param resultHandler callback to pass each result to
       * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name
       *
       * @return a JobWaiter object that can be used to block until the job finishes executing
       *         or can be used to cancel the job.
       *
       * @throws IllegalArgumentException when partitions ids are illegal
       */
      def submitJob[T, U](
          rdd: RDD[T],
          func: (TaskContext, Iterator[T]) => U,
          partitions: Seq[Int],
          callSite: CallSite,
          resultHandler: (Int, U) => Unit,
          properties: Properties): JobWaiter[U] = {
        // Check to make sure we are not launching a task on a partition that does not exist.
        val maxPartitions = rdd.partitions.length
        partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
          throw new IllegalArgumentException(
            "Attempting to access a non-existent partition: " + p + ". " +
              "Total number of partitions: " + maxPartitions)
        }
    
        val jobId = nextJobId.getAndIncrement()
        if (partitions.size == 0) {
          // Return immediately if the job is running 0 tasks
          return new JobWaiter[U](this, jobId, 0, resultHandler)
        }
    
        assert(partitions.size > 0)
        val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
    // Wrap resultHandler (the mergeResult closure from reduce) in a JobWaiter
    val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
    // Put the event into the event queue. The event thread will process it later.
        eventProcessLoop.post(JobSubmitted(
          jobId, rdd, func2, partitions.toArray, callSite, waiter,
          SerializationUtils.clone(properties)))
        waiter
      }
    private[scheduler] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this)
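
    submitJob itself does not schedule anything; it only posts a JobSubmitted event to eventProcessLoop, a single background thread that drains a queue and dispatches each event to the DAGScheduler. A stripped-down, hypothetical version of that post/process pattern (not the real DAGSchedulerEventProcessLoop) looks like this:

    import java.util.concurrent.LinkedBlockingDeque

    // Hypothetical mini event loop: callers post events from any thread,
    // one dedicated thread takes them off the queue and handles them in order.
    class MiniEventLoop[E](name: String)(handle: E => Unit) {
      private val queue = new LinkedBlockingDeque[E]()
      private val thread = new Thread(name) {
        override def run(): Unit =
          while (!Thread.currentThread().isInterrupted) {
            handle(queue.take())   // blocks until an event (e.g. JobSubmitted) arrives
          }
      }
      thread.setDaemon(true)
      thread.start()

      // Same role as eventProcessLoop.post(JobSubmitted(...))
      def post(event: E): Unit = queue.put(event)
    }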
    
    Once the job has run, control returns to DAGScheduler.runJob, which logs whether it succeeded or failed.
    By that point the JobWaiter has already invoked mergeResult for every finished task; because mergeResult
    is a closure that captures the local jobResult variable defined in RDD.reduce, the merged result is
    already visible there. Stepping back out to RDD.reduce, jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
    then returns the final jobResult.
    
    
    JobWaiter:
    /**
     * An object that waits for a DAGScheduler job to complete. As tasks finish, it passes their
     * results to the given handler function.
     */
    JobWaiter waits asynchronously for the job to complete; as each task finishes it invokes the mergeResult
    handler passed down from reduce, so the per-partition results are folded together into the final value.
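
    A hypothetical, minimal JobWaiter-like class shows the essentials: call resultHandler once per finished task, and complete a future when the last task has reported back (an illustration only, not the real JobWaiter implementation):

    import java.util.concurrent.atomic.AtomicInteger
    import scala.concurrent.{Future, Promise}

    class MiniJobWaiter[T](totalTasks: Int, resultHandler: (Int, T) => Unit) {
      private val finishedTasks = new AtomicInteger(0)
      private val promise = Promise[Unit]()
      def completionFuture: Future[Unit] = promise.future

      // Called by the scheduler as each task finishes
      def taskSucceeded(index: Int, result: T): Unit = synchronized {
        resultHandler(index, result)   // e.g. mergeResult folding the task result into jobResult
        if (finishedTasks.incrementAndGet() == totalTasks) promise.success(())
      }

      // Called if the job fails; the caller blocked on completionFuture will rethrow
      def jobFailed(e: Exception): Unit = promise.tryFailure(e)
    }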
    