• spark源码(三)Master receive方法


        Master receive全部方法
            一.case ElectedLeader => ......
            二.case CompleteRecovery => ......
            三.case RevokedLeadership => ......
            四.case WorkerDecommissioning => ......
            五.case DecommissionWorkers => ......
            六.case RegisterWorker => ......
            七.case RegisterApplication => ......
            八.case ExecutorStateChanged => ......
            九.case DriverStateChanged => ......
            十.case Heartbeat => ......
            十一.case MasterChangeAcknowledged => ......
            十二.case WorkerSchedulerStateResponse => ......
            十三.case WorkerLatestState => ......
            十四.case UnregisterApplication => ......
            十五.case CheckForWorkerTimeOut => ......

    三.RevokedLeadership 详解

        logError("Leadership has been revoked -- master shutting down.")
        System.exit(0)  // Leadership was revoked, so this master process simply exits with status 0.
                        // Author's note: a plain exit is regarded here as the cleanest way to shut down.

    四.WorkerDecommissioning 详解 单个worker节点下线(decommission)

        if (state == RecoveryState.STANDBY) {//STANDBY就是备份的节点
            workerRef.send(MasterInStandby)
        } else {
            idToWorker.get(id).foreach(decommissionWorker)
        }
    4.1 decommissionWorker 详解
        private def decommissionWorker(worker: WorkerInfo): Unit = {
            if (worker.state != WorkerState.DECOMMISSIONED) {
              logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port))
              worker.setState(WorkerState.DECOMMISSIONED)
              for (exec <- worker.executors.values) {
                logInfo("Telling app of decommission executors")
                exec.application.driver.send(ExecutorUpdated(//在别的节点上新起executor 
                  exec.id, ExecutorState.DECOMMISSIONED,
                  Some("worker decommissioned"), None,
                  Some(worker.host)))
                exec.state = ExecutorState.DECOMMISSIONED
                exec.application.removeExecutor(exec)//worker上所有executor去除
              }
              persistenceEngine.removeWorker(worker)
            } else {
              logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned".
                format(worker.id, worker.host, worker.port))
            }
        }

    五.DecommissionWorkers 详解 批量下线 ids 指定的一组worker节点

        assert(state != RecoveryState.STANDBY)
        ids.foreach ( id =>
            idToWorker.get(id).foreach { w =>
              decommissionWorker(w)
              w.endpoint.send(DecommissionWorker)
            }
        )

    六.registerWorker 详解 注册节点

        private def registerWorker(worker: WorkerInfo): Boolean = {
            workers.filter { w =>
              (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
            }.foreach { w =>
              workers -= w
            }//当前节点是死节点  就去掉当前节点
            val workerAddress = worker.endpoint.address
            if (addressToWorker.contains(workerAddress)) {//如果当前节点是已经注册过 
                val oldWorker = addressToWorker(workerAddress)
                if (oldWorker.state == WorkerState.UNKNOWN) {
                    removeWorker(oldWorker, "Worker replaced by a new worker with same address")
                } else {
                    logInfo("Attempted to re-register worker at same address: " + workerAddress)
                    return false
                }
            }
            workers += worker//真正注册节点的代码
            idToWorker(worker.id) = worker//真正注册节点的代码
            addressToWorker(workerAddress) = worker//真正注册节点的代码
            true
        }

    七.RegisterApplication 详解

        if (state == RecoveryState.STANDBY) {
            // ignore, don't send response
        } else {
            logInfo("Registering app " + description.name)
            val app = createApplication(description, driver)//只是new 了一个ApplicationInfo对象
            registerApplication(app)/*注册app相关信息  资源相关的处理*/
            logInfo("Registered app " + description.name + " with ID " + app.id)
            persistenceEngine.addApplication(app)
            driver.send(RegisteredApplication(app.id, self))/*返回一个注册成功的消息*/
            schedule()
        }

    八.ExecutorStateChanged 详解

        // Handler body for an executor state change. `state` below is the executor's new
        // state taken from the message fields on the next line.
        (appId, execId, state, message, exitStatus) =>
        // Look up the executor through its application; either lookup may come back empty.
        val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId))
        execOption match {
          case Some(exec) =>
            val appInfo = idToApp(appId)
            val oldState = exec.state
            exec.state = state
            if (state == ExecutorState.RUNNING) {// on entering RUNNING: the previous state must be LAUNCHING, then the app's retry count is reset
              assert(oldState == ExecutorState.LAUNCHING,
                s"executor $execId state transfer from $oldState to RUNNING is illegal")
              appInfo.resetRetryCount()
            }
            // Forward the state change to the application's driver.
            exec.application.driver.send(ExecutorUpdated(execId, state, message, exitStatus, None))
            if (ExecutorState.isFinished(state)) {// the executor reached a finished state: remove it
              logInfo(s"Removing executor ${exec.fullId} because it is $state")
              if (!appInfo.isFinished) {
                appInfo.removeExecutor(exec)// detach the executor from the application (author's note: this only releases bookkeeping)
              }
              exec.worker.removeExecutor(exec)
              // Only an exit status of exactly 0 counts as a normal exit.
              val normalExit = exitStatus == Some(0)
              // incrementRetryCount() is evaluated (and so the count is bumped) for every abnormal
              // exit of an executor that was not DECOMMISSIONED; a negative maxExecutorRetries
              // makes the whole condition false, so the application is never removed here.
              if (!normalExit
                  && oldState != ExecutorState.DECOMMISSIONED
                  && appInfo.incrementRetryCount() >= maxExecutorRetries
                  && maxExecutorRetries >= 0) { // finished because of failures and the retry limit was reached
                val execs = appInfo.executors.values
                // Give up on the application only if none of its executors is still RUNNING.
                if (!execs.exists(_.state == ExecutorState.RUNNING)) {
                  logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
                    s"${appInfo.retryCount} times; removing it")
                  removeApplication(appInfo, ApplicationState.FAILED)
                }
              }
            }
            schedule()
          case None =>
            logWarning(s"Got status update for unknown executor $appId/$execId")
        }
    8.1 removeApplication 详解
        def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = {
            if (apps.contains(app)) {
              logInfo("Removing app " + app.id)
              apps -= app
              idToApp -= app.id
              endpointToApp -= app.driver
              addressToApp -= app.driver.address
              if (completedApps.size >= retainedApplications) {
                val toRemove = math.max(retainedApplications / 101)
                completedApps.take(toRemove).foreach { a =>
                  applicationMetricsSystem.removeSource(a.appSource)
                }
                completedApps.trimStart(toRemove)
              }
              completedApps += app 
              waitingApps -= app
              for (exec <- app.executors.values) {
                killExecutor(exec)
              }
              app.markFinished(state)
              if (state != ApplicationState.FINISHED) {
                app.driver.send(ApplicationRemoved(state.toString))//这其实已经把driver相关的移除了
              }
              persistenceEngine.removeApplication(app)
              schedule()
              workers.foreach { w =>
                w.endpoint.send(ApplicationFinished(app.id))//上面执行了一遍KillExecutor 这又执行一遍 ApplicationFinished 
              }
            }
        }
    8.1.1 killExecutor 详解
        private def killExecutor(exec: ExecutorDesc): Unit = {
            exec.worker.removeExecutor(exec)
            exec.worker.endpoint.send(KillExecutor(masterUrl, exec.application.id, exec.id))//这才是重点代码啊
            exec.state = ExecutorState.KILLED
        }
  • 相关阅读:
    Windows服务BAT命令-安装、卸载、启动、停止
    身份认证
    密钥管理概述
    快速上手几个Linux命令
    递归
    数字签名的实现方案
    数字签名
    密码学基础
    你可以把Linux内核当成一家软件外包公司的老板
    数学归纳法
  • 原文地址:https://www.cnblogs.com/wuxiaolong4/p/16685501.html
Copyright © 2020-2023  润新知