Worker类源码位置: org.apache.spark.deploy.worker
/**
 * Source walkthrough: how the Worker handles a LaunchDriver message.
 */
case LaunchDriver(driverId, driverDesc) =>
  logInfo(s"Asked to launch driver $driverId")

  // Pass the driver command through Worker.maybeUpdateSSLSettings before handing the
  // description to the runner.
  val effectiveDesc =
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf))

  // Create the DriverRunner; it runs the driver on its own thread (see listing 代码1).
  val runner = new DriverRunner(
    conf,
    driverId,
    workDir,
    sparkHome,
    effectiveDesc,
    self,
    workerUri,
    securityMgr)

  // Register the runner in the `drivers` map, keyed by driver id.
  drivers(driverId) = runner

  // Start the driver; details in listing 代码1.
  runner.start()

  // Account for the cores and memory this driver was described as needing.
  coresUsed += driverDesc.cores
  memoryUsed += driverDesc.mem
代码1
/**
 * Starts a thread to run and manage the driver.
 *
 * The thread registers a shutdown hook that kills the driver, runs
 * `prepareAndRunDriver()` (see listing 代码2), records the final driver state and
 * finally sends a `DriverStateChanged` message to the owning worker (see listing 代码3).
 */
private[worker] def start(): Unit = {
  // All of the driver management happens on a dedicated Java thread.
  new Thread("DriverRunner for " + driverId) {
    override def run(): Unit = {
      var shutdownHook: AnyRef = null
      try {
        shutdownHook = ShutdownHookManager.addShutdownHook { () =>
          logInfo(s"Worker shutting down, killing driver $driverId")
          kill()
        }

        // Prepare driver jars and run the driver:
        //   1. create the DriverRunner working directory,
        //   2. download the user's application jar,
        //   3. build the ProcessBuilder and run the driver process.
        val exitCode = prepareAndRunDriver()

        // Set the final state depending on the process exit code and on whether the
        // driver was forcibly killed.
        finalState = Some(
          if (exitCode == 0) {
            DriverState.FINISHED
          } else if (killed) {
            DriverState.KILLED
          } else {
            DriverState.FAILED
          })
      } catch {
        case e: Exception =>
          kill()
          finalState = Some(DriverState.ERROR)
          finalException = Some(e)
      } finally {
        // The hook is only needed while the driver may still be running.
        if (shutdownHook != null) {
          ShutdownHookManager.removeShutdownHook(shutdownHook)
        }
      }

      // Notify the worker of the final driver state and the possible exception.
      // finalState is always defined here: both the try and the catch branch set it, and
      // any other Throwable propagates before this line is reached.
      worker.send(DriverStateChanged(driverId, finalState.get, finalException))
    }
  }.start()
}
代码2
/**
 * Prepares the driver's working directory and user jar, builds the launch command and
 * runs the driver.
 *
 * @return the value returned by `runDriver`; the caller treats it as the driver's exit code
 */
private[worker] def prepareAndRunDriver(): Int = {
  // Step 1: create the DriverRunner working directory.
  val workingDir = createWorkingDirectory()
  // Step 2: download the jar the user submitted.
  val userJarPath = downloadUserJar(workingDir)

  // Replaces the two supported placeholders in a command argument; anything else is
  // passed through unchanged.
  def substituteVariables(arg: String): String = arg match {
    case "{{WORKER_URL}}" => workerUrl
    case "{{USER_JAR}}" => userJarPath
    case unchanged => unchanged
  }

  // TODO: If we add ability to submit multiple jars they should also be added here

  // Step 3: build the ProcessBuilder from the driver's launch command, its memory
  // requirement and the Spark home directory.
  val processBuilder = CommandUtils.buildProcessBuilder(
    driverDesc.command,
    securityManager,
    driverDesc.mem,
    sparkHome.getAbsolutePath,
    substituteVariables)

  runDriver(processBuilder, workingDir, driverDesc.supervise)
}
代码3
// Once the driver has finished, the DriverRunner thread reports its state to the Worker.
// The Worker's handler (see listing 代码4) then forwards the DriverStateChanged message
// to the Master.
case stateChange: DriverStateChanged =>
  handleDriverStateChanged(stateChange)
代码4
/**
 * Handles a driver's state change: logs it, forwards it to the master, and releases
 * the bookkeeping and resources held for that driver.
 *
 * @param driverStateChanged the message sent by the DriverRunner thread
 */
private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = {
  val driverId = driverStateChanged.driverId
  val exception = driverStateChanged.exception
  val state = driverStateChanged.state
  state match {
    case DriverState.ERROR =>
      // Do not assume the exception is present: a missing one must not prevent the
      // state change from being forwarded to the master below.
      val cause = exception.map(_.toString).getOrElse("<no exception reported>")
      logWarning(s"Driver $driverId failed with unrecoverable exception: $cause")
    case DriverState.FAILED =>
      logWarning(s"Driver $driverId exited with failure")
    case DriverState.FINISHED =>
      logInfo(s"Driver $driverId exited successfully")
    case DriverState.KILLED =>
      logInfo(s"Driver $driverId was killed by user")
    case _ =>
      logDebug(s"Driver $driverId changed state to $state")
  }

  // Forward the DriverStateChanged message to the master so it can update its own state.
  sendToMaster(driverStateChanged)

  // Remove the driver from the local `drivers` map. An unknown id is logged instead of
  // throwing, and no resources are released for it.
  drivers.remove(driverId) match {
    case Some(driver) =>
      // Remember the driver among the finished ones.
      finishedDrivers(driverId) = driver
      trimFinishedDriversIfNecessary()

      // Release the memory and cores that were accounted to the driver.
      memoryUsed -= driver.driverDesc.mem
      coresUsed -= driver.driverDesc.cores
    case None =>
      logWarning(s"Driver $driverId changed state to $state but is not tracked by this worker")
  }
}

/**
 * Source walkthrough: how the Worker handles a LaunchExecutor message.
 */
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl != activeMasterUrl) {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  } else {
    // Key under which the executor is tracked in `executors`; also used as the relative
    // path of its working directory.
    val executorKey = appId + "/" + execId
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory
      val executorDir = new File(workDir, executorKey)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
      // application finishes.
      val appLocalDirs = appDirectories.getOrElse(appId,
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          val appDir = Utils.createDirectory(dir, namePrefix = "executor")
          Utils.chmod700(appDir)
          appDir.getAbsolutePath()
        }.toSeq)
      appDirectories(appId) = appLocalDirs

      // Create the ExecutorRunner.
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        workerUri,
        conf,
        appLocalDirs, ExecutorState.RUNNING)

      // Track the runner locally.
      executors(executorKey) = manager

      // Start the ExecutorRunner; details in listing 代码5.
      manager.start()

      // Account for the cores and memory the executor will use.
      coresUsed += cores_
      memoryUsed += memory_

      // Report the executor's state to the master so it can update its own view.
      sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
    } catch {
      case e: Exception =>
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // If the runner was already registered, kill it and stop tracking it.
        executors.get(executorKey).foreach { runner =>
          runner.kill()
          executors -= executorKey
        }
        sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None))
    }
  }
代码5
/**
 * Starts the thread that runs the executor (see listing 代码6) and registers a
 * shutdown hook that kills the executor process.
 */
private[worker] def start(): Unit = {
  // Run fetchAndRunExecutor() on a dedicated Java thread.
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run(): Unit = fetchAndRunExecutor()
  }
  workerThread.start()

  // Shutdown hook that calls killProcess for this executor.
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
    if (state == ExecutorState.RUNNING) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down"))
  }
}
代码6
/**
 * Download and run the executor described in our ApplicationDescription.
 *
 * Builds the launch command, starts the process, redirects its stdout/stderr to files in
 * the executor's working directory, waits for it to exit and reports the result to the
 * worker. An interrupt marks the executor KILLED; any other exception marks it FAILED.
 */
private def fetchAndRunExecutor(): Unit = {
  try {
    // Launch the process: wrap the executor command in a ProcessBuilder.
    val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
      memory, sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    // Wrap every argument in double quotes and separate them with a space:
    // "arg1" "arg2" "arg3"
    val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $formattedCommand")

    builder.directory(executorDir)
    builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls
    val baseUrl =
      if (conf.getBoolean("spark.ui.reverseProxy", false)) {
        s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
      } else {
        s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
      }
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    // This is the call that actually starts the executor process.
    process = builder.start()

    // Header written at the top of the stderr file. The command and the separator rule
    // each get their own line, followed by a blank line, so the header does not run
    // into the executor's own output.
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      formattedCommand, "=" * 40)

    // Redirect its stdout and stderr to files in the executor's working directory.
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()

    // The process has exited: record the state and the exit code.
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode

    // Tell the worker that owns this runner about the state change (see listing 代码7).
    worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode)))
  } catch {
    case interrupted: InterruptedException =>
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
  }
}
代码7
// The Worker receives the ExecutorStateChanged message and hands it to
// handleExecutorStateChanged (see listing 代码8), which forwards it to the master.
case stateChange: ExecutorStateChanged =>
  handleExecutorStateChanged(stateChange)
代码8
/**
 * Handles an executor's state change: forwards it to the master and, when the state is a
 * finished one, stops tracking the executor and releases its cores and memory.
 *
 * @param executorStateChanged the message describing the executor's new state
 */
private[worker] def handleExecutorStateChanged(
    executorStateChanged: ExecutorStateChanged): Unit = {
  // Always pass the message on to the master first.
  sendToMaster(executorStateChanged)

  val state = executorStateChanged.state
  // Local bookkeeping only changes once the executor has reached a finished state.
  if (ExecutorState.isFinished(state)) {
    val appId = executorStateChanged.appId
    val fullId = s"$appId/${executorStateChanged.execId}"

    // Shared tail of both log lines below; a `def` so it is only built when logged.
    def outcome: String =
      s" finished with state $state" +
        executorStateChanged.message.fold("")(" message " + _) +
        executorStateChanged.exitStatus.fold("")(" exitStatus " + _)

    executors.get(fullId) match {
      case Some(runner) =>
        logInfo(s"Executor $fullId$outcome")

        // Move the executor from the active map to the finished one.
        executors -= fullId
        finishedExecutors(fullId) = runner
        trimFinishedExecutorsIfNecessary()

        // Release the cores and memory accounted to the executor.
        coresUsed -= runner.cores
        memoryUsed -= runner.memory
      case None =>
        logInfo(s"Unknown Executor $fullId$outcome")
    }
    maybeCleanupApplication(appId)
  }
}