Spark Application Launch Mechanism Explained (Part 1)
- bin/spark-submit
# Set SPARK_HOME
if [ -z "${SPARK_HOME}" ]; then
export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
fi
# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0
# Invoke the main method of org.apache.spark.deploy.SparkSubmit
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
- org.apache.spark.deploy.SparkSubmit
def main(args: Array[String]): Unit = {
val appArgs = new SparkSubmitArguments(args)
if (appArgs.verbose) {
// scalastyle:off println
printStream.println(appArgs)
// scalastyle:on println
}
appArgs.action match {
//After the arguments are parsed, call submit(appArgs)
case SparkSubmitAction.SUBMIT => submit(appArgs)
case SparkSubmitAction.KILL => kill(appArgs)
case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
}
}
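- SUBMIT is the default action; KILL and REQUEST_STATUS are selected by the --kill and --status flags (these flags apply to standalone and Mesos cluster deployments; the submission id and master URL below are placeholders):
# maps to SparkSubmitAction.KILL
./bin/spark-submit --master spark://host:7077 --kill driver-20160101000000-0000
# maps to SparkSubmitAction.REQUEST_STATUS
./bin/spark-submit --master spark://host:7077 --status driver-20160101000000-0000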
- org.apache.spark.deploy.SparkSubmit#submit
private def submit(args: SparkSubmitArguments): Unit = {
//First parse the arguments to get the child args, classpath, system properties, and main class
val (childArgs, childClasspath, sysProps, childMainClass) = prepareSubmitEnvironment(args)
def doRunMain(): Unit = {
if (args.proxyUser != null) {
//..... simplified: runMain is invoked as the proxy user
runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
} else {
//If no proxy user is set, runMain is called directly
runMain(childArgs, childClasspath, sysProps, childMainClass, args.verbose)
}
}
// ... simplified: doRunMain() is eventually invoked, then submit returns
}
- org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment
This method is the core of the whole launch process. It has a lot of if branches and is somewhat messy; its Scaladoc describes the return value:
/**
* Prepare the environment for submitting an application.
* This returns a 4-tuple:
* (1) the arguments for the child process,
* (2) a list of classpath entries for the child,
* (3) a map of system properties, and
* (4) the main class for the child
* Exposed for testing.
*/
1. The key idea is to return the arguments the child process needs, the classpath for the child, the system properties, and the mainClass.
2. Here we only trace the yarn-client and yarn-cluster paths; the Mesos and standalone cases are not analyzed for now.
// Set the cluster manager
val clusterManager: Int = args.master match {
case m if m.startsWith("yarn") => YARN
case m if m.startsWith("spark") => STANDALONE
case m if m.startsWith("mesos") => MESOS
case m if m.startsWith("local") => LOCAL
case _ => printErrorAndExit("Master must start with yarn, spark, mesos, or local"); -1
}
// Set the deploy mode; default is client mode
var deployMode: Int = args.deployMode match {
case "client" | null => CLIENT
case "cluster" => CLUSTER
case _ => printErrorAndExit("Deploy mode must be either client or cluster"); -1
}
- The code above determines the cluster manager and the deploy mode. There are four cluster managers: YARN, STANDALONE, MESOS, and LOCAL; the deploy mode is either CLIENT or CLUSTER.
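- For reference, these are the kinds of --master values that land in each branch (the hosts and ports are placeholders):
--master local[4]            # LOCAL
--master spark://host:7077   # STANDALONE
--master mesos://host:5050   # MESOS
--master yarn                # YARN; combined with --deploy-mode client|cluster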
// In client mode, launch the application main class directly
// In addition, add the main application jar and any added jars (if any) to the classpath
if (deployMode == CLIENT) {
childMainClass = args.mainClass
if (isUserJar(args.primaryResource)) {
childClasspath += args.primaryResource
}
if (args.jars != null) { childClasspath ++= args.jars.split(",") }
if (args.childArgs != null) { childArgs ++= args.childArgs }
}
- In CLIENT mode, the application's own mainClass is used directly. It is either specified with --class or read from the jar manifest:
mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")
The application jar, plus anything passed via --jars, is also added to the classpath.
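- A quick way to check which Main-Class spark-submit would pick up when --class is omitted (the jar path is a placeholder):
unzip -p /path/to/my-app.jar META-INF/MANIFEST.MF | grep Main-Class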
// In yarn-cluster mode, use yarn.Client as a wrapper around the user class
if (isYarnCluster) {
childMainClass = "org.apache.spark.deploy.yarn.Client"
if (args.isPython) {
childArgs += ("--primary-py-file", args.primaryResource)
if (args.pyFiles != null) {
childArgs += ("--py-files", args.pyFiles)
}
childArgs += ("--class", "org.apache.spark.deploy.PythonRunner")
} else if (args.isR) {
val mainFile = new Path(args.primaryResource).getName
childArgs += ("--primary-r-file", mainFile)
childArgs += ("--class", "org.apache.spark.deploy.RRunner")
} else {
if (args.primaryResource != SPARK_INTERNAL) {
childArgs += ("--jar", args.primaryResource)
}
childArgs += ("--class", args.mainClass)
}
if (args.childArgs != null) {
args.childArgs.foreach { arg => childArgs += ("--arg", arg) }
}
}
- In CLUSTER mode, the mainClass is set to org.apache.spark.deploy.yarn.Client, which wraps the application's own mainClass. The application's mainClass is passed into the main method of org.apache.spark.deploy.yarn.Client via the --class argument, and inside Client this argument is what distinguishes CLUSTER from CLIENT: def isClusterMode: Boolean = userClass != null
- Why does the Client class still need to distinguish CLUSTER from CLIENT? That will be explained in detail later.
- Now that we have the mainClass and the classpath, the next step is to run them.
/**
* Run the main method of the child class using the provided launch environment.
*
* Note that this main class will not be the one provided by the user if we're
* running cluster deploy mode or python applications.
*/
private def runMain(
childArgs: Seq[String],
childClasspath: Seq[String],
sysProps: Map[String, String],
childMainClass: String,
verbose: Boolean): Unit = {
// scalastyle:off println
if (verbose) {
printStream.println(s"Main class:\n$childMainClass")
printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
printStream.println(s"System properties:\n${sysProps.mkString("\n")}")
printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
printStream.println("\n")
}
// scalastyle:on println
//Build a custom classloader; look up how Java classloader delegation works for the details.
val loader =
if (sysProps.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) {
new ChildFirstURLClassLoader(new Array[URL](0),
Thread.currentThread.getContextClassLoader)
} else {
new MutableURLClassLoader(new Array[URL](0),
Thread.currentThread.getContextClassLoader)
}
//Install the new classloader on the current thread
Thread.currentThread.setContextClassLoader(loader)
//Add the prepared classpath entries to this classloader so the application's mainClass can be found.
for (jar <- childClasspath) {
addJarToClasspath(jar, loader)
}
//Set the system properties
for ((key, value) <- sysProps) {
System.setProperty(key, value)
}
var mainClass: Class[_] = null
try {
//Load the mainClass
mainClass = Utils.classForName(childMainClass)
} catch {
case e: ClassNotFoundException =>
e.printStackTrace(printStream)
//To use Hive, you need to build Spark with the Hive profiles yourself
if (childMainClass.contains("thriftserver")) {
// scalastyle:off println
printStream.println(s"Failed to load main class $childMainClass.")
printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
// scalastyle:on println
}
System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
case e: NoClassDefFoundError =>
e.printStackTrace(printStream)
if (e.getMessage.contains("org/apache/hadoop/hive")) {
// scalastyle:off println
printStream.println(s"Failed to load hive class.")
printStream.println("You need to build Spark with -Phive and -Phive-thriftserver.")
// scalastyle:on println
}
System.exit(CLASS_NOT_FOUND_EXIT_STATUS)
}
// SPARK-4170
//A Scala app can either define a main method or extend scala.App
if (classOf[scala.App].isAssignableFrom(mainClass)) {
printWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
}
//Look up the main method via reflection
val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
if (!Modifier.isStatic(mainMethod.getModifiers)) {
throw new IllegalStateException("The main method in the given main class must be static")
}
def findCause(t: Throwable): Throwable = t match {
case e: UndeclaredThrowableException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: InvocationTargetException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: Throwable =>
e
}
try {
//Invoke the main method
mainMethod.invoke(null, childArgs.toArray)
} catch {
case t: Throwable =>
findCause(t) match {
case SparkUserAppException(exitCode) =>
System.exit(exitCode)
case t: Throwable =>
throw t
}
}
}
- In CLIENT mode, the application's mainClass now runs. It typically initializes a SparkConf, creates a SparkContext, and builds the DAGScheduler, TaskScheduler, and YarnClientSchedulerBackend. The YarnClientSchedulerBackend then starts org.apache.spark.deploy.yarn.Client:
override def start() {
val driverHost = conf.get("spark.driver.host")
val driverPort = conf.get("spark.driver.port")
val hostport = driverHost + ":" + driverPort
sc.ui.foreach { ui => conf.set("spark.driver.appUIAddress", ui.appUIAddress) }
val argsArrayBuf = new ArrayBuffer[String]()
argsArrayBuf += ("--arg", hostport)
argsArrayBuf ++= getExtraClientArguments
logDebug("ClientArguments called with: " + argsArrayBuf.mkString(" "))
val args = new ClientArguments(argsArrayBuf.toArray, conf)
totalExpectedExecutors = args.numExecutors
client = new Client(args, conf)
appId = client.submitApplication()
// SPARK-8687: Ensure all necessary properties have already been set before
// we initialize our driver scheduler backend, which serves these properties
// to the executors
super.start()
waitForApplication()
// SPARK-8851: In yarn-client mode, the AM still does the credentials refresh. The driver
// reads the credentials from HDFS, just like the executors and updates its own credentials
// cache.
if (conf.contains("spark.yarn.credentials.file")) {
YarnSparkHadoopUtil.get.startExecutorDelegationTokenRenewer(conf)
}
monitorThread = asyncMonitorApplication()
monitorThread.start()
}
- In CLUSTER mode, when runMain runs the child main class, it invokes the main method of org.apache.spark.deploy.yarn.Client:
def main(argStrings: Array[String]) {
if (!sys.props.contains("SPARK_SUBMIT")) {
logWarning("WARNING: This client is deprecated and will be removed in a " +
"future version of Spark. Use ./bin/spark-submit with "--master yarn"")
}
// Set an env variable indicating we are running in YARN mode.
// Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes
System.setProperty("SPARK_YARN_MODE", "true")
val sparkConf = new SparkConf
val args = new ClientArguments(argStrings, sparkConf)
// to maintain backwards-compatibility
if (!Utils.isDynamicAllocationEnabled(sparkConf)) {
sparkConf.setIfMissing("spark.executor.instances", args.numExecutors.toString)
}
new Client(args, sparkConf).run()
}
/**
* Submit an application to the ResourceManager.
* If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
* reporting the application's status until the application has exited for any reason.
* Otherwise, the client process will exit after submission.
* If the application finishes with a failed, killed, or undefined status,
* throw an appropriate SparkException.
*/
def run(): Unit = {
this.appId = submitApplication()
if (!launcherBackend.isConnected() && fireAndForget) {
val report = getApplicationReport(appId)
val state = report.getYarnApplicationState
logInfo(s"Application report for $appId (state: $state)")
logInfo(formatReportDetails(report))
if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
throw new SparkException(s"Application $appId finished with status: $state")
}
} else {
val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
if (yarnApplicationState == YarnApplicationState.FAILED ||
finalApplicationStatus == FinalApplicationStatus.FAILED) {
throw new SparkException(s"Application $appId finished with failed status")
}
if (yarnApplicationState == YarnApplicationState.KILLED ||
finalApplicationStatus == FinalApplicationStatus.KILLED) {
throw new SparkException(s"Application $appId is killed")
}
if (finalApplicationStatus == FinalApplicationStatus.UNDEFINED) {
throw new SparkException(s"The final status of application $appId is undefined")
}
}
}
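- Whether the submitting process waits for the application to finish is governed by spark.yarn.submit.waitAppCompletion, mentioned in the comment above. For example, to have the client return right after submission in yarn-cluster mode (the rest of the command is elided):
./bin/spark-submit --master yarn --deploy-mode cluster \
  --conf spark.yarn.submit.waitAppCompletion=false \
  ...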
- Both modes eventually call submitApplication to submit the application to YARN; that method will be covered in detail in a separate post.