• Source-code analysis of org.apache.spark.launcher.Main


     public static void main(String[] argsArray) throws Exception {
        //org.apache.spark.launcher.Main
        checkArgument(argsArray.length > 0, "Not enough arguments: missing class name.");
      /**
      * Invoked roughly as:
      *   java -cp spark_home/lib/spark-assembly-1.6.0-hadoop2.6.0.jar org.apache.spark.launcher.Main org.apache.spark.deploy.SparkSubmit
      *        --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077
      * This main method ultimately hands "org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main
      * --name "Spark shell" --master spark://luyl152:7077" back to spark-class, which runs it via exec "${CMD[@]}".
      */
    
    
        List<String> args = new ArrayList<>(Arrays.asList(argsArray));
        // For a spark-shell launch the first argument is org.apache.spark.deploy.SparkSubmit
        String className = args.remove(0); // remove the org.apache.spark.deploy.SparkSubmit argument from args
        // Exporting SPARK_PRINT_LAUNCH_COMMAND=<any non-empty value> in spark-class (or another config file)
        // makes the final cmd get printed, i.e. the value spark-class executes with exec "${CMD[@]}"
        boolean printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));
        AbstractCommandBuilder builder; // the command builder/parser to use
        if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
          try {
            // Parse the arguments into the corresponding Spark fields, e.g. the value of --class goes into mainClass.
            // Any extra arguments are collected into the sparkArgs list of SparkSubmitCommandBuilder.
            builder = new SparkSubmitCommandBuilder(args);
          } catch (IllegalArgumentException e) {
            printLaunchCommand = false;
            System.err.println("Error: " + e.getMessage());
            System.err.println();
    
            MainClassOptionParser parser = new MainClassOptionParser();
            try {
              parser.parse(args);
            } catch (Exception ignored) {
              // Ignore parsing exceptions.
            }
    
            List<String> help = new ArrayList<>();
            if (parser.className != null) {
              help.add(parser.CLASS);
              help.add(parser.className);
            }
            help.add(parser.USAGE_ERROR);
            builder = new SparkSubmitCommandBuilder(help);
          }
        } else {
          // If the first argument is not org.apache.spark.deploy.SparkSubmit, use SparkClassCommandBuilder as the parser instead
          builder = new SparkClassCommandBuilder(className, args);
        }

        Map<String, String> env = new HashMap<>();
        List<String> cmd = builder.buildCommand(env); // overridden in SparkSubmitCommandBuilder; not the abstract class's version
        if (printLaunchCommand) {
          System.err.println("Spark Command: " + join(" ", cmd));
          System.err.println("========================================");
        }

        if (isWindows()) {
          System.out.println(prepareWindowsCommand(cmd, env));
        } else {
          // In bash, use NULL as the arg separator since it cannot be used in an argument.

          // The valid arguments are returned by printing them, to be picked up by spark-class's exec "${CMD[@]}"
          /** '\0' and a space are not the same thing.
           *  '\0' is the NUL / string-terminator character, while a space is an ordinary printable character.
           *  '\0' has ASCII code 0 and the space has ASCII code 32; they are different characters.
           *  Programs conventionally use '\0' to mark the end of a string, whereas a space is just another text character.
           */

          List<String> bashCmd = prepareBashCommand(cmd, env);
          for (String c : bashCmd) {
            System.out.print(c); // print each command token; spark-class captures this output into CMD
            System.out.print('\0');
          }
        }
      }
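
Why print with a '\0' separator rather than a space: the consumer is spark-class, which reads this stream back into a bash array and finally runs exec "${CMD[@]}", so the separator must be a byte that can never occur inside an argument (an argument such as --name "Spark shell" may itself contain spaces). The following is a minimal standalone sketch, not Spark code, of that consumer logic: it rebuilds the argv list from a NUL-separated stream.

    import java.util.ArrayList;
    import java.util.List;

    /**
     * Minimal sketch (not Spark code): recover the argv list from the NUL-separated
     * stream that org.apache.spark.launcher.Main prints to stdout.
     */
    public class NulSeparatedCommandDemo {
      public static void main(String[] args) {
        // What launcher.Main would have printed: tokens joined by '\0'.
        String launcherOutput =
            "java\0-Xmx1g\0org.apache.spark.deploy.SparkSubmit\0--class\0org.apache.spark.repl.Main\0--name\0Spark shell\0";

        List<String> cmd = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        for (char c : launcherOutput.toCharArray()) {
          if (c == '\0') {              // NUL terminates one argument; spaces inside an argument survive intact
            cmd.add(current.toString());
            current.setLength(0);
          } else {
            current.append(c);
          }
        }

        // spark-class would now run the equivalent of: exec "${CMD[@]}"
        System.out.println(cmd);        // [java, -Xmx1g, ..., --name, Spark shell]
      }
    }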
     // org.apache.spark.launcher.SparkSubmitCommandBuilder
      SparkSubmitCommandBuilder(List<String> args) {
        // args looks like: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077
        // or: pyspark-shell-main --name PySparkShell
        // or: --master yarn --deploy-mode cluster --name Hbase --verbose --conf xxxx
        // i.e. for PYSPARK_SHELL, SPARKR_SHELL and RUN_EXAMPLE launches, the first argument is the corresponding marker name
        this.allowsMixedArguments = false;
        this.sparkArgs = new ArrayList<>();
        boolean isExample = false;
        List<String> submitArgs = args;
    
        if (args.size() > 0) {
          switch (args.get(0)) {
            // the first argument is pyspark-shell-main when launched from Python
            case PYSPARK_SHELL:
              this.allowsMixedArguments = true;
              appResource = PYSPARK_SHELL;
              submitArgs = args.subList(1, args.size());
              break;
    
            case SPARKR_SHELL: //"sparkr-shell-main"
              this.allowsMixedArguments = true;
              appResource = SPARKR_SHELL;
              submitArgs = args.subList(1, args.size());
              break;
    
            case RUN_EXAMPLE:
              isExample = true;
              submitArgs = args.subList(1, args.size());
          }
    
          this.isExample = isExample;
          // Assign the values passed in by spark-submit to the corresponding Spark fields, e.g. the value of --class goes into mainClass.
          // submitArgs has the PYSPARK_SHELL / SPARKR_SHELL / RUN_EXAMPLE marker argument stripped off.
          // OptionParser is an inner class of org.apache.spark.launcher.SparkSubmitCommandBuilder:
          //   private class OptionParser extends SparkSubmitOptionParser
          // and overrides the handle* family of methods.
          OptionParser parser = new OptionParser();
          parser.parse(submitArgs); // parse() itself is implemented in the parent class SparkSubmitOptionParser
          this.isAppResourceReq = parser.isAppResourceReq;
        }  else {
          this.isExample = isExample;
          this.isAppResourceReq = false;
        }
      }
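
The same SparkSubmitCommandBuilder machinery also backs the public org.apache.spark.launcher.SparkLauncher API, the programmatic alternative to the spark-submit script. Below is a hedged illustration; the jar path, main class and master URL are placeholders, not values taken from this article.

    import org.apache.spark.launcher.SparkAppHandle;
    import org.apache.spark.launcher.SparkLauncher;

    /** Sketch: launch an application through the public launcher API (placeholder values). */
    public class LauncherApiExample {
      public static void main(String[] args) throws Exception {
        SparkAppHandle handle = new SparkLauncher()
            .setAppResource("/path/to/my-app.jar")        // placeholder application jar
            .setMainClass("com.example.MyApp")            // placeholder main class
            .setMaster("spark://ip:7077")
            .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
            .startApplication();                          // spawns spark-submit as a child process

        while (!handle.getState().isFinal()) {            // poll until the application reaches a final state
          Thread.sleep(1000);
        }
        System.out.println("Final state: " + handle.getState());
      }
    }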
     /**
       * Parse a list of spark-submit command line options.
       * <p>
       * See SparkSubmitArguments.scala for a more formal description of available options.
       *
       * @throws IllegalArgumentException If an error is found during parsing.
       *  The arguments look like: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://ip:7077.
       *  The method assigns the values passed in by spark-submit to the corresponding Spark fields,
       *  e.g. the value of --class goes into the mainClass field.
       *  org.apache.spark.launcher.SparkSubmitOptionParser#parse
      */
      
      protected final void parse(List<String> args) {
        // spark-submit accepts SparkConf settings via --conf PROP=VALUE; see the end of org.apache.spark.deploy.SparkSubmitArguments
        // for the full list of options, or run spark-submit -h
        Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");
    
        int idx = 0;
        for (idx = 0; idx < args.size(); idx++) {
          String arg = args.get(idx);
          String value = null;
    
          Matcher m = eqSeparatedOpt.matcher(arg);
          if (m.matches()) {
            arg = m.group(1);   // the option name, e.g. "--master" from "--master=yarn"
            value = m.group(2); // the option value, e.g. "yarn"
          }
    
          // Look for options with a value.
          // findCliOption matches an arg that starts with "--" (e.g. "--class") against the opts table;
          // it returns the canonical option name on a match, or null otherwise
          String name = findCliOption(arg, opts);
          if (name != null) {
            if (value == null) {
              if (idx == args.size() - 1) { // matched but no value follows: error out, e.g. a bare "--class" (size 1, idx 0, 1-1=0)
                throw new IllegalArgumentException(
                    String.format("Missing argument for option '%s'.", arg));
              }
              idx++;
              value = args.get(idx); // otherwise the next element is the option's value
            }
            // name is the spark-submit option such as --class, and value is its argument.
            // OptionParser is an inner class of org.apache.spark.launcher.SparkSubmitCommandBuilder:
            //   private class OptionParser extends SparkSubmitOptionParser
            // Its handle() override assigns the value to the corresponding Spark field,
            // e.g. the value of --class goes into mainClass (the implementation is trivial, so it is not shown here).
            if (!handle(name, value)) { // calls the handle() overridden in OptionParser
              break;
            }
            continue;
          }
    
          // Look for a switch.
          // If nothing matched above, check whether the arg is a switch such as --verbose
          name = findCliOption(arg, switches);
          if (name != null) {
            if (!handle(name, null)) { // calls the handle() overridden in OptionParser
              break;
            }
            continue;
          }
    
          if (!handleUnknown(arg)) { // calls the handleUnknown() overridden in OptionParser
            break;
          }
        }
    
        if (idx < args.size()) {
          idx++;
        }
        // Append any remaining arguments to sparkArgs (initialized in the constructor: this.sparkArgs = new ArrayList<>())
        handleExtraArgs(args.subList(idx, args.size())); // calls the handleExtraArgs() overridden in OptionParser
      }
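
A quick standalone sketch of the eqSeparatedOpt regex in action, showing why a bare option such as --master has to take its value from the next list element instead:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /** Sketch: how parse() splits "--opt=value" style arguments. */
    public class EqSeparatedOptDemo {
      public static void main(String[] args) {
        Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");

        // "--master=yarn" is split into the option name and its value ...
        Matcher m = eqSeparatedOpt.matcher("--master=yarn");
        if (m.matches()) {
          System.out.println(m.group(1) + " -> " + m.group(2));            // --master -> yarn
        }

        // ... while a bare "--master" does not match, so parse() takes the
        // next element of the argument list as the value.
        System.out.println(eqSeparatedOpt.matcher("--master").matches());  // false
      }
    }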
    // Definitions of some of the option keys
     protected final String CLASS = "--class";
      protected final String CONF = "--conf";
      protected final String DEPLOY_MODE = "--deploy-mode";
      protected final String DRIVER_CLASS_PATH = "--driver-class-path";
      protected final String DRIVER_CORES = "--driver-cores";
      protected final String DRIVER_JAVA_OPTIONS =  "--driver-java-options";
      protected final String DRIVER_LIBRARY_PATH = "--driver-library-path";
      protected final String DRIVER_MEMORY = "--driver-memory";
      protected final String EXECUTOR_MEMORY = "--executor-memory";
      protected final String FILES = "--files";
      protected final String JARS = "--jars";
      protected final String KILL_SUBMISSION = "--kill";
      protected final String MASTER = "--master";
      protected final String NAME = "--name";
      protected final String PACKAGES = "--packages";
      protected final String PACKAGES_EXCLUDE = "--exclude-packages";
      protected final String PROPERTIES_FILE = "--properties-file";
      protected final String PROXY_USER = "--proxy-user";
      protected final String PY_FILES = "--py-files";
      protected final String REPOSITORIES = "--repositories";
      protected final String STATUS = "--status";
      protected final String TOTAL_EXECUTOR_CORES = "--total-executor-cores";
    
      // Options that do not take arguments.
      protected final String HELP = "--help";
      protected final String SUPERVISE = "--supervise";
      protected final String USAGE_ERROR = "--usage-error";
      protected final String VERBOSE = "--verbose";
      protected final String VERSION = "--version";
    
      // Standalone-only options.
    
      // YARN-only options.
      protected final String ARCHIVES = "--archives";
      protected final String EXECUTOR_CORES = "--executor-cores";
      protected final String KEYTAB = "--keytab";
      protected final String NUM_EXECUTORS = "--num-executors";
      protected final String PRINCIPAL = "--principal";
      protected final String QUEUE = "--queue";
    
      /**
       * This is the canonical list of spark-submit options. Each entry in the array contains the
       * different aliases for the same option; the first element of each entry is the "official"
       * name of the option, passed to {@link #handle(String, String)}.
       * <p>
       * Options not listed here nor in the "switch" list below will result in a call to
       * {@link #handleUnknown(String)}.
       * <p>
       * These two arrays are visible for tests.
       */
      final String[][] opts = {
        { ARCHIVES },
        { CLASS },
        { CONF, "-c" },
        { DEPLOY_MODE },
        { DRIVER_CLASS_PATH },
        { DRIVER_CORES },
        { DRIVER_JAVA_OPTIONS },
        { DRIVER_LIBRARY_PATH },
        { DRIVER_MEMORY },
        { EXECUTOR_CORES },
        { EXECUTOR_MEMORY },
        { FILES },
        { JARS },
        { KEYTAB },
        { KILL_SUBMISSION },
        { MASTER },
        { NAME },
        { NUM_EXECUTORS },
        { PACKAGES },
        { PACKAGES_EXCLUDE },
        { PRINCIPAL },
        { PROPERTIES_FILE },
        { PROXY_USER },
        { PY_FILES },
        { QUEUE },
        { REPOSITORIES },
        { STATUS },
        { TOTAL_EXECUTOR_CORES },
      };
    
      /**
       * List of switches (command line options that do not take parameters) recognized by spark-submit.
       */
      final String[][] switches = {
        { HELP, "-h" },
        { SUPERVISE },
        { USAGE_ERROR },
        { VERBOSE, "-v" },
        { VERSION },
      };
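
findCliOption() itself is not shown in the excerpt; based on how parse() uses it together with the opts and switches tables above, it is essentially an alias lookup. The following is a reconstruction, a sketch rather than the verbatim Spark source:

    /** Sketch of the alias lookup used by parse(): return the canonical (first) name of a matching entry. */
    public class FindCliOptionSketch {

      static String findCliOption(String name, String[][] available) {
        for (String[] candidates : available) {
          for (String candidate : candidates) {
            if (candidate.equals(name)) {
              return candidates[0];   // e.g. "-c" resolves to "--conf"
            }
          }
        }
        return null;                  // unknown option; parse() then tries the switches table / handleUnknown()
      }

      public static void main(String[] args) {
        String[][] opts = { { "--conf", "-c" }, { "--class" } };
        System.out.println(findCliOption("-c", opts));       // --conf
        System.out.println(findCliOption("--class", opts));  // --class
        System.out.println(findCliOption("--bogus", opts));  // null
      }
    }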
    private class OptionParser extends SparkSubmitOptionParser {
    
        boolean isAppResourceReq = true;
        /**
         * Assigns the values passed in by spark-submit to the corresponding Spark fields.
         */
        @Override
        protected boolean handle(String opt, String value) {
          switch (opt) {
            case MASTER:
              master = value;
              break;
            case DEPLOY_MODE:
              deployMode = value;
              break;
            case PROPERTIES_FILE:
              propertiesFile = value;
              break;
            case DRIVER_MEMORY:
              conf.put(SparkLauncher.DRIVER_MEMORY, value);
              break;
            case DRIVER_JAVA_OPTIONS:
              conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, value);
              break;
            case DRIVER_LIBRARY_PATH:
              conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, value);
              break;
            case DRIVER_CLASS_PATH:
              conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, value);
              break;
            case CONF:
              String[] setConf = value.split("=", 2);
              checkArgument(setConf.length == 2, "Invalid argument to %s: %s", CONF, value);
              conf.put(setConf[0], setConf[1]);
              break;
            case CLASS:
              // The special classes require some special command line handling, since they allow
              // mixing spark-submit arguments with arguments that should be propagated to the shell
              // itself. Note that for this to work, the "--class" argument must come before any
              // non-spark-submit arguments.
              mainClass = value;
              if (specialClasses.containsKey(value)) {
                allowsMixedArguments = true;
                appResource = specialClasses.get(value);
              }
              break;
    ................................................
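For reference, the specialClasses map consulted in the CLASS branch above is a small static table inside SparkSubmitCommandBuilder. The sketch below reconstructs it from the Spark 2.x source from memory, so treat the exact entries as an assumption rather than a verbatim copy:

    import java.util.HashMap;
    import java.util.Map;

    /** Sketch (reconstructed, not verbatim): main classes that get shell-style argument mixing. */
    class SpecialClassesSketch {
      static final Map<String, String> SPECIAL_CLASSES = new HashMap<>();
      static {
        // spark-shell's REPL main class is launched with the "spark-shell" pseudo-resource.
        SPECIAL_CLASSES.put("org.apache.spark.repl.Main", "spark-shell");
        // The SQL CLI and Thrift server are launched with the internal "spark-internal" placeholder resource.
        SPECIAL_CLASSES.put("org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver", "spark-internal");
        SPECIAL_CLASSES.put("org.apache.spark.sql.hive.thriftserver.HiveThriftServer2", "spark-internal");
      }
    }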
     @Override
      public List<String> buildCommand(Map<String, String> env) // the SparkSubmitCommandBuilder override of buildCommand(Map)
          throws IOException, IllegalArgumentException {
        // PYSPARK_SHELL marks a Python shell app, SPARKR_SHELL an R shell app
        if (PYSPARK_SHELL.equals(appResource) && isAppResourceReq) {
          return buildPySparkShellCommand(env);
        } else if (SPARKR_SHELL.equals(appResource) && isAppResourceReq) {
          return buildSparkRCommand(env);
        } else {
          // env here is an empty Map; this path falls through to buildSparkSubmitCommand()
          return buildSparkSubmitCommand(env);
        }
      }
    // org.apache.spark.launcher.SparkSubmitCommandBuilder#buildSparkSubmitCommand
      private List<String> buildSparkSubmitCommand(Map<String, String> env)
          throws IOException, IllegalArgumentException {
        // Load the properties file and check whether spark-submit will be running the app's driver
        // or just launching a cluster app. When running the driver, the JVM's argument will be
        // modified to cover the driver's configuration.
        Map<String, String> config = getEffectiveConfig();
        boolean isClientMode = isClientMode(config);
        // For standalone, anything that does not match --deploy-mode cluster defaults to client, so this is true here.
        // In client mode, DRIVER_EXTRA_CLASSPATH cannot be set directly in SparkConf, because the driver JVM has
        // already been started by spark-submit via reflection; it has to be passed with --driver-class-path instead.
        String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null;
    
        List<String> cmd = buildJavaCommand(extraClassPath);
        // Take Thrift Server as daemon
        if (isThriftServer(mainClass)) {
          addOptionString(cmd, System.getenv("SPARK_DAEMON_JAVA_OPTS"));
        }
        // SPARK_SUBMIT_OPTS is what spark-shell uses to pass the Java classpath setting on to Scala:
        // SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
        addOptionString(cmd, System.getenv("SPARK_SUBMIT_OPTS"));
    
        // We don't want the client to specify Xmx. These have to be set by their corresponding
        // memory flag --driver-memory or configuration entry spark.driver.memory
        String driverExtraJavaOptions = config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS);
        if (!isEmpty(driverExtraJavaOptions) && driverExtraJavaOptions.contains("Xmx")) {
          String msg = String.format("Not allowed to specify max heap(Xmx) memory settings through " +
                       "java options (was %s). Use the corresponding --driver-memory or " +
                       "spark.driver.memory configuration instead.", driverExtraJavaOptions);
          throw new IllegalArgumentException(msg);
        }
    
        if (isClientMode) {
          // Figuring out where the memory value come from is a little tricky due to precedence.
          // Precedence is observed in the following order:
          // - explicit configuration (setConf()), which also covers --driver-memory cli argument.
          // - properties file.
          // - SPARK_DRIVER_MEMORY env variable
          // - SPARK_MEM env variable
          // - default value (1g)
          // Take Thrift Server as daemon
          String tsMemory =
            isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null;
          String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY),
            System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM);
          cmd.add("-Xmx" + memory);//最大、小堆内存默认是1g
          addOptionString(cmd, driverExtraJavaOptions);
          mergeEnvPathList(env, getLibPathEnvName(),
            config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));
        }
    
        cmd.add("org.apache.spark.deploy.SparkSubmit");
        // buildSparkSubmitArgs() returns a list that re-emits the spark-submit options and values captured above
        cmd.addAll(buildSparkSubmitArgs());
        return cmd;
      }
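
The driver-memory precedence spelled out in the comments (explicit conf / --driver-memory, then the properties file, then SPARK_DRIVER_MEMORY, then SPARK_MEM, then the 1g default) can be reproduced with a tiny standalone sketch. firstNonEmpty below is a local helper written for the demo, not the Spark utility method itself, and the Thrift-server special case is left out:

    /** Sketch: driver memory resolution order applied by buildSparkSubmitCommand in client mode. */
    public class DriverMemoryPrecedenceDemo {

      // Demo helper: return the first argument that is neither null nor empty.
      static String firstNonEmpty(String... candidates) {
        for (String c : candidates) {
          if (c != null && !c.isEmpty()) {
            return c;
          }
        }
        return null;
      }

      public static void main(String[] args) {
        String fromConfOrCli = null;                  // spark.driver.memory / --driver-memory not set
        String fromPropertiesFile = "2g";             // e.g. from spark-defaults.conf
        String fromEnv = System.getenv("SPARK_DRIVER_MEMORY");
        String fromLegacyEnv = System.getenv("SPARK_MEM");
        String defaultMem = "1g";

        String memory = firstNonEmpty(fromConfOrCli, fromPropertiesFile, fromEnv, fromLegacyEnv, defaultMem);
        System.out.println("-Xmx" + memory);          // -Xmx2g in this example
      }
    }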
    // org.apache.spark.launcher.SparkSubmitCommandBuilder#buildSparkSubmitArgs
      List<String> buildSparkSubmitArgs() {
        List<String> args = new ArrayList<>();
        SparkSubmitOptionParser parser = new SparkSubmitOptionParser();
    
        if (!allowsMixedArguments && isAppResourceReq) {
          checkArgument(appResource != null, "Missing application resource.");
        }
    
        if (verbose) {
          args.add(parser.VERBOSE);
        }
    
        if (master != null) {
          args.add(parser.MASTER);
          args.add(master);
        }
    
        if (deployMode != null) {
          args.add(parser.DEPLOY_MODE);
          args.add(deployMode);
        }
    
        if (appName != null) {
          args.add(parser.NAME);
          args.add(appName);
        }
    
        for (Map.Entry<String, String> e : conf.entrySet()) {
          args.add(parser.CONF);
          args.add(String.format("%s=%s", e.getKey(), e.getValue()));
        }
    
        if (propertiesFile != null) {
          args.add(parser.PROPERTIES_FILE);
          args.add(propertiesFile);
        }
    
        if (isExample) {
          jars.addAll(findExamplesJars());
        }
    
        if (!jars.isEmpty()) {
          args.add(parser.JARS);
          args.add(join(",", jars));
        }
    
        if (!files.isEmpty()) {
          args.add(parser.FILES);
          args.add(join(",", files));
        }
    ...................................................
    // The resulting CMD looks like:
     /usr/java/jdk1.7.0_79/bin/java -cp /opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/jars/*:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/yarn-conf/ -Dscala.usejavacp=true -Xmx1g -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main --name Spark shell spark-shell
    //或者
    /usr/java/jdk1.7.0_79/bin/java -cp /opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/conf/:/opt/cloudera/parcels/SPARK2-2.0.0.cloudera.beta1-1.cdh5.7.0.p0.108015/lib/spark2/jars/*:/etc/hadoop/:/etc/hadoop/conf.cloudera.yarn/ -XX:MaxPermSize=256m org.apache.spark.deploy.SparkSubmit --master yarn --deploy-mode cluster --conf spark.driver.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* --conf spark.scheduler.mode=FAIR --conf spark.executorEnv.JAVA_HOME=/usr/java/jdk1.8 --conf spark.yarn.appMasterEnv.JAVA_HOME=/usr/java/jdk1.8 --conf spark.yarn.maxAppAttempts=1 --class opHbase.opHbase.TopHbase --name Hbase --verbose --files /etc/hadoop/conf/log4j.properties,/etc/hive/conf/hive-site.xml --jars hdfs://10.8.18.74:8020/ada/spark/share/tech_component/tc.plat.spark.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/bigdata4i-1.0.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/bigdata-sparklog-1.0.jar,hdfs://108474.server.bigdata.com.cn:8020/user/lyy/App/tc.app.test.opHbase-1.0.jar,hdfs://10.8.18.74:8020/ada/spark/share/tech_component/mysql-connector-java-5.1.24-bin.jar hdfs://108474.server.bigdata.com.cn:8020/user/lyy/App/opHbase.opHbase.jar loglevel=ALL path=hdfs://108474.server.bigdata.com.cn:8020/user/lyy/data/hfile hbtab=hbase_tes
  • Original article: https://www.cnblogs.com/lyy-blog/p/9724416.html