spark-submit方式提交應用
啓動腳本文件
# Sample command line for submitting a Spark application:
#./bin/spark-submit \
# --class com.imooc.spark.Test.TestOfSparkContext2 \
# --conf spark.master=spark://localhost:7077 \
# --master local[2] \
# /home/hadoop/data/test-jar/sql-1.0.jar arg1 arg2
#
# Resolve SPARK_HOME when the caller has not exported it already.
if [ -z "${SPARK_HOME}" ]; then
source "$(dirname "$0")"/find-spark-home
fi
# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0
# Replace this shell with the JVM launcher, forwarding every CLI argument.
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
構建入口類
構建SparkSubmit
對象,它是一個Spark應用程序的入口類(工具類),可以完成提交、停止、查詢狀態等功能。
override def main(args: Array[String]): Unit = {
// Build a customized SparkSubmit: an anonymous subclass is used so that a few
// methods can be overridden without touching the base class.
val submit = new SparkSubmit() {
self => // alias for this anonymous SparkSubmit, captured by the nested class below
// Return an argument parser whose logging is redirected to this
// SparkSubmit's own logInfo/logWarning (which print via printMessage).
override protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
new SparkSubmitArguments(args) {
override protected def logInfo(msg: => String): Unit = self.logInfo(msg)
override protected def logWarning(msg: => String): Unit = self.logWarning(msg)
}
}
override protected def logInfo(msg: => String): Unit = printMessage(msg)
override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg")
// Wrap doSubmit in try...catch so a SparkUserAppException exits the process
// with the application's exit code instead of propagating a stack trace.
override def doSubmit(args: Array[String]): Unit = {
try {
super.doSubmit(args)
} catch {
case e: SparkUserAppException =>
exitFn(e.exitCode)
}
}
}
submit.doSubmit(args)
}
在SparkSubmit類對象主要有2個功能,一個是完成參數的解析及加載,一個是嘗試提交、停止、查詢某一個Spark應用。
參數解析及加載
參數的解析及加載過程通過SparkSubmitArguments(...)
完成,詳細的主流程如下代碼片段。這段代碼也告訴了我們Spark加載參數的順序,根據參數的作用優先級低到高排列如下:
- 從通過參數
--properties-file
指定的文件中加載配置信息,作爲默認的屬性 - 加載用戶通過命令行指定的各項屬性,包括
--conf | --jars | --class
等,作爲命令行屬性 - 如果用戶沒有通過參數
--properties-file
,指定屬性文件,則加載環境變量SPARK_CONF_DIR
指定的路徑或是${SPARK_HOME}/conf
路徑下的spark-defaults.conf
文件中的配置信息,並與前面所有讀取的屬性合併 - 加載通過環境變量指定的各種屬性,後續在訪問每個變量時,優先使用相應的環境變量
此方法會盡最大可能地解析符合Spark定義的參數信息,而忽略掉那些不符合規則(不以spark.開頭)的屬性,最終如果參數解析完成就會生成有效的類對象,否則會輸出有效的提示信息並退出當前進程。
/** args: all command-line tokens the launch script received after bin/spark-submit. **/
private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, String] = sys.env)
extends SparkSubmitArgumentsParser with Logging {
/** Default properties present in the currently defined defaults file. */
lazy val defaultSparkProperties: HashMap[String, String] = {
val defaultProperties = new HashMap[String, String]()
if (verbose) {
logInfo(s"Using properties file: $propertiesFile")
}
// propertiesFile may be null when --properties-file was not given; Option(...)
// maps that null to None so the load is skipped entirely.
Option(propertiesFile).foreach { filename =>
val properties = Utils.getPropertiesFromFile(filename)
properties.foreach { case (k, v) =>
defaultProperties(k) = v
}
// Property files may contain sensitive information, so redact before printing
if (verbose) {
Utils.redact(properties).foreach { case (k, v) =>
logInfo(s"Adding default property: $k=$v")
}
}
}
defaultProperties
}
// Constructor body: each step below refines the argument state in order.
// Set parameters from command line arguments
parse(args.asJava)
// Populate `sparkProperties` map from properties file
mergeDefaultSparkProperties()
// Remove keys that don't start with "spark." from `sparkProperties`.
ignoreNonSparkProperties()
// Use `sparkProperties` map along with env vars to fill in any missing parameters
loadEnvironmentArguments()
// REST submission gateway is opt-in via spark.master.rest.enabled (default false).
useRest = sparkProperties.getOrElse("spark.master.rest.enabled", "false").toBoolean
validateArguments()
執行命令
從下面代碼可以看到Spark CLI支持4種操作,但這裏主要關注submit
流程,其它方法暫不深究,詳細的分析見下一小節。
def doSubmit(args: Array[String]): Unit = {
  // Set up logging on first use, and remember whether it still has to be
  // reset before the user application starts.
  val uninitLog = initializeLogIfNecessary(true, silent = true)

  val parsedArgs = parseArguments(args)
  if (parsedArgs.verbose) logInfo(parsedArgs.toString)

  // Dispatch to the CLI operation the parsed arguments request.
  parsedArgs.action match {
    case SparkSubmitAction.SUBMIT => submit(parsedArgs, uninitLog)
    case SparkSubmitAction.KILL => kill(parsedArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(parsedArgs)
    case SparkSubmitAction.PRINT_VERSION => printVersion()
  }
}
提交任務
submit(...)
方法主要的功能就是,解析參數、構建和執行用戶指定的入口類或是Spark內部類。有關加載各種參數的過程在前面的小節已經分析過,這裏我們主要看一下runMain(...)
方法。
/**
 * Submit the application using the provided parameters.
 *
 * This runs in two steps. First, we prepare the launch environment by setting up
 * the appropriate classpath, system properties, and application arguments for
 * running the child main class based on the cluster manager and the deploy mode.
 * Second, we use this launch environment to invoke the main method of the child
 * main class.
 */
@tailrec
private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
// Derive the "child" launch pieces — the child is the process/class that will
// run the user-specified entry point:
// childArgs — arguments forwarded to the child main class;
// childClasspath — the application jar plus entries from spark.jars / --jars
// (NOTE(review): in client mode remote jars are reportedly fetched first; the
// original comment was truncated — confirm against prepareSubmitEnvironment).
val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args)
// Runs the child, optionally wrapped in a Hadoop proxy-user context.
def doRunMain(): Unit = {
if (args.proxyUser != null) {
val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
UserGroupInformation.getCurrentUser())
try {
proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
override def run(): Unit = {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
})
} catch {
case e: Exception =>
// Hadoop's AuthorizationException suppresses the exception's stack trace, which
// makes the message printed to the output by the JVM not very helpful. Instead,
// detect exceptions with empty stack traces here, and treat them differently.
if (e.getStackTrace().length == 0) {
error(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
} else {
throw e
}
}
} else {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
}
// In standalone cluster mode, there are two submission gateways:
// (1) The traditional RPC gateway using o.a.s.deploy.Client as a wrapper
// (2) The new REST-based gateway introduced in Spark 1.3
// The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
// to use the legacy gateway if the master endpoint turns out to be not a REST server.
if (args.isStandaloneCluster && args.useRest) {
try {
logInfo("Running Spark using the REST application submission protocol.")
doRunMain()
} catch {
// Fail over to use the legacy submission gateway; the recursive call is the
// @tailrec call above, re-entered with useRest disabled.
case e: SubmitRestConnectionException =>
logWarning(s"Master endpoint ${args.master} was not a REST server. " +
"Falling back to legacy submission gateway instead.")
args.useRest = false
submit(args, false)
}
// In all other modes, just run the main class as prepared
} else {
doRunMain()
}
}
runMain方法
此方法通過反射的方式,生成用戶指定的入口類或是Spark的內置類,由於生成的類可能是實現了SparkApplication
接口的子類抑或是一個自定義的類,因此需要根據這兩種情況選擇是直接執行生成類的start(...)
方法,還是間接調用生成類的main(...)
方法,同時傳遞所有解析到的spark參數及需要應用接收的各個args。
/**
 * Run the main method of the child class using the provided launch environment.
 *
 * Note that this main class will not be the one provided by the user if we're
 * running cluster deploy mode or python applications.
 */
private def runMain(
childArgs: Seq[String],
childClasspath: Seq[String],
sparkConf: SparkConf,
childMainClass: String,
verbose: Boolean): Unit = {
// ... (elided here) the logic that adds jars to the JVM classpath; the
// ordering priority depends on whether the user set
// spark.driver.userClassPathFirst.
var mainClass: Class[_] = null
// Resolve the child main class reflectively; a load failure exits with
// CLASS_NOT_FOUND_EXIT_STATUS after printing a build hint where applicable.
try {
mainClass = Utils.classForName(childMainClass)
} catch {
case e: ClassNotFoundException =>
logWarning(s"Failed to load $childMainClass.", e)
if (childMainClass.contains("thriftserver")) {
logInfo(s"Failed to load main class $childMainClass.")
logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
}
throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
case e: NoClassDefFoundError =>
logWarning(s"Failed to load $childMainClass: ${e.getMessage()}")
if (e.getMessage.contains("org/apache/hadoop/hive")) {
logInfo(s"Failed to load hive class.")
logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
}
throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
}
// If the class implements SparkApplication, instantiate it directly;
// otherwise wrap its static main() in a JavaMainApplication adapter.
val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
mainClass.newInstance().asInstanceOf[SparkApplication]
} else {
// SPARK-4170
if (classOf[scala.App].isAssignableFrom(mainClass)) {
logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
}
new JavaMainApplication(mainClass)
}
// Unwrap reflection/proxy wrappers so the user's real exception surfaces.
@tailrec
def findCause(t: Throwable): Throwable = t match {
case e: UndeclaredThrowableException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: InvocationTargetException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: Throwable =>
e
}
try {
app.start(childArgs.toArray, sparkConf)
} catch {
case t: Throwable =>
throw findCause(t)
}
}
到此,整個通過spark-submit命令提交任務的流程已簡單剖析完畢,更詳細的內容讀者可以自行查看源碼。
下一篇淺析創建Kubernetes任務的流程。