spark-submit方式提交應用
啓動腳本文件
# Sample command line for submitting a Spark application:
#./bin/spark-submit \
# --class com.imooc.spark.Test.TestOfSparkContext2 \
# --conf spark.master=spark://localhost:7077 \
# --master local[2] \
# /home/hadoop/data/test-jar/sql-1.0.jar arg1 arg2
#
# Resolve SPARK_HOME when the caller has not exported it already.
if [ -z "${SPARK_HOME}" ]; then
source "$(dirname "$0")"/find-spark-home
fi
# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0
# Replace this shell with the JVM launcher, forwarding every CLI argument.
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"
構建入口類
構建SparkSubmit
對象,它是一個Spark應用程序的入口類(工具類),可以完成提交、停止、查詢狀態等功能。
override def main(args: Array[String]): Unit = {
// Build a customized SparkSubmit: an anonymous subclass is used so that a few
// methods can be overridden without touching the base class.
val submit = new SparkSubmit() {
self => // alias for this anonymous SparkSubmit, captured by the nested class below
// Return an argument parser whose logging is redirected to this
// SparkSubmit's own logInfo/logWarning (which print via printMessage).
override protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
new SparkSubmitArguments(args) {
override protected def logInfo(msg: => String): Unit = self.logInfo(msg)
override protected def logWarning(msg: => String): Unit = self.logWarning(msg)
}
}
override protected def logInfo(msg: => String): Unit = printMessage(msg)
override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg")
// Wrap doSubmit in try...catch so a SparkUserAppException exits the process
// with the application's exit code instead of propagating a stack trace.
override def doSubmit(args: Array[String]): Unit = {
try {
super.doSubmit(args)
} catch {
case e: SparkUserAppException =>
exitFn(e.exitCode)
}
}
}
submit.doSubmit(args)
}
在SparkSubmit類對象主要有2個功能,一個是完成參數的解析及加載,一個是嘗試提交、停止、查詢某一個Spark應用。
參數解析及加載
參數的解析及加載過程通過SparkSubmitArguments(...)
完成,詳細的主流程如下代碼片段。這段代碼也告訴了我們Spark加載參數的順序,根據參數的作用優先級低到高排列如下:
- 從通過參數
--properties-file
指定的文件中加載配置信息,作爲默認的屬性 - 加載用戶通過命令行指定的各項屬性,包括
--conf | --jars | --class
等,作爲命令行屬性 - 如果用戶沒有通過參數
--properties-file
,指定屬性文件,則加載環境變量SPARK_CONF_DIR
指定的路徑或是${SPARK_HOME}/conf
路徑下的spark-defaults.conf
文件中的配置信息,並與前面所有讀取的屬性合併 - 加載通過環境變量指定的各種屬性,後續在訪問每個變量時,優先使用相應的環境變量
此方法會盡最大可能地解析符合Spark定義的參數信息,而忽略掉那些不符合規則(不以spark.開頭)的屬性,最終如果參數解析完成就會生成有效的類對象,否則會輸出有效的提示信息並退出當前進程。
/** args: all command-line tokens the launch script received after bin/spark-submit. **/
private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, String] = sys.env)
extends SparkSubmitArgumentsParser with Logging {
/** Default properties present in the currently defined defaults file. */
lazy val defaultSparkProperties: HashMap[String, String] = {
val defaultProperties = new HashMap[String, String]()
if (verbose) {
logInfo(s"Using properties file: $propertiesFile")
}
// propertiesFile may be null when --properties-file was not given; Option(...)
// maps that null to None so the load is skipped entirely.
Option(propertiesFile).foreach { filename =>
val properties = Utils.getPropertiesFromFile(filename)
properties.foreach { case (k, v) =>
defaultProperties(k) = v
}
// Property files may contain sensitive information, so redact before printing
if (verbose) {
Utils.redact(properties).foreach { case (k, v) =>
logInfo(s"Adding default property: $k=$v")
}
}
}
defaultProperties
}
// Constructor body: each step below refines the argument state in order.
// Set parameters from command line arguments
parse(args.asJava)
// Populate `sparkProperties` map from properties file
mergeDefaultSparkProperties()
// Remove keys that don't start with "spark." from `sparkProperties`.
ignoreNonSparkProperties()
// Use `sparkProperties` map along with env vars to fill in any missing parameters
loadEnvironmentArguments()
// REST submission gateway is opt-in via spark.master.rest.enabled (default false).
useRest = sparkProperties.getOrElse("spark.master.rest.enabled", "false").toBoolean
validateArguments()
執行命令
從下面代碼可以看到Spark CLI支持4種操作,但這裏主要關注submit
流程,其它方法暫不深究,詳細的分析見下一小節。
def doSubmit(args: Array[String]): Unit = {
  // Set up logging on first use, and remember whether it still has to be
  // reset before the user application starts.
  val uninitLog = initializeLogIfNecessary(true, silent = true)

  val parsedArgs = parseArguments(args)
  if (parsedArgs.verbose) logInfo(parsedArgs.toString)

  // Dispatch to the CLI operation the parsed arguments request.
  parsedArgs.action match {
    case SparkSubmitAction.SUBMIT => submit(parsedArgs, uninitLog)
    case SparkSubmitAction.KILL => kill(parsedArgs)
    case SparkSubmitAction.REQUEST_STATUS => requestStatus(parsedArgs)
    case SparkSubmitAction.PRINT_VERSION => printVersion()
  }
}
提交任務
submit(...)
方法主要的功能就是,解析參數、構建和執行用戶指定的入口類或是Spark內部類。有關加載各種參數的過程在前面的小節已經分析過,這裏我們主要看一下runMain(...)
方法。
/**
 * Submit the application using the provided parameters.
 *
 * This runs in two steps. First, we prepare the launch environment by setting up
 * the appropriate classpath, system properties, and application arguments for
 * running the child main class based on the cluster manager and the deploy mode.
 * Second, we use this launch environment to invoke the main method of the child
 * main class.
 */
@tailrec
private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
// Derive the "child" launch pieces — the child is the process/class that will
// run the user-specified entry point:
// childArgs — arguments forwarded to the child main class;
// childClasspath — the application jar plus entries from spark.jars / --jars
// (NOTE(review): in client mode remote jars are reportedly fetched first; the
// original comment was truncated — confirm against prepareSubmitEnvironment).
val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args)
// Runs the child, optionally wrapped in a Hadoop proxy-user context.
def doRunMain(): Unit = {
if (args.proxyUser != null) {
val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
UserGroupInformation.getCurrentUser())
try {
proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
override def run(): Unit = {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
})
} catch {
case e: Exception =>
// Hadoop's AuthorizationException suppresses the exception's stack trace, which
// makes the message printed to the output by the JVM not very helpful. Instead,
// detect exceptions with empty stack traces here, and treat them differently.
if (e.getStackTrace().length == 0) {
error(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}")
} else {
throw e
}
}
} else {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
}
// In standalone cluster mode, there are two submission gateways:
// (1) The traditional RPC gateway using o.a.s.deploy.Client as a wrapper
// (2) The new REST-based gateway introduced in Spark 1.3
// The latter is the default behavior as of Spark 1.3, but Spark submit will fail over
// to use the legacy gateway if the master endpoint turns out to be not a REST server.
if (args.isStandaloneCluster && args.useRest) {
try {
logInfo("Running Spark using the REST application submission protocol.")
doRunMain()
} catch {
// Fail over to use the legacy submission gateway; the recursive call is the
// @tailrec call above, re-entered with useRest disabled.
case e: SubmitRestConnectionException =>
logWarning(s"Master endpoint ${args.master} was not a REST server. " +
"Falling back to legacy submission gateway instead.")
args.useRest = false
submit(args, false)
}
// In all other modes, just run the main class as prepared
} else {
doRunMain()
}
}
runMain方法
此方法通過反射的方式,生成用戶指定的入口類或是Spark的內置類,由於生成的類可能是實現了SparkApplication
接口的子類抑或是一個自定義的類,因此需要根據這兩種情況選擇是直接執行生成類的start(...)
方法,還是間接調用生成類的main(...)
方法,同時傳遞所有解析到的spark參數及需要應用接收的各個args。
/**
 * Run the main method of the child class using the provided launch environment.
 *
 * Note that this main class will not be the one provided by the user if we're
 * running cluster deploy mode or python applications.
 */
private def runMain(
childArgs: Seq[String],
childClasspath: Seq[String],
sparkConf: SparkConf,
childMainClass: String,
verbose: Boolean): Unit = {
// ... (elided here) the logic that adds jars to the JVM classpath; the
// ordering priority depends on whether the user set
// spark.driver.userClassPathFirst.
var mainClass: Class[_] = null
// Resolve the child main class reflectively; a load failure exits with
// CLASS_NOT_FOUND_EXIT_STATUS after printing a build hint where applicable.
try {
mainClass = Utils.classForName(childMainClass)
} catch {
case e: ClassNotFoundException =>
logWarning(s"Failed to load $childMainClass.", e)
if (childMainClass.contains("thriftserver")) {
logInfo(s"Failed to load main class $childMainClass.")
logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
}
throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
case e: NoClassDefFoundError =>
logWarning(s"Failed to load $childMainClass: ${e.getMessage()}")
if (e.getMessage.contains("org/apache/hadoop/hive")) {
logInfo(s"Failed to load hive class.")
logInfo("You need to build Spark with -Phive and -Phive-thriftserver.")
}
throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS)
}
// If the class implements SparkApplication, instantiate it directly;
// otherwise wrap its static main() in a JavaMainApplication adapter.
val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
mainClass.newInstance().asInstanceOf[SparkApplication]
} else {
// SPARK-4170
if (classOf[scala.App].isAssignableFrom(mainClass)) {
logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
}
new JavaMainApplication(mainClass)
}
// Unwrap reflection/proxy wrappers so the user's real exception surfaces.
@tailrec
def findCause(t: Throwable): Throwable = t match {
case e: UndeclaredThrowableException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: InvocationTargetException =>
if (e.getCause() != null) findCause(e.getCause()) else e
case e: Throwable =>
e
}
try {
app.start(childArgs.toArray, sparkConf)
} catch {
case t: Throwable =>
throw findCause(t)
}
}
到此,整個通過spark-submit命令提交任務的流程已簡單剖析完畢,更詳細的內容讀者可以自行查看源碼。
下一篇淺析創建Kubernetes任務的流程。