stage劃分算法總結
最後一個RDD創建finalstage
finalstage倒推
通過寬依賴,來進行新的stage劃分
使用遞歸,依次提交stage,從父stage開始
源碼位於 org.apache.spark.scheduler 包下的 DAGScheduler 類中
stage劃分算法由 submitStage和getMissingParentStages方法組成
第一步:將觸發job的最後一個RDD傳入newStage方法,創建finalStage
var finalStage: Stage = null
//創建一個stage對象,並且將stage加入到DAGscheduler中
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
第二步:用finalstage創建一個job,也就是說,這個job的最後一個stage,當然就是finalstage
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
第三步:將job加入到內存緩存中
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
第四步:使用submitStage方法提交finalstage(嘗試)
submitStage(finalStage)
//調用getMissingParentStages方法,去獲取當前stage的父stage
val missing = getMissingParentStages(stage).sortBy(_.id)
//在getMissingParentStages方法內部,首先往棧中推入當前stage的最後一個RDD
waitingForVisit.push(stage.rdd)
//然後進行while循環,調用自己內部定義的visit()方法
while (!waitingForVisit.isEmpty) {
visit(waitingForVisit.pop())
}
在visit()方法內,遍歷RDD的依賴
for (dep <- rdd.dependencies)
如果是窄依賴,那麼將依賴的RDD放入棧中
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
如果是寬依賴,那麼使用依賴的RDD創建一個新的stage,並且會將isShuffleMap設置爲true
(默認的最後一個stage不是shuffleMap stage)
除了finalstage都是shuffleMap stage
case shufDep: ShuffleDependency[_, _, _] =>
val mapStage = getShuffleMapStage(shufDep, stage.jobId)
if (missing == Nil) {
//如果沒有缺失的父stage,則直接提交當前stage
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get)
} else {
//遞歸調用submitStage方法去提交父stage
for (parent <- missing) {
submitStage(parent)
}
//並且將當前stage放入等待執行的stage隊列中
waitingStages += stage
}
/**
 * Submits a stage, first recursively submitting any parent stages that are still missing.
 *
 * When `activeJobForStage` yields no job for the stage, the stage is aborted instead.
 * When the stage is already contained in `waitingStages`, `runningStages` or
 * `failedStages`, nothing is done beyond the debug log line.
 *
 * @param stage the stage to submit
 */
private def submitStage(stage: Stage): Unit = {
  activeJobForStage(stage) match {
    case Some(activeJobId) =>
      logDebug("submitStage(" + stage + ")")
      val alreadyTracked =
        waitingStages(stage) || runningStages(stage) || failedStages(stage)
      if (!alreadyTracked) {
        // Ask getMissingParentStages for the parents of this stage, sorted by stage id.
        val missingParents = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missingParents)
        if (missingParents.isEmpty) {
          // No missing parents: the tasks of this stage can be submitted right away.
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          submitMissingTasks(stage, activeJobId)
        } else {
          // Submit every missing parent first, then record this stage in waitingStages.
          missingParents.foreach(submitStage)
          waitingStages += stage
        }
      }
    case None =>
      abortStage(stage, "No active job for stage " + stage.id)
  }
}
/**
 * Collects the parent stages of `stage` whose `isAvailable` is false.
 *
 * The walk starts at `stage.rdd` and follows narrow dependencies from RDD to RDD. Each
 * shuffle dependency is resolved to a stage through `getShuffleMapStage`; that stage is
 * added to the result when it is not available, and the walk does not continue past it.
 *
 * @param stage the stage whose missing parent stages are looked up
 * @return the collected stages, converted from the internal set to a list
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missingParents = new HashSet[Stage]
  val seen = new HashSet[RDD[_]]
  // An explicit stack is maintained here, instead of recursing, to prevent
  // StackOverflowError caused by recursively visiting.
  val pending = new Stack[RDD[_]]

  def visit(rdd: RDD[_]): Unit = {
    // `add` reports whether the RDD was newly inserted, so each RDD is handled once.
    if (seen.add(rdd)) {
      // NOTE(review): presumably a Nil entry means some partition has no cached
      // location, so the parents still need to be inspected; confirm against getCacheLocs.
      if (getCacheLocs(rdd).contains(Nil)) {
        // Inspect every dependency of this RDD.
        rdd.dependencies.foreach {
          case shufDep: ShuffleDependency[_, _, _] =>
            val parentStage = getShuffleMapStage(shufDep, stage.jobId)
            if (!parentStage.isAvailable) {
              missingParents += parentStage
            }
          case narrowDep: NarrowDependency[_] =>
            // Narrow dependency: keep walking through the parent RDD.
            pending.push(narrowDep.rdd)
        }
      }
    }
  }

  // Seed the stack with the RDD of the stage itself.
  pending.push(stage.rdd)
  // Drain the stack, visiting each popped RDD with the local visit() helper.
  while (pending.nonEmpty) {
    visit(pending.pop())
  }
  missingParents.toList
}