問題描述
SparkStreaming在長時間運行時,偶爾會出現下面的異常:
2018-01-08 18:42:03 [ SparkListenerBus:32824468 ] - [ ERROR ] Listener SQLListener threw an exception
java.lang.IllegalStateException: Attempted to access garbage collected accumulator 5618419
at org.apache.spark.util.AccumulatorContext$$anonfun$get$1.apply(AccumulatorV2.scala:268)
at org.apache.spark.util.AccumulatorContext$$anonfun$get$1.apply(AccumulatorV2.scala:264)
at scala.Option.map(Option.scala:146)
at org.apache.spark.util.AccumulatorContext$.get(AccumulatorV2.scala:264)
at org.apache.spark.util.AccumulatorV2$$anonfun$name$1.apply(AccumulatorV2.scala:90)
at org.apache.spark.util.AccumulatorV2$$anonfun$name$1.apply(AccumulatorV2.scala:90)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.util.AccumulatorV2.name(AccumulatorV2.scala:90)
at org.apache.spark.util.AccumulatorV2.toInfo(AccumulatorV2.scala:111)
at org.apache.spark.sql.execution.ui.SQLListener$$anonfun$onTaskEnd$1.apply(SQLListener.scala:216)
at org.apache.spark.sql.execution.ui.SQLListener$$anonfun$onTaskEnd$1.apply(SQLListener.scala:216)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.ui.SQLListener.onTaskEnd(SQLListener.scala:216)
at org.apache.spark.scheduler.SparkListenerBus$class.doPostEvent(SparkListenerBus.scala:45)
at org.apache.spark.scheduler.LiveListenerBus.doPostEvent(LiveListenerBus.scala:36)
at org.apache.spark.scheduler.LiveListenerBus.doPostEvent(LiveListenerBus.scala:36)
at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:63)
at org.apache.spark.scheduler.LiveListenerBus.postToAll(LiveListenerBus.scala:36)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(LiveListenerBus.scala:94)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:79)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:79)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(LiveListenerBus.scala:78)
at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1279)
at org.apache.spark.scheduler.LiveListenerBus$$anon$1.run(LiveListenerBus.scala:77)
通過Spark源碼異常分析
SparkContext初始化的時候會創建LiveListenerBus,它是用執行Spark提交事件的事件隊列,比如Job的啓動停止,Task的啓動停止,stage的提交,block添加刪除等等
SparkContext初始化LiveListenerBus啓動源碼如下:
private var _listenerBus: LiveListenerBus = _ // 195
private[spark] def listenerBus: LiveListenerBus = _listenerBus // 248
_listenerBus = new LiveListenerBus(_conf) // 417
listenerBus.start(this, _env.metricsSystem) // 2393
LiveListenerBus的start方法源碼如下:private val eventQueue =
new LinkedBlockingQueue[SparkListenerEvent](conf.get(LISTENER_BUS_EVENT_QUEUE_CAPACITY))
def start(sc: SparkContext, metricsSystem: MetricsSystem): Unit = {
if (started.compareAndSet(false, true)) {
sparkContext = sc
metricsSystem.registerSource(metrics)
listenerThread.start()
} else {
throw new IllegalStateException(s"$name already started!")
}
}
private val listenerThread = new Thread(name) {
setDaemon(true)
override def run(): Unit = Utils.tryOrStopSparkContext(sparkContext) {
LiveListenerBus.withinListenerThread.withValue(true) {
val timer = metrics.eventProcessingTime
while (true) {
eventLock.acquire()
self.synchronized {
processingEvent = true
}
try {
val event = eventQueue.poll
if (event == null) {
// Get out of the while loop and shutdown the daemon thread
if (!stopped.get) {
throw new IllegalStateException("Polling `null` from eventQueue means" +
" the listener bus has been stopped. So `stopped` must be true")
}
return
}
val timerContext = timer.time()
try {
postToAll(event)
} finally {
timerContext.stop()
}
} finally {
self.synchronized {
processingEvent = false
}
}
}
}
}
}
可以看到,SparkContext初始化的時候會調用LivListenerBus的start方法來啓動線程,該線程的作用是去取LiveListenerBus初始化時創建的LinkedBlockingQueue隊列中的Event事件來執行相應的作業。
從異常棧中我們可以看到是在執行SQLListener的onTaskEnd方法時報的錯,下面是SQLListener的源碼:
override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
if (taskEnd.taskMetrics != null) {
updateTaskAccumulatorValues(
taskEnd.taskInfo.taskId,
taskEnd.stageId,
taskEnd.stageAttemptId,
taskEnd.taskMetrics.externalAccums.map(a => a.toInfo(Some(a.value), None)),
finishTask = true)
}
}
該方法調用了AccumulatroV2的name的toInfo()方法,然後toInfo()方法調用了name()方法,name()方法通過Accumulator的id去取originals的ConcurrentHashMap中的對象,但是這個是一個弱引用,GC回去進行回收的,所以,當內存不足時就會出現部分弱應用被回收,進而get的時候取不到數據,然後拋出異常:throw new IllegalStateException(s"Attempted to access garbage collected accumulator $id")
private val originals = new ConcurrentHashMap[Long, jl.ref.WeakReference[AccumulatorV2[_, _]]]
private[spark] def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
val isInternal = name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))
new AccumulableInfo(id, name, update, value, isInternal, countFailedValues)
}
final def name: Option[String] = {
assertMetadataNotNull()
if (atDriverSide) {
metadata.name.orElse(AccumulatorContext.get(id).flatMap(_.metadata.name))
} else {
metadata.name
}
}
def get(id: Long): Option[AccumulatorV2[_, _]] = {
Option(originals.get(id)).map { ref =>
// Since we are storing weak references, we must check whether the underlying data is valid.
val acc = ref.get
if (acc eq null) {
throw new IllegalStateException(s"Attempted to access garbage collected accumulator $id")
}
acc
}
}
def postToAll(event: E): Unit = {
// JavaConverters can create a JIterableWrapper if we use asScala.
// However, this method will be called frequently. To avoid the wrapper cost, here we use
// Java Iterator directly.
val iter = listenersPlusTimers.iterator
while (iter.hasNext) {
val listenerAndMaybeTimer = iter.next()
val listener = listenerAndMaybeTimer._1
val maybeTimer = listenerAndMaybeTimer._2
val maybeTimerContext = if (maybeTimer.isDefined) {
maybeTimer.get.time()
} else {
null
}
try {
doPostEvent(listener, event)
} catch {
case NonFatal(e) =>
logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e)
} finally {
if (maybeTimerContext != null) {
maybeTimerContext.stop()
}
}
}
}
這個地方就出現了我們上面的異常信息。我們可以看到最後,這個異常其實是不影響我們系統的正常運行的。