【異常】SparkStreaming拋出Listener SQLListener threw an exception異常

問題描述

SparkStreaming在長時間運行時,偶爾會出現下面的異常:
2018-01-08 18:42:03  [ SparkListenerBus:32824468 ] - [ ERROR ]  Listener SQLListener threw an exception
java.lang.IllegalStateException: Attempted to access garbage collected accumulator 5618419
        at org.apache.spark.util.AccumulatorContext$$anonfun$get$1.apply(AccumulatorV2.scala:268)
        at org.apache.spark.util.AccumulatorContext$$anonfun$get$1.apply(AccumulatorV2.scala:264)
        at scala.Option.map(Option.scala:146)
        at org.apache.spark.util.AccumulatorContext$.get(AccumulatorV2.scala:264)
        at org.apache.spark.util.AccumulatorV2$$anonfun$name$1.apply(AccumulatorV2.scala:90)
        at org.apache.spark.util.AccumulatorV2$$anonfun$name$1.apply(AccumulatorV2.scala:90)
        at scala.Option.orElse(Option.scala:289)
        at org.apache.spark.util.AccumulatorV2.name(AccumulatorV2.scala:90)
        at org.apache.spark.util.AccumulatorV2.toInfo(AccumulatorV2.scala:111)
        at org.apache.spark.sql.execution.ui.SQLListener$$anonfun$onTaskEnd$1.apply(SQLListener.scala:216)
        at org.apache.spark.sql.execution.ui.SQLListener$$anonfun$onTaskEnd$1.apply(SQLListener.scala:216)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
        at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
        at scala.collection.AbstractTraversable.map(Traversable.scala:104)
        at org.apache.spark.sql.execution.ui.SQLListener.onTaskEnd(SQLListener.scala:216)
        at org.apache.spark.scheduler.SparkListenerBus$class.doPostEvent(SparkListenerBus.scala:45)
        at org.apache.spark.scheduler.LiveListenerBus.doPostEvent(LiveListenerBus.scala:36)
        at org.apache.spark.scheduler.LiveListenerBus.doPostEvent(LiveListenerBus.scala:36)
        at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:63)
        at org.apache.spark.scheduler.LiveListenerBus.postToAll(LiveListenerBus.scala:36)
        at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(LiveListenerBus.scala:94)
        at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:79)
        at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(LiveListenerBus.scala:79)
        at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
        at org.apache.spark.scheduler.LiveListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(LiveListenerBus.scala:78)
        at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1279)
        at org.apache.spark.scheduler.LiveListenerBus$$anon$1.run(LiveListenerBus.scala:77)

通過Spark源碼分析異常

SparkContext初始化的時候會創建LiveListenerBus,它是用來處理Spark所提交事件的事件隊列,比如Job的啓動停止,Task的啓動停止,Stage的提交,Block的添加刪除等等
SparkContext初始化LiveListenerBus啓動源碼如下:
// Non-contiguous excerpts from SparkContext: declaration, accessor, creation and start of the
// listener bus. NOTE(review): the trailing number on each line is presumably its line number
// in SparkContext.scala — confirm against the Spark version in use.
private var _listenerBus: LiveListenerBus = _ // 195
private[spark] def listenerBus: LiveListenerBus = _listenerBus // 248
_listenerBus = new LiveListenerBus(_conf) // 417
listenerBus.start(this, _env.metricsSystem) // 2393
LiveListenerBus的start方法源碼如下:
// Queue of pending SparkListenerEvents; its capacity is read from the
// LISTENER_BUS_EVENT_QUEUE_CAPACITY entry of the configuration.
private val eventQueue =
    new LinkedBlockingQueue[SparkListenerEvent](conf.get(LISTENER_BUS_EVENT_QUEUE_CAPACITY))
/**
 * Starts the listener bus: remembers the SparkContext, registers the bus's `metrics`
 * with the metrics system and starts `listenerThread`.
 *
 * @param sc the SparkContext stored in `sparkContext`
 * @param metricsSystem the metrics system that `metrics` is registered with
 * @throws IllegalStateException if the bus has already been started
 */
def start(sc: SparkContext, metricsSystem: MetricsSystem): Unit = {
    // Only the first call flips `started` from false to true; any later call throws.
    if (started.compareAndSet(false, true)) {
      sparkContext = sc
      metricsSystem.registerSource(metrics)
      listenerThread.start()
    } else {
      throw new IllegalStateException(s"$name already started!")
    }
  }
  // Daemon thread that repeatedly takes one event from `eventQueue` and posts it to all
  // listeners via `postToAll`, timing each dispatch with `metrics.eventProcessingTime`.
  // The body runs inside Utils.tryOrStopSparkContext(sparkContext).
  private val listenerThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = Utils.tryOrStopSparkContext(sparkContext) {
      LiveListenerBus.withinListenerThread.withValue(true) {
        val timer = metrics.eventProcessingTime
        while (true) {
          // NOTE(review): presumably blocks until a permit is released by the code that
          // enqueues events or stops the bus — that code is not shown here; confirm.
          eventLock.acquire()
          // Mark that an event is being processed; reset in the outer `finally` below.
          self.synchronized {
            processingEvent = true
          }
          try {
            // Non-blocking poll: returns null when the queue is empty.
            val event = eventQueue.poll
            if (event == null) {
              // Get out of the while loop and shutdown the daemon thread
              if (!stopped.get) {
                throw new IllegalStateException("Polling `null` from eventQueue means" +
                  " the listener bus has been stopped. So `stopped` must be true")
              }
              return
            }
            // Time the dispatch; the timer context is stopped even if postToAll throws.
            val timerContext = timer.time()
            try {
              postToAll(event)
            } finally {
              timerContext.stop()
            }
          } finally {
            self.synchronized {
              processingEvent = false
            }
          }
        }
      }
    }
  }
可以看到,SparkContext初始化的時候會調用LiveListenerBus的start方法來啓動線程,該線程的作用是去取LiveListenerBus初始化時創建的LinkedBlockingQueue隊列中的Event事件來執行相應的作業。

從異常棧中我們可以看到是在執行SQLListener的onTaskEnd方法時報的錯,下面是SQLListener的源碼:
  /**
   * Handles a task-end event: converts each of the task's external accumulators with
   * `toInfo` and passes the results to `updateTaskAccumulatorValues` with
   * `finishTask = true`. Does nothing when the event carries no task metrics.
   *
   * @param taskEnd the task-end event delivered by the listener bus
   */
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    if (taskEnd.taskMetrics != null) {
      updateTaskAccumulatorValues(
        taskEnd.taskInfo.taskId,
        taskEnd.stageId,
        taskEnd.stageAttemptId,
        // This `toInfo` call is the SQLListener.scala:216 frame in the stack trace above;
        // the IllegalStateException is thrown from inside it.
        taskEnd.taskMetrics.externalAccums.map(a => a.toInfo(Some(a.value), None)),
        finishTask = true)
    }
  }
該方法調用了AccumulatorV2的toInfo()方法,然後toInfo()方法調用了name()方法,name()方法通過Accumulator的id去取originals這個ConcurrentHashMap中的對象,但是其中保存的是弱引用,GC會對其指向的對象進行回收,所以,當發生GC(例如內存不足時)且累加器已不再被強引用時,部分弱引用指向的對象就會被回收,進而get的時候取不到數據,然後拋出異常:
throw new IllegalStateException(s"Attempted to access garbage collected accumulator $id")
// Maps an accumulator id to a weak reference to the accumulator, so an entry can outlive
// the accumulator object it refers to once that object has been garbage collected.
private val originals = new ConcurrentHashMap[Long, jl.ref.WeakReference[AccumulatorV2[_, _]]]
/**
 * Builds an AccumulableInfo for this accumulator from its id, name and the given values.
 * The info is flagged internal when the name starts with InternalAccumulator.METRICS_PREFIX.
 *
 * @param update passed through as the info's update
 * @param value passed through as the info's value
 * @return the AccumulableInfo describing this accumulator
 */
private[spark] def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    // `name` is evaluated here (AccumulatorV2.name in the stack trace above).
    val isInternal = name.exists(_.startsWith(InternalAccumulator.METRICS_PREFIX))
    new AccumulableInfo(id, name, update, value, isInternal, countFailedValues)
  }
/**
 * Returns the accumulator's name, if any.
 *
 * When `atDriverSide` is true and this instance's metadata has no name, the name is looked up
 * through AccumulatorContext.get(id); that lookup is the source of the IllegalStateException
 * in the stack trace above. Otherwise only this instance's own metadata is consulted.
 */
final def name: Option[String] = {
    // NOTE(review): presumably fails if metadata was never set — helper not shown; confirm.
    assertMetadataNotNull()

    if (atDriverSide) {
      // `orElse` evaluates the AccumulatorContext lookup only when metadata.name is empty.
      metadata.name.orElse(AccumulatorContext.get(id).flatMap(_.metadata.name))
    } else {
      metadata.name
    }
  }
 /**
  * Looks up the accumulator stored in `originals` under the given id.
  *
  * @param id the accumulator id
  * @return None when no entry exists for the id, otherwise Some(accumulator)
  * @throws IllegalStateException when an entry exists but its weak reference has been
  *                               cleared, i.e. the accumulator was garbage collected
  */
 def get(id: Long): Option[AccumulatorV2[_, _]] = {
    // Option(...) turns a missing map entry (null) into None, so `map` is skipped.
    Option(originals.get(id)).map { ref =>
      // Since we are storing weak references, we must check whether the underlying data is valid.
      val acc = ref.get
      if (acc eq null) {
        throw new IllegalStateException(s"Attempted to access garbage collected accumulator $id")
      }
      acc
    }
  }
然後拋出的異常會被ListenerBus類的postToAll()方法捕獲到:
  /**
   * Delivers the event to every registered listener in turn, timing the call for listeners
   * that have a timer attached.
   *
   * A non-fatal exception thrown by a listener is caught and logged with logError, and the
   * loop then continues with the next listener; this is where the
   * "Listener SQLListener threw an exception" log line is produced.
   *
   * @param event the event to deliver
   */
  def postToAll(event: E): Unit = {
    // JavaConverters can create a JIterableWrapper if we use asScala.
    // However, this method will be called frequently. To avoid the wrapper cost, here we use
    // Java Iterator directly.
    val iter = listenersPlusTimers.iterator
    while (iter.hasNext) {
      val listenerAndMaybeTimer = iter.next()
      val listener = listenerAndMaybeTimer._1
      val maybeTimer = listenerAndMaybeTimer._2
      // null stands for "no timer for this listener"; checked again in `finally`.
      val maybeTimerContext = if (maybeTimer.isDefined) {
        maybeTimer.get.time()
      } else {
        null
      }
      try {
        doPostEvent(listener, event)
      } catch {
        // Only non-fatal errors are caught and logged; they are not rethrown.
        case NonFatal(e) =>
          logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e)
      } finally {
        if (maybeTimerContext != null) {
          maybeTimerContext.stop()
        }
      }
    }
  }
這個地方就出現了我們上面的異常信息。
最後我們可以看到,這個異常被postToAll()捕獲後只是記錄了一條錯誤日誌,並沒有繼續向上拋出,所以它其實是不影響我們系統的正常運行的。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章