Managing Kafka Offsets When Consuming Data with Spark Streaming

Reference links:

https://blog.csdn.net/xueba207/article/details/51135423 (Kafka 0.8.2.2)

http://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html

 

1. Introduction (Kafka 2.0.0)

Kafka supports idempotence on the producer side but not on the consumer side, so we use external storage to maintain the consumer offsets ourselves.

The implementation below uses word count as the example.

Spark Streaming consumes data from Kafka and stores both the word-count result and the offsets in Redis.

Word-count result format in Redis (hash key / field / value): wc_redis word cnt

Offset format in Redis (hash key / field / value): topic_groupid partition offset
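
As an illustration, assuming the topic "test" and the group id "wsd" used later in this post (the word "hello" and the numbers are made-up sample values), the two hashes can be written and read with Jedis like this:

import redis.clients.jedis.Jedis

val jedis = new Jedis("master", 6379)
// word-count hash: key "wc_redis", field = word, value = count
jedis.hset("wc_redis", "hello", "3")
// offset hash: key "<topic>_<groupId>", here "test_wsd"; field = partition, value = offset
jedis.hset("test_wsd", "0", "120")
println(jedis.hgetAll("test_wsd")) // e.g. {0=120}
jedis.close()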

2. Issues to Consider

1. Writing the business result and the offsets to Redis must be consistent (atomic).

Solution: use a Jedis pipeline wrapped in MULTI/EXEC (see the sketch after this list).

2. When the offset stored in Redis is greater than the latest offset in Kafka (for example, after the topic has been deleted and recreated), Spark throws "numRecords must not be negative", so the offsets have to be corrected.

Solution: 1) rename the old word-count result: rename(oldkey, old_tmp)

          2) reset the offset of every partition to 0, i.e. replay all the data and write it back into oldkey
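
A minimal sketch of the atomic-write pattern from issue 1; the same pattern is used in StreamingKafkaApp below (key names and values here are illustrative only):

import redis.clients.jedis.Jedis

val jedis = new Jedis("master", 6379)
val pipeline = jedis.pipelined()
pipeline.multi()                          // open a transaction on the pipeline
pipeline.hincrBy("wc_redis", "hello", 3L) // business result
pipeline.hset("test_wsd", "0", "120")     // offset for partition 0
pipeline.exec()                           // both writes commit together, or neither does
pipeline.sync()
jedis.close()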

3. Code Implementation

Four classes are used:

1) RedisUtils: configures the Redis connection and hands out connections from the pool

2) KafkaOffsetTool: uses a KafkaConsumer to get the endOffset of each partition of a given topic

3) OffsetsManager / MyOffsetsManagerV2: trait and implementation for obtaining and storing offsets

4) StreamingKafkaApp: the business logic

RedisUtils

package com.wsd.spark.utils
import redis.clients.jedis.{JedisPool, JedisPoolConfig}

/**
  * Redis connection pool utility
  *
  * @author wsd
  * @date 2020/4/26 14:46
  */
object RedisUtils {

  private val poolConfig = new JedisPoolConfig
  poolConfig.setMaxTotal(2000)
  poolConfig.setMaxIdle(1000)
  poolConfig.setTestOnBorrow(true)
  private val pool = new JedisPool(poolConfig, "master", 6379)

  def getJedis = pool.getResource
}
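
A quick usage note (a simple connectivity check, not part of the application code): getJedis borrows a connection from the pool, and close() returns it.

val jedis = RedisUtils.getJedis
try {
  println(jedis.ping()) // should print PONG if Redis is reachable
} finally {
  jedis.close() // returns the connection to the pool
}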

KafkaOffsetTool

package com.wsd.spark.streaming.offset

import java.util.Properties
import java.{lang, util}

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.{PartitionInfo, TopicPartition}

import scala.collection.mutable.ListBuffer
import scala.collection.{JavaConverters, mutable}

/**
  * KafkaOffsetTool
  * Gets the endOffset of each partition of a given topic
  *
  * @author wsd
  * @date 2020/4/27 11:01
  */
object KafkaOffsetTool {

  def getLastOffset(topic: String): mutable.Map[TopicPartition, lang.Long] = {

    var partitionToLongMap: mutable.Map[TopicPartition, lang.Long] = null

    val props = new Properties
    props.put("bootstrap.servers", "master:9092,slave1:9092,slave2:9092")
    props.put("group.id", "wsd")
    props.put("enable.auto.commit", "false")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

    var kafkaConsumer: KafkaConsumer[String, String] = null
    var topicPartitions: ListBuffer[TopicPartition] = null
    try {
      // 1. Initialize the KafkaConsumer
      kafkaConsumer = new KafkaConsumer[String, String](props)
      topicPartitions = ListBuffer[TopicPartition]()
      // 2. Get the topic's partitions and build TopicPartition objects
      val partitionInfos: util.List[PartitionInfo] = kafkaConsumer.partitionsFor(topic)
      import scala.collection.JavaConversions._
      partitionInfos.foreach(partitionInfo => {
        topicPartitions.append(new TopicPartition(topic, partitionInfo.partition()))
      })
      // 3. Get the endOffset for each TopicPartition
      val partitionToLongJavaMap: util.Map[TopicPartition, lang.Long] = kafkaConsumer.endOffsets(topicPartitions)
      partitionToLongMap = JavaConverters.mapAsScalaMap(partitionToLongJavaMap)

    } catch {
      case e: Exception =>
        e.printStackTrace()
    } finally {
      if (kafkaConsumer != null) kafkaConsumer.close()
    }
    partitionToLongMap
  }
}
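
For example, for the topic "test" used later in this post, the tool can be called like this to print the end offset of each partition:

val endOffsets = KafkaOffsetTool.getLastOffset("test")
endOffsets.foreach { case (tp, offset) =>
  println(s"${tp.topic()}-${tp.partition()} endOffset = $offset")
}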

OffsetsManager and MyOffsetsManagerV2

package com.wsd.spark.streaming.offset

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Pipeline

/**
  * Interface for obtaining and storing offsets
  * @author wsd
  * @date 2020/4/27 10:17
  */
trait OffsetsManager {

  def obtainOffsets(topic: String, groupId: String): Map[TopicPartition, Long]

  def storeOffsets(pipeline: Pipeline, offsetRanges: Array[OffsetRange], groupId: String): Unit

}
package com.wsd.spark.streaming.offset

import java.{lang, util}

import com.wsd.spark.utils.RedisUtils
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Pipeline

import scala.collection.JavaConversions._
import scala.collection.mutable

/**
  * Implementation of OffsetsManager
  * 1) obtain offsets
  * 2) store offsets
  *
  * @author wsd
  * @date 2020/4/26 19:56
  */
object MyOffsetsManagerV2 extends OffsetsManager {

  var isReset = false
  
  /**
    * Obtains offsets from Redis and Kafka.
    *
    * If redisOffset > kafkaOffset, every partition's offset is reset to 0.
    *
    * @param topic
    * @param groupId
    * @return
    */
  override def obtainOffsets(topic: String, groupId: String): Map[TopicPartition, Long] = {
    var offsets = Map[TopicPartition, Long]() // the offsets to return
    val jedis = RedisUtils.getJedis
    // 1.1 Offsets previously stored in Redis
    val redisMap: util.Map[String, String] = jedis.hgetAll(topic + "_" + groupId)
    // 1.2 Latest (end) offsets reported by Kafka
    val kafkaMap: mutable.Map[TopicPartition, lang.Long] = KafkaOffsetTool.getLastOffset(topic)
    // 2. Compare: if any Redis offset exceeds the Kafka end offset, mark isReset
    kafkaMap.foreach(x => {
      val redisOffset = redisMap.getOrDefault(x._1.partition() + "", "0")
      if (redisOffset.toLong > x._2) {
        isReset = true
      }
    })
    // Convert the Redis offsets into fromOffsets;
    // if isReset, reset every partition's offset to 0
    redisMap.foreach(pair => {
      val topicPartition = new TopicPartition(topic, pair._1.toInt)
      var offset = pair._2
      if (isReset) {
        // reset the offset
        offset = "0"
      }
      offsets += topicPartition -> offset.toLong
    })
    jedis.close() // return the connection to the pool
    offsets
  }

  /**
    * Stores offsets to Redis via the given pipeline
    *
    * @param offsetRanges
    * @param groupId
    */
  override def storeOffsets(pipeline: Pipeline, offsetRanges: Array[OffsetRange], groupId: String): Unit = {
    offsetRanges.foreach(o => {
      pipeline.hset(o.topic + "_" + groupId, o.partition + "", o.untilOffset + "")
    })
  }
}

StreamingKafkaApp

package com.wsd.spark.streaming.kafka

import com.wsd.spark.streaming.offset.MyOffsetsManagerV2
import com.wsd.spark.utils.RedisUtils
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.{Jedis, Pipeline}

/**
  * Spark Streaming consumes Kafka data and maintains offsets in Redis.
  * Offset storage format: topic_groupid partition offset
  * 1) offset correction when the stored offset exceeds the Kafka end offset
  * 2) a Jedis pipeline (MULTI/EXEC) keeps the business result and the offset commit consistent
  *
  * @author wsd
  * @date 2020/4/26 10:59
  */
object StreamingKafkaApp {
  def main(args: Array[String]): Unit = {

    // 1. Create the StreamingContext and set up kafkaParams and topics
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val groupId = "wsd"
    val wcRedisKeyName = "wc_redis"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "master:9092,slave1:9092,slave2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "wsd",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("test")
    // 2. Obtain the starting offsets
    val fromOffset: collection.Map[TopicPartition, Long] = MyOffsetsManagerV2.obtainOffsets(topics(0), groupId)
    // 3. Business processing and offset commit
    val stream = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      Subscribe[String, String](topics, kafkaParams, fromOffset))

    stream.foreachRDD(rdd => {
      println("Records received in this batch: " + rdd.count())
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // offset ranges of this batch
        val result = rdd.map(x => (x.value(), 1)).reduceByKey(_ + _).collect() // business logic: word count
        var jedis: Jedis = null
        var pipeline: Pipeline = null
        try {
          // 4. Open a Jedis pipeline
          jedis = RedisUtils.getJedis
          pipeline = jedis.pipelined()
          pipeline.multi() // start a transaction; without it, discard() throws JedisDataException: DISCARD without MULTI
          // if isReset, move the old word-count result to a tmp key
          if (MyOffsetsManagerV2.isReset) {
            pipeline.rename(wcRedisKeyName, wcRedisKeyName + "_tmp")
            MyOffsetsManagerV2.isReset = false
          }

          // 4.1 Save the word-count result
          result.foreach(pair => {
            pipeline.hincrBy(wcRedisKeyName, pair._1, pair._2)
          })
          // 4.2 Save the offsets
          MyOffsetsManagerV2.storeOffsets(pipeline, offsetRanges, groupId)
          // 4.3 Commit the pipeline
          pipeline.exec()
          pipeline.sync()
        } catch {
          case e: Exception =>
            if (pipeline != null) pipeline.discard() // on failure, discard the transaction
            e.printStackTrace()
        }
        finally {
          if (pipeline != null) pipeline.close()
          if (jedis != null) jedis.close()
        }
      }
      else {
        println("No data in this batch")
      }
    })
    // 5. Start the streaming job
    ssc.start()
    ssc.awaitTermination()
  }
}

 
