Reference links:
https://blog.csdn.net/xueba207/article/details/51135423 (Kafka 0.8.2.2)
http://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html
1. Preface (Kafka 2.0.0)
Kafka supports producer idempotence but offers no consumer-side idempotence, so we keep the consumer's offsets in external storage instead.
We implement a word-count example: Spark Streaming consumes data from Kafka and saves both the word-count results and the offsets to Redis.
Word-count results are stored in a Redis hash: wc_redis word cnt
Offset information is stored in a Redis hash: topic_groupid partition offset
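For example, with topic test and group wsd (the values used in the code below), the two hashes might contain entries like these (all values hypothetical):
wc_redis: hello -> 3, spark -> 2 (field = word, value = count)
test_wsd: 0 -> 120, 1 -> 98 (field = partition, value = offset)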
2. Issues to Consider
1. Saving the word-count results and the offsets to Redis must be consistent: either both are committed or neither is.
Solution: write both through a single jedis.pipeline wrapped in MULTI/EXEC (see the sketch after this list).
2. When redisOffset > kafkaOffset (for example, after the topic has been deleted and recreated, the offset stored in Redis can exceed Kafka's current end offset), the job fails with "numRecords must not be negative", since numRecords = untilOffset - fromOffset; the offsets must be corrected.
Solution: 1) rename the old word-count result key: rename(oldkey, old_tmp)
2) reset every partition's offset to 0, i.e., replay the data from the beginning and write it into oldkey
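A minimal sketch of the consistent write for issue 1, assuming a Redis instance on master:6379 and hypothetical field/value data; the full version is in StreamingKafkaApp below:
import redis.clients.jedis.Jedis

val jedis = new Jedis("master", 6379)     // host/port as in RedisUtils below
val pipeline = jedis.pipelined()
pipeline.multi()                          // start a transaction inside the pipeline
pipeline.hincrBy("wc_redis", "hello", 3L) // business result (hypothetical values)
pipeline.hset("test_wsd", "0", "120")     // offset of partition 0 (hypothetical)
pipeline.exec()                           // both writes commit atomically
pipeline.sync()
jedis.close()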
3. Code Implementation
The implementation consists of the following four classes:
1) RedisUtils: holds the Redis connection settings and hands out connections
2) KafkaOffsetTool: uses a KafkaConsumer to fetch the endOffset of every partition of a given topic
3) MyOffsetsManagerV2: obtains and stores offsets
4) StreamingKafkaApp: the business logic
RedisUtils
package com.wsd.spark.utils

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

/**
 * Redis connection pool helper
 *
 * @author wsd
 * @date 2020/4/26 14:46
 */
object RedisUtils {
  private val poolConfig = new JedisPoolConfig
  poolConfig.setMaxTotal(2000)
  poolConfig.setMaxIdle(1000)
  poolConfig.setTestOnBorrow(true)

  private val pool = new JedisPool(poolConfig, "master", 6379)

  def getJedis: Jedis = pool.getResource
}
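Connections borrowed from the pool must be returned when done; a minimal usage sketch (assuming Redis is reachable on master:6379):
val jedis = RedisUtils.getJedis
try {
  println(jedis.ping()) // expect "PONG" if Redis is reachable
} finally {
  jedis.close()         // returns the connection to the pool
}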
KafkaOffsetTool
package com.wsd.spark.streaming.offset

import java.util.Properties
import java.{lang, util}

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.{PartitionInfo, TopicPartition}

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

/**
 * KafkaOffsetTool
 * Fetches the endOffset of every partition of a given topic.
 *
 * @author wsd
 * @date 2020/4/27 11:01
 */
object KafkaOffsetTool {

  def getLastOffset(topic: String): mutable.Map[TopicPartition, lang.Long] = {
    var partitionToLongMap: mutable.Map[TopicPartition, lang.Long] = null

    val props = new Properties
    props.put("bootstrap.servers", "master:9092,slave1:9092,slave2:9092")
    props.put("group.id", "wsd")
    props.put("enable.auto.commit", "false")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

    var kafkaConsumer: KafkaConsumer[String, String] = null
    try {
      // 1. Create the consumer
      kafkaConsumer = new KafkaConsumer[String, String](props)
      // 2. Look up the topic's partitions and build TopicPartition objects
      val partitionInfos: util.List[PartitionInfo] = kafkaConsumer.partitionsFor(topic)
      val topicPartitions = ListBuffer[TopicPartition]()
      partitionInfos.asScala.foreach { partitionInfo =>
        topicPartitions.append(new TopicPartition(topic, partitionInfo.partition()))
      }
      // 3. Fetch the endOffset of each TopicPartition
      val partitionToLongJavaMap: util.Map[TopicPartition, lang.Long] =
        kafkaConsumer.endOffsets(topicPartitions.asJava)
      partitionToLongMap = partitionToLongJavaMap.asScala
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (kafkaConsumer != null) kafkaConsumer.close()
    }
    partitionToLongMap
  }
}
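A quick way to see what getLastOffset returns (assuming the topic test exists on the brokers configured above):
KafkaOffsetTool.getLastOffset("test").foreach { case (tp, endOffset) =>
  println(s"partition ${tp.partition()} -> endOffset $endOffset")
}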
OffsetsManager
package com.wsd.spark.streaming.offset

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Pipeline

/**
 * Interface for obtaining and storing offsets
 *
 * @author wsd
 * @date 2020/4/27 10:17
 */
trait OffsetsManager {
  def obtainOffsets(topic: String, groupId: String): Map[TopicPartition, Long]
  def storeOffsets(pipeline: Pipeline, offsetRanges: Array[OffsetRange], groupId: String): Unit
}
MyOffsetsManagerV2
package com.wsd.spark.streaming.offset

import java.{lang, util}

import com.wsd.spark.utils.RedisUtils
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Pipeline

import scala.collection.JavaConverters._
import scala.collection.mutable

/**
 * OffsetsManager implementation:
 * 1) obtain offsets
 * 2) store offsets
 *
 * @author wsd
 * @date 2020/4/26 19:56
 */
object MyOffsetsManagerV2 extends OffsetsManager {

  var isReset = false

  /**
   * Obtain offsets from Redis and Kafka.
   *
   * If any redisOffset > kafkaOffset, every partition's offset is reset to 0.
   *
   * @param topic
   * @param groupId
   * @return
   */
  override def obtainOffsets(topic: String, groupId: String): Map[TopicPartition, Long] = {
    var offsets = Map[TopicPartition, Long]() // return value
    // 1.1 Offsets stored in Redis
    val jedis = RedisUtils.getJedis
    val redisMap: util.Map[String, String] = jedis.hgetAll(topic + "_" + groupId)
    jedis.close() // return the connection to the pool
    // 1.2 End offsets reported by Kafka
    val kafkaMap: mutable.Map[TopicPartition, lang.Long] = KafkaOffsetTool.getLastOffset(topic)
    // 2. Compare: if any fromOffset > untilOffset, set isReset = true
    kafkaMap.foreach { case (tp, endOffset) =>
      val redisOffset = redisMap.getOrDefault(tp.partition() + "", "0")
      if (redisOffset.toLong > endOffset) { // toLong, not toInt: offsets can exceed Int range
        isReset = true
      }
    }
    // Turn the Redis offsets into fromOffsets;
    // if isReset, reset every partition's offset to 0
    redisMap.asScala.foreach { case (partition, offsetStr) =>
      val topicPartition = new TopicPartition(topic, partition.toInt)
      var offset = offsetStr
      if (isReset) {
        offset = "0" // reset the offset
      }
      offsets += topicPartition -> offset.toLong
    }
    offsets
  }

  /**
   * Store offsets in Redis through the caller's pipeline.
   *
   * @param offsetRanges
   * @param groupId
   */
  override def storeOffsets(pipeline: Pipeline, offsetRanges: Array[OffsetRange], groupId: String): Unit = {
    offsetRanges.foreach { o =>
      pipeline.hset(o.topic + "_" + groupId, o.partition + "", o.untilOffset + "")
    }
  }
}
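Note the first-run behavior: if Redis has no hash yet for topic_groupid, obtainOffsets returns an empty map, and the direct stream below falls back to auto.offset.reset ("earliest" here). A quick sanity check:
val fromOffsets = MyOffsetsManagerV2.obtainOffsets("test", "wsd")
fromOffsets.foreach { case (tp, offset) => println(s"partition ${tp.partition()} starts at $offset") }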
StreamingKafkaApp
package com.wsd.spark.streaming.kafka

import com.wsd.spark.streaming.offset.MyOffsetsManagerV2
import com.wsd.spark.utils.RedisUtils
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.{Jedis, Pipeline}

/**
 * Spark Streaming consumes Kafka data, with offsets maintained in Redis.
 * Offset storage format: topic_groupid partition offset
 * 1) Offset correction added
 * 2) A pipeline guarantees that business data and offsets are committed together
 *
 * @author wsd
 * @date 2020/4/26 10:59
 */
object StreamingKafkaApp {

  def main(args: Array[String]): Unit = {
    // 1. Create the StreamingContext and set up kafkaParams and topics
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName(this.getClass.getSimpleName)
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val groupId = "wsd"
    val wcRedisKeyName = "wc_redis"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "master:9092,slave1:9092,slave2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("test")

    // 2. Obtain the starting offsets
    val fromOffset: collection.Map[TopicPartition, Long] = MyOffsetsManagerV2.obtainOffsets(topics(0), groupId)

    // 3. Business processing and offset commit
    val stream = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      Subscribe[String, String](topics, kafkaParams, fromOffset))

    stream.foreachRDD { rdd =>
      println("records received: " + rdd.count())
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // latest offset info
        val result = rdd.map(x => (x.value(), 1)).reduceByKey(_ + _).collect() // business logic: word count
        var jedis: Jedis = null
        var pipeline: Pipeline = null
        try {
          // 4. Open a Jedis pipeline
          jedis = RedisUtils.getJedis
          pipeline = jedis.pipelined()
          pipeline.multi() // without multi(), discard() in the catch would throw JedisDataException: DISCARD without MULTI
          // If isReset, move the old results to a tmp key
          if (MyOffsetsManagerV2.isReset) {
            pipeline.rename(wcRedisKeyName, wcRedisKeyName + "_tmp")
            MyOffsetsManagerV2.isReset = false
          }
          // 4.1 Save the computed results
          result.foreach { pair =>
            pipeline.hincrBy(wcRedisKeyName, pair._1, pair._2)
          }
          // 4.2 Save the offsets
          MyOffsetsManagerV2.storeOffsets(pipeline, offsetRanges, groupId)
          // 4.3 Commit the pipeline
          pipeline.exec()
          pipeline.sync()
        } catch {
          case e: Exception =>
            if (pipeline != null) pipeline.discard() // on failure, discard the transaction
            e.printStackTrace()
        } finally {
          if (pipeline != null) pipeline.close()
          if (jedis != null) jedis.close()
        }
      } else {
        println("no data in this batch")
      }
    }

    // 5. Start the job
    ssc.start()
    ssc.awaitTermination()
  }
}
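After a few batches (with some test data produced to the topic), the results can be verified straight from Redis, e.g.:
val jedis = RedisUtils.getJedis
println(jedis.hgetAll("wc_redis")) // word -> count
println(jedis.hgetAll("test_wsd")) // partition -> offset
jedis.close()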