Consuming Kafka messages from Spark -- SparkKafkaDriverHAZooKeeperOps
Message consumption is one of the most important pieces of a streaming job.
When we use Spark for near-real-time computation, most scenarios involve talking to Kafka, so here is a summary of the points to watch when consuming Kafka from Spark. The code follows below.
package com.aura.bigdata.spark.scala.streaming.p1
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.JavaConversions
/**
 * Use ZooKeeper to manage the offsets that the Spark driver has read.
 * The offsets of the corresponding Kafka topic are saved under the path below.
 *
 * Convention for the offset storage path:
 *   /xxxxx/offsets/topic/group/partition/
 *     0
 *     1
 *     2
 *
 * ZooKeeper quorum: bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka
 */
object _07SparkKafkaDriverHAZooKeeperOps {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.project-spark").setLevel(Level.WARN)
    if (args == null || args.length < 4) {
      println(
        """
          |Parameter Errors! Usage: <batchInterval> <zkQuorum> <groupId> <topics>
          |batchInterval : batch interval in seconds
          |zkQuorum      : ZooKeeper quorum url
          |groupId       : consumer group id
          |topics        : topic(s) to read, comma separated
        """.stripMargin)
      System.exit(-1)
    }
    val Array(batchInterval, zkQuorum, group, topic) = args
    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
      "auto.offset.reset" -> "smallest"
    )
    val conf = new SparkConf().setMaster("local[2]").setAppName("_07SparkKafkaDriverHAZooKeeperOps")
    def createFunc(): StreamingContext = {
      val ssc = new StreamingContext(conf, Seconds(batchInterval.toLong))
      // read the data from Kafka
      val messages = createMessage(ssc, kafkaParams, topic, group)
      // business processing
      messages.foreachRDD((rdd, bTime) => {
        if (!rdd.isEmpty()) {
          println("###########################->RDD count: " + rdd.count)
          println("###########################->batch time: " + bTime)
          // all business logic has to happen here; the processing is exactly the same as ordinary RDD operations
        }
        // once the batch has been processed, write the offsets back to ZooKeeper
        storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, topic, group)
      })
      ssc
    }
    // high-availability style startup: reuse the active StreamingContext or create a new one so the driver can recover from a failure
    val ssc = StreamingContext.getActiveOrCreate(createFunc _)
    ssc.start()
    ssc.awaitTermination()
  }
  def storeOffsets(offsetRanges: Array[OffsetRange], topic: String, group: String): Unit = {
    val zkTopicPath = s"/offsets/${topic}/${group}"
    for (range <- offsetRanges) { // each range holds the offsets consumed for one partition of the current rdd
      val path = s"${zkTopicPath}/${range.partition}"
      ensureZKExists(path)
      client.setData().forPath(path, (range.untilOffset + "").getBytes())
    }
  }
  /*
   * Convention for the offset storage path in ZooKeeper:
   *   /xxxxx/offsets/topic/group/partition/
   *     0
   *     1
   *     2
   */
  def createMessage(ssc: StreamingContext, kafkaParams: Map[String, String], topic: String, group: String): InputDStream[(String, String)] = {
    // read the saved offsets from ZooKeeper; returns fromOffsets plus a flag marking whether this is the first run
    val (fromOffsets, flag) = getFromOffsets(topic, group)
    var message: InputDStream[(String, String)] = null
    if (!flag) {
      // ZooKeeper already holds offsets this SparkStreaming program consumed from Kafka,
      // so every run after the first reads Kafka through this overload, starting from the saved offsets
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
      message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
    } else {
      // first run: no offsets stored yet, subscribe to the topics directly
      message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topic.split(",").toSet)
    }
    message
  }
  // read the Kafka offsets stored in ZooKeeper
  def getFromOffsets(topic: String, group: String): (Map[TopicAndPartition, Long], Boolean) = {
    // /xxxxx/offsets/topic/group/partition/
    val zkTopicPath = s"/offsets/${topic}/${group}"
    ensureZKExists(zkTopicPath)
    // if offsets exist, read them: each child znode is a partition id and its data is the saved offset
    val offsets = for {p <- JavaConversions.asScalaBuffer(
      client.getChildren.forPath(zkTopicPath))} yield {
      val offset = client.getData.forPath(s"${zkTopicPath}/${p}")
      (TopicAndPartition(topic, p.toInt), new String(offset).toLong)
    }
    if (!offsets.isEmpty) {
      (offsets.toMap, false)
    } else {
      (offsets.toMap, true)
    }
  }
  def ensureZKExists(zkTopicPath: String): Unit = {
    if (client.checkExists().forPath(zkTopicPath) == null) { // the znode has never been written, create it
      client.create().creatingParentsIfNeeded().forPath(zkTopicPath)
    }
  }
  // Curator client built in an initializer block; Curator is to raw ZooKeeper roughly what SpringMVC/Struts2 is to servlets
  val client = {
    val client = CuratorFrameworkFactory.builder()
      .namespace("mykafka") // the namespace acts as a root directory prefixed to every path used above
      .connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .build()
    client.start()
    client
  }
}
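After a few batches have run, it is worth checking what was actually written to ZooKeeper. Below is a minimal standalone sketch of such a check, assuming the same Curator namespace (mykafka), quorum and path convention as the job above; the object name CheckStoredOffsets and its two arguments are made up here for illustration and are not part of the job itself.
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import scala.collection.JavaConversions
object CheckStoredOffsets {
  def main(args: Array[String]): Unit = {
    val Array(topic, group) = args
    // same namespace and quorum as the streaming job above (assumed unchanged)
    val client = CuratorFrameworkFactory.builder()
      .namespace("mykafka")
      .connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .build()
    client.start()
    val path = s"/offsets/${topic}/${group}"
    if (client.checkExists().forPath(path) == null) {
      println(s"no offsets stored yet under ${path}")
    } else {
      // one child znode per partition; its data is the offset saved by storeOffsets
      for (p <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
        val offset = new String(client.getData.forPath(s"${path}/${p}")).toLong
        println(s"partition ${p} -> untilOffset ${offset}")
      }
    }
    client.close()
  }
}
The stored value is range.untilOffset, i.e. the next offset to read, which is exactly what createDirectStream expects in its fromOffsets map on the following start.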
Summary
When using Spark with Kafka, take special care to manage the message offsets yourself: the job above only writes the offsets back to ZooKeeper after a batch has been processed, so a restarted driver resumes from the last committed position instead of relying on Kafka's automatic offset tracking.