Points to watch when doing real-time computation with Spark Streaming and Kafka

Consuming Kafka messages from Spark -- SparkKafkaDriverHAZooKeeperOps

Consuming messages correctly is the most important part of stream processing.

When we use Spark for near-real-time computation, the data source is very often Kafka. Below is a summary of the points to watch when consuming Kafka from Spark, followed by the code.
package com.aura.bigdata.spark.scala.streaming.p1

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.JavaConversions

/**
  * Use ZooKeeper to manage the offsets that the Spark Streaming driver has consumed.
  * The offsets of the corresponding Kafka topic are saved under the path below.
  *
  * Convention: offsets are stored under
  * /xxxxx/offsets/topic/group/partition/
  *     0
  *     1
  *     2
  *
  * bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka
  */
object _07SparkKafkaDriverHAZooKeeperOps {

    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.project-spark").setLevel(Level.WARN)

        if(args == null || args.length < 4) {
            println(
                """
                  |Parameter Errors! Usage: <batchInterval> <zkQuorum> <groupId> <topics>
                  |batchInterval        : batch interval in seconds
                  |zkQuorum             : ZooKeeper quorum url
                  |groupId              : consumer group id
                  |topics               : topics to read
                """.stripMargin)
            System.exit(-1)
        }
        val Array(batchInterval, zkQuorum, group, topic) = args
        val kafkaParams = Map[String, String](
            "bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
            //"smallest" is the old (0.8) consumer's equivalent of "earliest"
            "auto.offset.reset" -> "smallest"
        )

        val conf = new SparkConf().setMaster("local[2]").setAppName("_07SparkKafkaDriverHAZooKeeperOps")

        def createFunc():StreamingContext = {
            val ssc = new StreamingContext(conf, Seconds(batchInterval.toLong))
            //read the data from Kafka
            val messages = createMessage(ssc, kafkaParams, topic, group)
            //business logic
            messages.foreachRDD((rdd, bTime) => {
                if(!rdd.isEmpty()) {
                    println("###########################->RDD count: " + rdd.count)
                    println("###########################->batch time: " + bTime)
                    //all business logic must go here; the processing is exactly the same as ordinary RDD operations
                }
                //write the offsets back after processing; because this happens after the work is done,
                //a failure in between means the batch is reprocessed on restart (at-least-once semantics)
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, topic, group)
            })
            ssc
        }

        //HA-style startup: reuse the active StreamingContext if one exists, otherwise create it,
        //so that the job can recover after a failure
        val ssc = StreamingContext.getActiveOrCreate(createFunc _)
        ssc.start()
        ssc.awaitTermination()
    }

    def storeOffsets(offsetRanges: Array[OffsetRange], topic: String, group: String): Unit = {
        val zkTopicPath = s"/offsets/${topic}/${group}"
        for (range <- offsetRanges) {//each OffsetRange holds the offsets consumed from one partition of this RDD
            val path = s"${zkTopicPath}/${range.partition}"
            ensureZKExists(path)
            client.setData().forPath(path, (range.untilOffset + "").getBytes())
        }
    }
    /*
      * Convention: offsets are stored in ZooKeeper under
      * /xxxxx/offsets/topic/group/partition/
      *     0
      *     1
      *     2
     */
    def createMessage(ssc:StreamingContext, kafkaParams:Map[String, String], topic:String, group:String):InputDStream[(String, String)] = {
        //read the saved offsets from ZooKeeper; the return value is fromOffsets plus a flag
        val (fromOffsets, flag)  = getFromOffsets(topic, group)

        var message:InputDStream[(String, String)] = null
        if(!flag) {
            //ZooKeeper already holds offsets consumed by this Spark Streaming group
            //every run except the first reads Kafka through this API, starting from the saved offsets
            val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
            message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        } else {
            //first run: no offsets saved yet, start from the position given by auto.offset.reset
            message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topic.split(",").toSet)
        }
        message
    }

    //read the Kafka offsets that were stored in ZooKeeper
    def getFromOffsets(topic:String, group:String): (Map[TopicAndPartition, Long], Boolean) = {
        ///xxxxx/offsets/topic/group/partition/
        val zkTopicPath = s"/offsets/${topic}/${group}"
        ensureZKExists(zkTopicPath)

        //if child nodes exist, read the stored offset of each partition
        val offsets = for{p <- JavaConversions.asScalaBuffer(
            client.getChildren.forPath(zkTopicPath))} yield {
            //p is the partition id (the name of the child znode)
            val offset = client.getData.forPath(s"${zkTopicPath}/${p}")
            (TopicAndPartition(topic, p.toInt), new String(offset).toLong)
        }
        //the flag is true when no offsets have been stored yet, i.e. this is the first run
        (offsets.toMap, offsets.isEmpty)
    }

    def ensureZKExists(zkTopicPath:String): Unit = {
        if(client.checkExists().forPath(zkTopicPath) == null) {//the znode does not exist yet, nothing has been written
            client.create().creatingParentsIfNeeded().forPath(zkTopicPath)
        }
    }

    val client = {//initialised in a block; Curator is to the raw ZooKeeper API what SpringMVC/Struts2 is to servlets
        val client = CuratorFrameworkFactory.builder()
                    .namespace("mykafka")//the namespace acts as a root directory prefix for every path
                    .connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
                    .retryPolicy(new ExponentialBackoffRetry(1000, 3))
                    .build()
        client.start()
        client
    }
}
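
For a quick local smoke test, the driver above can be invoked directly with the four expected arguments. The values below are placeholders chosen for illustration; also note that although zkQuorum is accepted as an argument, the Curator connect string is hard-coded in the client block, so changing the quorum requires editing that builder as well.

// hypothetical local invocation: 2-second batches, placeholder quorum/group/topic
_07SparkKafkaDriverHAZooKeeperOps.main(Array(
    "2",
    "bigdata01:2181,bigdata02:2181,bigdata03:2181",
    "test-group",
    "test-topic"
))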

Summary

When using Spark Streaming with Kafka, pay special attention to handling the message offsets yourself, so that consumption resumes from the right position after a restart.
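
For comparison, newer Spark releases ship the spark-streaming-kafka-0-10 integration, in which the stream itself can commit offsets back to Kafka after each batch, so an external ZooKeeper store like the one above is no longer strictly required. The following is only a minimal sketch of that pattern; the broker list, group id and topic are placeholder values.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}

object Kafka010OffsetSketch {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[2]").setAppName("Kafka010OffsetSketch")
        val ssc = new StreamingContext(conf, Seconds(2))

        //placeholder connection settings; auto commit is disabled so offsets are committed manually
        val kafkaParams = Map[String, Object](
            "bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
            "key.deserializer" -> classOf[StringDeserializer],
            "value.deserializer" -> classOf[StringDeserializer],
            "group.id" -> "test-group",
            "auto.offset.reset" -> "earliest",
            "enable.auto.commit" -> (false: java.lang.Boolean)
        )

        val stream = KafkaUtils.createDirectStream[String, String](
            ssc,
            LocationStrategies.PreferConsistent,
            ConsumerStrategies.Subscribe[String, String](Seq("test-topic"), kafkaParams)
        )

        stream.foreachRDD { rdd =>
            //capture the offset ranges before any transformation changes the RDD type
            val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            if(!rdd.isEmpty()) {
                println("RDD count: " + rdd.count())
            }
            //commit back to Kafka only after the batch has been processed (at-least-once)
            stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
        }

        ssc.start()
        ssc.awaitTermination()
    }
}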

