package com.zhengkw.stu.day01.kafkademo
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * @ClassName: SparkCKP
 * @author: zhengkw
 * @description: Checkpoint recovery! Guards against output loss in both direct and receiver-based (non-direct) modes!
 * @date: 20/05/17 10:05 PM
 * @version: 1.0
 * @since: jdk 1.8 scala 2.11.8
 */
object SparkCKP {
  def createFun(): StreamingContext = {
    val conf = new SparkConf()
      .setMaster("local[*]").setAppName("SparkCKP")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(3))
    // Must be declared!
    ssc.checkpoint("./ck1") // Checkpoint the StreamingContext.
    // Receiver-based (non-direct) mode avoids data loss via a WAL, or via checkpointing.
    // The code below uses checkpointing to guarantee the output is not lost!
    val rids: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
      ssc,
      "hadoop102:2181,hadoop103:2181,hadoop104:2181/mykafka",
      "zhengkw",
      Map("sparktest" -> 2)
    )
    rids
      .map(_._2)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()
    ssc
    /*
    // Direct mode, using checkpointing
    val param = Map[String, String](
      "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
      "group.id" -> "zhengkw"
    )
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc,
      param,
      Set("sparktest"))
    stream
      .map(_._2)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()
    ssc
    */
  }
  def main(args: Array[String]): Unit = {
    val ssc = StreamingContext.getActiveOrCreate("./ck1", createFun _)
    ssc.start()
    ssc.awaitTermination()
  }
}
Use a console producer to generate data for testing:
kafka-console-producer.sh --broker-list hadoop102:9092 --topic sparktest
Differences between direct and receiver-based (non-direct) modes
Receiver-based (non-direct) mode
When creating the stream you must specify the number of consumer threads, i.e. how many consumers are started to consume from Kafka.
In this architecture, data is consumed by multiple threads into a Receiver, and the Receiver then hands it to the executors.
To prevent data loss, a write-ahead log (WAL) is typically used, as in the sketch below.
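A minimal sketch of turning on the WAL for a receiver-based stream, assuming the same ZooKeeper quorum, group id, and topic as the code above (the object name ReceiverWithWAL and the checkpoint path ./ck-wal are made up for illustration). With the WAL enabled, the Spark docs recommend a non-replicated serialized storage level, since the log already persists the received data:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object ReceiverWithWAL {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]").setAppName("ReceiverWithWAL")
      // Persist received data to a write-ahead log under the checkpoint directory.
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")
    val ssc = new StreamingContext(conf, Seconds(3))
    // The WAL lives under the checkpoint directory, so a checkpoint is required.
    ssc.checkpoint("./ck-wal")
    val stream = KafkaUtils.createStream(
      ssc,
      "hadoop102:2181,hadoop103:2181,hadoop104:2181/mykafka",
      "zhengkw",
      Map("sparktest" -> 2),
      // With the WAL on, in-memory replication is redundant; drop the _2 suffix.
      StorageLevel.MEMORY_AND_DISK_SER
    )
    stream.map(_._2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}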
Direct mode
In direct mode, Spark Streaming creates an RDD with one partition per Kafka partition being consumed. This one-to-one mapping is easy to reason about and easy to tune. By contrast, the receiver-based mode has to funnel data through the Receiver, which also caps read throughput. A sketch of inspecting that mapping follows.
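A minimal sketch, reusing the commented-out direct-stream setup from createFun above, that prints the Kafka offset range behind each RDD partition via HasOffsetRanges from the same spark-streaming-kafka 0.8 API (the object name DirectOffsets is made up for illustration):

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DirectOffsets {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("DirectOffsets")
    val ssc = new StreamingContext(conf, Seconds(3))
    val param = Map[String, String](
      "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
      "group.id" -> "zhengkw"
    )
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, param, Set("sparktest"))
    stream.foreachRDD { rdd =>
      // Each RDD partition maps one-to-one onto a Kafka topic partition.
      rdd.asInstanceOf[HasOffsetRanges].offsetRanges.foreach { r =>
        println(s"${r.topic} partition ${r.partition}: offsets ${r.fromOffset} -> ${r.untilOffset}")
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}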
Note
In direct mode, to increase parallelism you only need to add partitions to the Kafka topic, because Spark Streaming creates an RDD with the same number of partitions to consume them. The sketch below checks that count.
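For example, partitions could be added with the topic admin script and the new count then verified from the stream (the partition count 4 is arbitrary, and the snippet assumes the stream value from the direct-mode sketch above):

kafka-topics.sh --zookeeper hadoop102:2181/mykafka --alter --topic sparktest --partitions 4

// Assumes `stream` is the direct stream from the sketch above.
stream.foreachRDD { rdd =>
  // With the direct API this matches the partition count of topic "sparktest".
  println(s"batch has ${rdd.getNumPartitions} RDD partitions")
}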