1. Consumer
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;

public class Consumer {
    public static void main(String[] args) {
        HashMap<String, Object> config = new HashMap<>();
        config.put("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092");
        config.put("key.deserializer", StringDeserializer.class.getName());
        config.put("value.deserializer", StringDeserializer.class.getName());
        config.put("group.id", "g000001");
        // Where to start consuming when there is no committed offset: [latest, earliest, none]
        config.put("auto.offset.reset", "earliest");
        // Whether to commit offsets automatically
        // config.put("enable.auto.commit", "false");
        config.put("enable.auto.commit", "true");
        // Auto-commit interval in milliseconds
        config.put("auto.commit.interval.ms", "500");

        // Create a consumer client instance
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(config);
        consumer.subscribe(Arrays.asList("test"));

        while (true) {
            // Poll data; this fetches from all partitions assigned to this consumer
            ConsumerRecords<String, String> records = consumer.poll(2000);
            Iterator<ConsumerRecord<String, String>> iterator = records.iterator();
            while (iterator.hasNext()) {
                ConsumerRecord<String, String> record = iterator.next();
                System.out.println("record = " + record);
            }
        }
        // Release the connection (unreachable here because of the infinite loop)
        // consumer.close();
    }
}
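The consumer above leaves enable.auto.commit at true, so offsets are committed on a timer regardless of whether the records were actually processed. With the commented-out enable.auto.commit=false, offsets have to be committed explicitly, for example with commitSync() after each batch. A minimal sketch of that variant, written in Scala and assuming the same brokers, topic, and group id as above:

import java.util.{Arrays, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer

object ManualCommitConsumer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092")
    props.put("key.deserializer", classOf[StringDeserializer].getName)
    props.put("value.deserializer", classOf[StringDeserializer].getName)
    props.put("group.id", "g000001")
    props.put("auto.offset.reset", "earliest")
    // disable auto-commit so the application decides when offsets are stored
    props.put("enable.auto.commit", "false")

    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Arrays.asList("test"))

    while (true) {
      val records = consumer.poll(2000L)
      val it = records.iterator()
      while (it.hasNext) {
        println("record = " + it.next())
      }
      // commit the offsets of this batch only after it has been processed
      consumer.commitSync()
    }
  }
}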
2. Producer
package day12;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

import java.util.Properties;

public class Producer {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092");
        props.setProperty("key.serializer", StringSerializer.class.getName());
        props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // Acknowledgement level when sending data; the default is 1
        // props.setProperty("acks", "1");
        // Custom partitioner; the default is
        // org.apache.kafka.clients.producer.internals.DefaultPartitioner
        // props.setProperty("partitioner.class", "org.apache.kafka.clients.producer.internals.DefaultPartitioner");

        // Create a producer instance
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);
        int count = 0;
        while (count < 10000000) {
            // int partitionNum = count % 3;
            // Send to topic "test", partition 0, with an empty key
            ProducerRecord<String, String> record = new ProducerRecord<>("test", 0, "", "NO:" + count);
            kafkaProducer.send(record);
            count++;
            Thread.sleep(1000);
        }
        kafkaProducer.close();
    }
}
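The producer above pins every record to partition 0 and only mentions partitioner.class in a comment. To spread records across partitions, a custom Partitioner can be supplied instead. A minimal sketch in Scala; the class name SimpleRoundRobinPartitioner is made up for illustration. It would be registered by setting partitioner.class to the class's fully qualified name, with the explicit partition argument (0) dropped from the ProducerRecord:

import java.util.concurrent.atomic.AtomicInteger
import org.apache.kafka.clients.producer.Partitioner
import org.apache.kafka.common.Cluster

// Hypothetical example class, not part of the original notes
class SimpleRoundRobinPartitioner extends Partitioner {
  private val counter = new AtomicInteger(0)

  override def partition(topic: String, key: AnyRef, keyBytes: Array[Byte],
                         value: AnyRef, valueBytes: Array[Byte], cluster: Cluster): Int = {
    val numPartitions = cluster.partitionsForTopic(topic).size()
    // rotate through the partitions regardless of the key
    (counter.getAndIncrement() & Int.MaxValue) % numPartitions
  }

  override def close(): Unit = {}

  override def configure(configs: java.util.Map[String, _]): Unit = {}
}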
3. Integrating Kafka with Spark Streaming
package day12

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming integration with Kafka
 */
object SparkStreaming_kafka {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    val conf = new SparkConf().setMaster("local[2]").setAppName(s"${this.getClass.getSimpleName}")
    val ssc = new StreamingContext(conf, Seconds(5))

    /**
     * Kafka parameters
     */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "day12_005",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // topics to subscribe to
    val topics = Array("test")

    /**
     * Create the direct Kafka input stream
     */
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
    )

    /*
    val maped: DStream[(String, String)] = stream.map(record => (record.key, record.value))
    maped.foreachRDD(rdd => {
      // processing logic
      rdd.foreach(println)
    })
    */

    stream.foreachRDD(rdd => {
      val offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val maped: RDD[(String, String)] = rdd.map(record => (record.key, record.value))
      // processing logic
      maped.foreach(println)
      // print the offset ranges of this batch
      for (o <- offsetRange) {
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    })

    // start the application
    ssc.start()
    // wait for termination
    ssc.awaitTermination()
  }
}
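The job above only prints the offset ranges; nothing is persisted, so a restart falls back to auto.offset.reset. One common variant is to let Kafka itself store the offsets by committing each batch's ranges back through CanCommitOffsets. A minimal sketch under the same parameters (enable.auto.commit stays false, and the commit happens only after the batch has been processed):

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._

object SparkStreaming_kafka_commit {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SparkStreaming_kafka_commit")
    val ssc = new StreamingContext(conf, Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "day12_005",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val stream = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array("test"), kafkaParams)
    )

    stream.foreachRDD(rdd => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // processing logic
      rdd.map(record => (record.key, record.value)).foreach(println)
      // after processing, commit the offsets back to Kafka (__consumer_offsets)
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}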
4. Managing Kafka offsets with ZooKeeper
package day12

import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Managing Kafka offsets in ZooKeeper
 * Created by zhangjingcun on 2018/10/11 8:49.
 */
object SSCDirectKafka010_ZK_Offset {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
    val conf = new SparkConf().setMaster("local[*]").setAppName(s"${this.getClass.getSimpleName}")
    // batch interval of 2 seconds
    val ssc = new StreamingContext(conf, Seconds(2))

    val groupId = "day13_001"

    /**
     * Kafka parameters
     */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topic = "testTopic"
    val topics = Array(topic)

    /**
     * If we maintain the offsets ourselves, two questions arise:
     * 1. On the very first start, where should consumption begin? -> earliest
     * 2. On a restart, where should consumption resume? -> from the offsets stored last time,
     *    e.g. if the stored offset is 88, continue from there
     *
     * ZKGroupTopicDirs simply builds the ZooKeeper path strings for a group/topic pair
     */
    val zKGroupTopicDirs: ZKGroupTopicDirs = new ZKGroupTopicDirs(groupId, topic)

    /**
     * Resulting directory structure:
     * /consumers/day13_001/offsets/testTopic
     */
    val offsetDir = zKGroupTopicDirs.consumerOffsetDir

    // ZooKeeper connection string
    val zkGroups = "hadoop01:2181,hadoop02:2181,hadoop03:2181"

    /**
     * Create a ZkClient connection and check whether /consumers/day13_001/offsets/testTopic has
     * child nodes: if it does, offsets were stored before; if not, this is the first run
     */
    val zkClient = new ZkClient(zkGroups)
    val childrenCount = zkClient.countChildren(offsetDir)

    val stream = if (childrenCount > 0) {
      // not the first start
      println("----------already started before------------")
      // holds the offsets read back from ZooKeeper
      var fromOffsets = Map[TopicPartition, Long]()
      // /consumers/day13_001/offsets/testTopic/0
      // /consumers/day13_001/offsets/testTopic/1
      // /consumers/day13_001/offsets/testTopic/2
      (0 until childrenCount).foreach(partitionId => {
        val offset = zkClient.readData[String](offsetDir + s"/${partitionId}")
        fromOffsets += (new TopicPartition(topic, partitionId) -> offset.toLong)
      })
      KafkaUtils.createDirectStream(ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets))
    } else {
      // first start
      println("-------------first start-----------")
      KafkaUtils.createDirectStream(ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    }

    stream.foreachRDD(rdd => {
      val offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val maped: RDD[(String, String)] = rdd.map(record => (record.key, record.value))
      // processing logic
      maped.foreach(println)
      // store and manage the offsets ourselves
      for (o <- offsetRange) {
        // println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
        // write the end offset of this batch to ZooKeeper
        ZkUtils(zkClient, false).updatePersistentPath(offsetDir + "/" + o.partition, o.untilOffset.toString)
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
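To check what the job has actually written, the same ZKGroupTopicDirs path can be read back outside the streaming application. A minimal sketch, assuming the group id, topic, and ZooKeeper quorum used above:

import kafka.utils.ZKGroupTopicDirs
import org.I0Itec.zkclient.ZkClient

object InspectZkOffsets {
  def main(args: Array[String]): Unit = {
    val dirs = new ZKGroupTopicDirs("day13_001", "testTopic")
    val zkClient = new ZkClient("hadoop01:2181,hadoop02:2181,hadoop03:2181")
    // one child node per partition, e.g. /consumers/day13_001/offsets/testTopic/0
    val childrenCount = zkClient.countChildren(dirs.consumerOffsetDir)
    (0 until childrenCount).foreach { partitionId =>
      val offset = zkClient.readData[String](s"${dirs.consumerOffsetDir}/$partitionId")
      println(s"partition $partitionId -> offset $offset")
    }
    zkClient.close()
  }
}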