Kafka Streams之WordCount

一、實現流程

1、注意

Kafka中的數據都以<key, value>的形式存在。

2、wordCount流程

(1)Stream 從topic中取出每一條數據記錄 (<key, value>格式): <null, "Spark and spark">

(2)MapValues 將value中所有文本轉換成小寫形式:<null, "spark and spark">

(3)FlatMapValues 按空格分解成單詞 :<null, “spark”>,<null, “and”>, <null, “spark”>

(4)SelectKey 將value的值賦給key :<"spark", “spark”>,<“and”, “and”>, <“spark”, “spark”>

(5)GroupByKey 按相同的Key分組 :(<"spark", “spark”>, <"spark", “spark”>),(<"and", “and”>)

(6)Count 計算每個組中元素個數 :<"spark", 2>,<"and", 1>

(7)To 將結果返回Kafka

二、代碼實現

1、pom依賴

       <!-- Kafka core artifact (Scala 2.11 build, judging by the artifactId). -->
       <!-- NOTE(review): pinned to 0.11.0.0 while the two artifacts below use 1.0.2; confirm the version mismatch is intended. -->
       <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.11.0.0</version>
        </dependency>

        <!-- Kafka Streams library: supplies the org.apache.kafka.streams.* classes used by KafkaStreamsMain. -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
            <version>1.0.2</version>
        </dependency>
        <!-- Kafka client library: supplies the org.apache.kafka.clients.* producer/consumer classes used below. -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>1.0.2</version>
        </dependency>

2、kafkaStreams主程序

package com.cn.kafkaStreams;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Produced;

import java.util.Arrays;
import java.util.Properties;

/**
 * Kafka Streams word-count application.
 *
 * <p>Reads text lines from the {@code test_wordCount} topic, lower-cases each line, splits it
 * on single spaces, counts how often every word occurs and writes the running counts to the
 * {@code test_out} topic as {@code <String, Long>} records.
 */
public class KafkaStreamsMain {
    /**
     * Builds the word-count topology, registers a JVM shutdown hook that closes the
     * streams instance, and starts processing.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Application configuration.
        Properties config = new Properties();
        config.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount");
        config.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.230.21:6667,192.168.230.22:6667,192.168.230.23:6667");
        config.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        config.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        config.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();
        // Source stream of <key, line> records.
        KStream<String, String> textLines = builder.stream("test_wordCount");

        // The aggregated result is kept as a KTable of <word, count>.
        KTable<String, Long> wordCounts = textLines
                // Lower-case every line so that "Spark" and "spark" count as the same word.
                .mapValues(values -> values.toLowerCase())
                // Split each line on single spaces. flatMapValues expects the mapper to return
                // an Iterable, so the String[] produced by split is wrapped with Arrays.asList.
                .flatMapValues(values -> Arrays.asList(values.split(" ")))
                // Use the word itself as the new record key.
                .selectKey((key, word) -> word)
                // Group by key before aggregating.
                .groupByKey()
                // Count the records in each group; the result is materialized in the "Counts" store.
                .count(Materialized.as("Counts"));

        // Write the counts back to Kafka with a String key and a Long value.
        wordCounts.toStream().to("test_out", Produced.with(Serdes.String(), Serdes.Long()));

        Topology topology = builder.build();
        KafkaStreams kafkaStreams = new KafkaStreams(topology, config);

        // Close the streams instance when the JVM is asked to exit (e.g. Ctrl-C); without this
        // the instance is never closed.
        Runtime.getRuntime().addShutdownHook(new Thread(kafkaStreams::close, "wordcount-shutdown-hook"));

        kafkaStreams.start();
    }
}

3、向kafka造數據

package com.cn.kafkaStreams;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

/**
 * Test-data generator: sends a fixed set of sentences, in rotation, to the
 * {@code test_wordCount} topic, one every two seconds.
 */
public class kafkaProducer {
    /** Sentences that are sent in round-robin order. */
    static String[] arr = {"Spark is spark", "hbase can save bigdata", "hive can select data"};

    /** Index of the most recently returned sentence; -1 before the first call. */
    static int p = -1;

    /** Pause between two sends, in milliseconds. */
    private static final long SEND_INTERVAL_MS = 2000L;

    /**
     * Returns the next sentence, wrapping around to the first one after the last.
     *
     * @return the next sentence of {@link #arr}
     */
    public static String getWord() {
        p = (p + 1) % arr.length;
        return arr[p];
    }

    /**
     * Sends sentences to Kafka until the thread is interrupted.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String topic = "test_wordCount";
        String brokers = "192.168.230.21:6667,192.168.230.22:6667,192.168.230.23:6667";

        // Producer configuration.
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", brokers);
        props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        boolean interrupted = false;
        // try-with-resources closes the producer when the loop ends or an exception escapes.
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            while (!interrupted) {
                String event = getWord();
                System.out.println(event);
                // Send the record (no key).
                producer.send(new ProducerRecord<>(topic, event));
                try {
                    Thread.sleep(SEND_INTERVAL_MS);
                } catch (InterruptedException e) {
                    // Stop producing; the flag is restored only after the producer is closed
                    // so that close() itself is not cut short by the pending interrupt.
                    interrupted = true;
                }
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}

4、消費回寫kafka的結果

package com.cn.kafkaStreams;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;

/**
 * Console consumer for the word-count results: subscribes to the {@code test_out} topic and
 * prints the offset, key and value of every record it receives.
 */
public class kafkaConsumerMain {
    /** Timeout passed to each {@code poll} call, in milliseconds. */
    private static final long POLL_TIMEOUT_MS = 5L;

    /**
     * Polls the result topic forever and prints each record.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Kafka consumer configuration settings
        String topicName = "test_out";
        Properties props = new Properties();

        props.put("bootstrap.servers", "192.168.230.21:6667,192.168.230.22:6667,192.168.230.23:6667");
        props.put("group.id", "test");
        props.put("enable.auto.commit", "true");
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.LongDeserializer");

        // The value type is Long to match the configured LongDeserializer.
        // try-with-resources closes the consumer if the loop is left by an exception.
        try (KafkaConsumer<String, Long> kafkaConsumer = new KafkaConsumer<>(props)) {
            // Kafka Consumer subscribes list of topics here.
            kafkaConsumer.subscribe(Arrays.asList(topicName));

            while (true) {
                ConsumerRecords<String, Long> records = kafkaConsumer.poll(POLL_TIMEOUT_MS);
                for (ConsumerRecord<String, Long> record : records) {
                    // print the offset,key and value for the consumer records.
                    System.out.printf("offset = %d, key = %s, value = %s\n", record.offset(), record.key(), record.value());
                }
            }
        }
    }
}

三、控制檯輸出

1、kafkaProducer

...
Spark is spark
hbase can save bigdata
hive can select data
Spark is spark
hbase can save bigdata
hive can select data

...

2、kafkaConsumerMain

...
offset = 32, key = spark, value = 45
offset = 33, key = hbase, value = 40
offset = 34, key = save, value = 82
offset = 35, key = bigdata, value = 40
offset = 36, key = hive, value = 37
offset = 37, key = can, value = 163
offset = 38, key = select, value = 65
offset = 39, key = data, value = 123
offset = 40, key = is, value = 48
offset = 41, key = spark, value = 55
offset = 42, key = hbase, value = 45
offset = 43, key = save, value = 87
offset = 44, key = bigdata, value = 45
offset = 45, key = hive, value = 42
offset = 46, key = can, value = 173
offset = 47, key = select, value = 70
offset = 48, key = data, value = 128
...

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章