Spark Real-Time Project Day 2: Consuming Kafka, Deduplicating with Redis, and Saving Data to ES via an Index Template

Create a Maven project and import the POM

<properties>
        <spark.version>2.4.0</spark.version>
        <scala.version>2.11.8</scala.version>
        <kafka.version>1.0.0</kafka.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>
    
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>2.4.6</version>
        </dependency>

        <dependency>
            <groupId>io.searchbox</groupId>
            <artifactId>jest</artifactId>
            <version>5.3.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>net.java.dev.jna</groupId>
            <artifactId>jna</artifactId>
            <version>4.5.2</version>
        </dependency>

        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>commons-compiler</artifactId>
            <version>2.7.8</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.56</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.phoenix</groupId>
            <artifactId>phoenix-spark</artifactId>
            <version>4.14.2-HBase-1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- This plugin compiles Scala code into class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.4.6</version>
                <executions>
                    <execution>
                        <!-- Bind to Maven's compile phase -->
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

Add log4j.properties

Create a new file named log4j.properties under the resources folder and add the following:

log4j.rootLogger=error,stdout,logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=E://logs//mr.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n

Add config.properties

Create a new file named config.properties under the resources folder and add the following:

# Kafka configuration
kafka.broker.list=hadoop102:9092,hadoop103:9092,hadoop104:9092
# Redis configuration
redis.host=hadoop102
redis.port=6379

Create PropertiesUtil

Under the scala source root, create com/atguigu/gmall/realtime/utils/PropertiesUtil:

import java.io.InputStreamReader
import java.util.Properties

object PropertiesUtil {

  def main(args: Array[String]): Unit = {
    val properties: Properties = PropertiesUtil.load("config.properties")

    println(properties.getProperty("kafka.broker.list"))
  }

  def load(propertiesName: String): Properties = {
    val prop = new Properties()
    prop.load(new InputStreamReader(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName), "UTF-8"))
    prop
  }
}

Create MyKafkaUtil

Under the scala source root, create com/atguigu/gmall/realtime/utils/MyKafkaUtil:

import java.util.Properties

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {
  private val properties: Properties = PropertiesUtil.load("config.properties")
  val broker_list = properties.getProperty("kafka.broker.list")

  // Kafka consumer configuration
  var kafkaParam = collection.mutable.Map(
    "bootstrap.servers" -> broker_list, // addresses used to bootstrap the connection to the cluster
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    // identifies which consumer group this consumer belongs to
    "group.id" -> "gmall_consumer_group",
    // used when there is no initial offset, or the current offset no longer exists on the server;
    // "latest" resets the offset to the newest available record
    "auto.offset.reset" -> "latest",
    // true: offsets are committed automatically in the background, which can lose data if Kafka goes down
    // false: offsets must be maintained manually
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )

  // Create a DStream that returns the received input data
  // LocationStrategies: creates consumers for the given topics and cluster addresses
  // LocationStrategies.PreferConsistent: distributes partitions evenly across all Executors
  // ConsumerStrategies: chooses how Kafka consumers are created and configured on the Driver and Executors
  // ConsumerStrategies.Subscribe: subscribes to a collection of topics


  def getKafkaStream(topic: String,ssc:StreamingContext ): InputDStream[ConsumerRecord[String,String]]={
    val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam ))
    dStream
  }


  def getKafkaStream(topic: String,ssc:StreamingContext,groupId:String): InputDStream[ConsumerRecord[String,String]]={
    kafkaParam("group.id")=groupId
    val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam ))
    dStream
  }



  def getKafkaStream(topic: String,ssc:StreamingContext,offsets:Map[TopicPartition,Long],groupId:String): InputDStream[ConsumerRecord[String,String]]={
    kafkaParam("group.id")=groupId
    val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam,offsets))
    dStream
  }
}

Create RedisUtil

Under the scala source root, create com/atguigu/gmall/realtime/utils/RedisUtil:

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisUtil {

  var jedisPool:JedisPool=null

  def getJedisClient: Jedis = {
    if (jedisPool == null) {
      // println("creating a connection pool")
      val config = PropertiesUtil.load("config.properties")
      val host = config.getProperty("redis.host")
      val port = config.getProperty("redis.port")

      val jedisPoolConfig = new JedisPoolConfig()
      jedisPoolConfig.setMaxTotal(100)             // maximum number of connections
      jedisPoolConfig.setMaxIdle(20)               // maximum idle connections
      jedisPoolConfig.setMinIdle(20)               // minimum idle connections
      jedisPoolConfig.setBlockWhenExhausted(true)  // whether to wait when the pool is exhausted
      jedisPoolConfig.setMaxWaitMillis(500)        // wait time in milliseconds when exhausted
      jedisPoolConfig.setTestOnBorrow(true)        // test each connection when it is borrowed

      jedisPool = new JedisPool(jedisPoolConfig, host, port.toInt)
    }
    // println(s"jedisPool.getNumActive = ${jedisPool.getNumActive}")
    // println("borrowed a connection")
    jedisPool.getResource
  }
}
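
The return value of SADD is what the deduplication in DauApp relies on: it returns 1 when the member is newly added and 0 when it already exists in the set. A minimal sketch, assuming RedisUtil is on the classpath as above (the key and mid values are made up for illustration):

import redis.clients.jedis.Jedis

object RedisUtilDemo {
  def main(args: Array[String]): Unit = {
    val jedis: Jedis = RedisUtil.getJedisClient
    println(jedis.sadd("dau:2020-05-12", "mid_001")) // 1 -> first occurrence of this mid today
    println(jedis.sadd("dau:2020-05-12", "mid_001")) // 0 -> duplicate, would be filtered out
    jedis.close() // returns the connection to the pool
  }
}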

Create MyEsUtil

Under the scala source root, create com/atguigu/gmall/realtime/utils/MyEsUtil:

import java.util

import io.searchbox.client.config.HttpClientConfig
import io.searchbox.client.{JestClient, JestClientFactory}
import io.searchbox.core.{Bulk, BulkResult, Index, Search, SearchResult}
import org.elasticsearch.index.query.MatchQueryBuilder
import org.elasticsearch.search.builder.SearchSourceBuilder
import org.elasticsearch.search.sort.SortOrder

import scala.collection.mutable.ListBuffer

object MyEsUtil {


  private var factory: JestClientFactory = null

  def getClient: JestClient = {
    if (factory == null) build()
    factory.getObject
  }

  def build(): Unit = {
    factory = new JestClientFactory
    factory.setHttpClientConfig(new HttpClientConfig.Builder("http://hadoop102:9200")
      .multiThreaded(true)
      .maxTotalConnection(20)
      .connTimeout(10000).readTimeout(10000).build())
  }


  // bulk-write a list of (id, document) pairs into the given index
  def saveBulk(dataList: List[(String, AnyRef)], indexName: String): Unit = {
    if (dataList != null && dataList.size > 0) {
      val jest: JestClient = getClient
      val bulkBuilder = new Bulk.Builder()
      bulkBuilder.defaultIndex(indexName).defaultType("_doc")
      for ((id, data) <- dataList) {
        val index: Index = new Index.Builder(data).id(id).build()
        bulkBuilder.addAction(index)
      }
      val bulk: Bulk = bulkBuilder.build()
      val items: util.List[BulkResult#BulkResultItem] = jest.execute(bulk).getItems
      println("已保存:" + items.size() + "條數據!")
      jest.close()
    }
  }


  def main(args: Array[String]): Unit = {
    val jest: JestClient = getClient
    // any ==>  case class
    //val index =  new Index.Builder(Movie(4,"紅海戰役",9.0)).index("movie_chn1122").`type`("movie").id("4").build()
    val query = "{\n  \"query\": {\n    \"match\": {\n      \"name\": \"紅海戰役\"\n    }\n  }\n}"
    val sourceBuilder = new SearchSourceBuilder
    sourceBuilder.query(new MatchQueryBuilder("name", "紅海戰役"))
    sourceBuilder.sort("doubanScore", SortOrder.ASC)
    val query2: String = sourceBuilder.toString
    println(query2)
    val search = new Search.Builder(query2).addIndex("movie_chn1122").addType("movie").build()
    val result: SearchResult = jest.execute(search)
    val movieRsList: util.List[SearchResult#Hit[Movie, Void]] = result.getHits(classOf[Movie])
    import scala.collection.JavaConversions._
    val movieList = ListBuffer[Movie]()
    for (hit <- movieRsList) {
      val movie: Movie = hit.source
      movieList += movie
    }
    println(movieList.mkString("\n"))
    jest.close()
  }


  case class Movie(id: Long, name: String, doubanScore: Double)


}
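
Because saveBulk writes every document with an explicit id (the first element of each tuple), replaying the same batch overwrites documents rather than duplicating them. A one-line usage sketch (the index name and data are invented for illustration):

MyEsUtil.saveBulk(List(("1", MyEsUtil.Movie(1L, "紅海戰役", 9.0))), "movie_test1122")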

Create DauInfo

Under the scala source root, create com/atguigu/gmall/realtime/bean/DauInfo:

case class DauInfo(
                    mid: String,
                    uid: String,
                    ar: String,
                    ch: String,
                    vc: String,
                    var dt: String,
                    var hr: String,
                    var mi: String,
                    ts: Long)
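
These fields mirror the start log that DauApp reads from Kafka. For reference, a rough sketch of that log's shape, showing only the fields this project uses (the values are invented, and the real log carries additional fields that are omitted here):

{
  "common": {
    "mid": "mid_001",
    "uid": "12",
    "ar": "110000",
    "ch": "xiaomi",
    "vc": "v2.1.111"
  },
  "ts": 1589275201000
}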

Create DauApp

Under the scala source root, create com/atguigu/gmall/realtime/app/DauApp:

import com.atguigu.gmall.realtime.utils.{MyEsUtil, MyKafkaUtil, OffsetManagerUtil, RedisUtil}
import java.lang
import java.text.SimpleDateFormat
import java.util.Date

import com.alibaba.fastjson.{JSON, JSONObject}
import com.atguigu.gmall.realtime.bean.DauInfo
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

object DauApp {

  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("dau_app").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf,Seconds(5))
    // topic and consumer groupId to use
    val topic="GMALL_START"
    val groupId="GMALL_DAU_CONSUMER"

    // Fetch the saved offsets with OffsetManagerUtil.getOffset(), laying the groundwork for committing offsets manually
    val startOffset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId, topic)
    // consume data from Kafka
    val startInputDstream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(topic,ssc,startOffset,groupId)
    //startInputDstream.map(_.value).print(1000)

    // parse each record into a standard JSON object
    val startJsonObjDstream: DStream[JSONObject] = startInputDstream.map { record =>
      val jsonString: String = record.value()
      val jSONObject: JSONObject = JSON.parseObject(jsonString)
      jSONObject
    }
    // Write to the dedup list: daily active users, one set per day, one key per day
    // (not optimal -- opens a new connection for every record)
    //    startJsonObjDstream.map{jsonObj=>
    //
    //      // write to Redis    type? set    key? dau:2020-05-12    value? mid
    //      val dateStr: String = new SimpleDateFormat("yyyyMMdd").format(new Date(jsonObj.getLong("ts")))
    //
    //      val dauKey="dau:"+dateStr
    //      val jedis = new Jedis("hadoop102",6379)
    //      val mid: String = jsonObj.getJSONObject("common").getString("mid")
    //      jedis.sadd(dauKey,mid)
    //      jedis.close()
    //    }

    // deduplicate with Redis
    val startJsonObjWithDauDstream: DStream[JSONObject] = startJsonObjDstream.mapPartitions { jsonObjItr =>
      // borrow a connection from the pool (one per partition, not one per record)
      val jedis = RedisUtil.getJedisClient
      // materialize the iterator into a List of JSONObjects
      val jsonObjList: List[JSONObject] = jsonObjItr.toList
      println("before filtering: " + jsonObjList.size)

      // buffer for the records that survive deduplication
      val jsonObjFilteredList = new ListBuffer[JSONObject]()
      for (jsonObj <- jsonObjList) {
        // format the ts timestamp from the log into a date string
        val dateStr: String = new SimpleDateFormat("yyyyMMdd").format(new Date(jsonObj.getLong("ts")))
        // one key per day, not just the bare date -> dau:2020-05-12
        val dauKey = "dau:" + dateStr
        // extract the mid from the log
        val mid: String = jsonObj.getJSONObject("common").getString("mid")
        // SADD returns 0 or 1
        val isFirstFlag: lang.Long = jedis.sadd(dauKey, mid)
        // 1 means the mid was newly inserted, 0 means it was already present -- this is the dedup check
        if (isFirstFlag == 1L) {
          jsonObjFilteredList += jsonObj
        }
      }
      jedis.close()
      println("after filtering: " + jsonObjFilteredList.size)
      // return the deduplicated records as an iterator
      jsonObjFilteredList.toIterator
    }
    // startJsonObjWithDauDstream.print(1000)


    // reshape each JSON object into a DauInfo bean
    val dauInfoDstream: DStream[DauInfo] = startJsonObjWithDauDstream.map { jsonObj =>
      val commonJsonObj: JSONObject = jsonObj.getJSONObject("common")
      val dateTimeStr: String = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(jsonObj.getLong("ts")))
      // split the formatted date on " "
      val dateTimeArr: Array[String] = dateTimeStr.split(" ")
      // the yyyy-MM-dd part
      val dt: String = dateTimeArr(0)
      // split HH:mm on ":"
      val timeArr: Array[String] = dateTimeArr(1).split(":")
      // hour and minute
      val hr = timeArr(0)
      val mi = timeArr(1)

      // package everything into the bean
      DauInfo(commonJsonObj.getString("mid"),
        commonJsonObj.getString("uid"),
        commonJsonObj.getString("ar"),
        commonJsonObj.getString("ch"),
        commonJsonObj.getString("vc"),
        dt, hr, mi, jsonObj.getLong("ts")
      )

    }

    // write into the gmall1122_dau_info_2020xxxxxx index
    dauInfoDstream.foreachRDD { rdd =>
      // iterate within each partition
      rdd.foreachPartition { dauInfoItr =>
        // wrap the data into a List of (mid, DauInfo) pairs
        val dataList: List[(String, DauInfo)] = dauInfoItr.toList.map { dauInfo => (dauInfo.mid, dauInfo) }
        // today's date for the index name
        val dt = new SimpleDateFormat("yyyyMMdd").format(new Date())
        val indexName = "gmall1122_dau_info_" + dt
        MyEsUtil.saveBulk(dataList, indexName)
      }
    }
    ssc.start()
    ssc.awaitTermination()

  }
}
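
DauApp imports OffsetManagerUtil, which is defined elsewhere in the project and not listed here. A minimal sketch of what its getOffset method could look like, assuming offsets are saved in a Redis hash whose key combines topic and groupId (the key layout, field names, and use of RedisUtil below are assumptions for illustration, not the project's actual implementation):

import java.util

import org.apache.kafka.common.TopicPartition

object OffsetManagerUtil {
  // read saved offsets from a Redis hash:
  // key = "offset:<topic>:<groupId>" (assumed layout), field = partition, value = offset
  def getOffset(groupId: String, topic: String): Map[TopicPartition, Long] = {
    val jedis = RedisUtil.getJedisClient
    val offsetKey = "offset:" + topic + ":" + groupId
    val offsetMap: util.Map[String, String] = jedis.hgetAll(offsetKey)
    jedis.close()
    if (offsetMap == null || offsetMap.isEmpty) {
      // no saved offsets yet: an empty map falls back to auto.offset.reset
      Map[TopicPartition, Long]()
    } else {
      import scala.collection.JavaConversions._
      offsetMap.map { case (partition, offset) =>
        (new TopicPartition(topic, partition.toInt), offset.toLong)
      }.toMap
    }
  }
}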

Create the index template

The index_patterns below must match the gmall1122_dau_info_ indices that DauApp writes, so that each day's new index picks up these mappings and aliases automatically:

PUT _template/gmall1122_dau_info_template
{
  "index_patterns": ["gmall1122_dau_info*"],
  "settings": {
    "number_of_shards": 3
  },
  "aliases" : {
    "{index}-query": {},
    "gmall1122_dau_info-query": {}
  },
   "mappings": {
     "_doc":{  
       "properties":{
         "mid":{
           "type":"keyword"
         },
         "uid":{
           "type":"keyword"
         },
         "ar":{
           "type":"keyword"
         },
         "ch":{
           "type":"keyword"
         },
         "vc":{
           "type":"keyword"
         },
          "dt":{
           "type":"keyword"
         },
          "hr":{
           "type":"keyword"
         },
          "mi":{
           "type":"keyword"
         },
         "ts":{
           "type":"date"
         }    
       }
     }
   }
}
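
With the {index}-query alias in the template, every daily index also answers to a stable name, so queries do not need to know the date suffix. For example, a query through the alias might look like this (the dt value is made up for illustration):

GET gmall1122_dau_info-query/_search
{
  "query": {
    "term": {
      "dt": "2020-05-12"
    }
  }
}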

Start the required processes

This includes starting Redis.

Fixing a Redis startup error

If you see the following error:

Exception in thread "main" redis.clients.jedis.exceptions.JedisConnectionException: Could not get a resource from the pool

Solution (and when running the command, remember to include the config file):

sudo bin/redis-server ./redis.conf 
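
If the error persists even when Redis is started with the config file, the usual culprit is that Redis only accepts local connections. The following redis.conf settings are a common fix (this is an assumption about the cause, not something confirmed by the error output above):

bind 0.0.0.0
protected-mode no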

Start the program, then run the log-generator jar.
