Create a Maven project and import the POM
<properties>
<spark.version>2.4.0</spark.version>
<scala.version>2.11.8</scala.version>
<kafka.version>1.0.0</kafka.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch -->
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>2.4.6</version>
</dependency>
<dependency>
<groupId>io.searchbox</groupId>
<artifactId>jest</artifactId>
<version>5.3.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
<version>2.7.8</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.56</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-spark</artifactId>
<version>4.14.2-HBase-1.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- This plugin compiles Scala code into class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.4.6</version>
<executions>
<execution>
<!-- Bind to Maven's compile phase -->
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
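With the scala-maven-plugin bound to compile and the assembly plugin bound to the package phase, a single Maven command compiles the Scala sources and produces a jar-with-dependencies under target/ (the exact jar name depends on your artifactId and version):
mvn clean package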
Add log4j.properties
Under the resources folder, create a new file named log4j.properties and add the following content:
log4j.rootLogger=error,stdout,logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=E://logs//mr.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
Add config.properties
Under the resources folder, create a new file named config.properties and add the following content:
# Kafka configuration
kafka.broker.list=hadoop102:9092,hadoop103:9092,hadoop104:9092
# Redis configuration
redis.host=hadoop102
redis.port=6379
Create PropertiesUtil
Under scala, create com/atguigu/gmall/realtime/utils/PropertiesUtil
import java.io.InputStreamReader
import java.util.Properties
object PropertiesUtil {
def main(args: Array[String]): Unit = {
val properties: Properties = PropertiesUtil.load("config.properties")
println(properties.getProperty("kafka.broker.list"))
}
def load(propertiesName: String): Properties = {
val prop = new Properties()
prop.load(new InputStreamReader(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName), "UTF-8"))
prop
}
}
Create MyKafkaUtil
Under scala, create com/atguigu/gmall/realtime/utils/MyKafkaUtil
import java.util.Properties
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object MyKafkaUtil {
private val properties: Properties = PropertiesUtil.load("config.properties")
val broker_list = properties.getProperty("kafka.broker.list")
// Kafka consumer configuration
var kafkaParam = collection.mutable.Map(
"bootstrap.servers" -> broker_list, // addresses used to bootstrap the connection to the cluster
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
// identifies which consumer group this consumer belongs to
"group.id" -> "gmall_consumer_group",
// used when there is no initial offset, or the current offset no longer exists on any server;
// "latest" resets the offset to the most recent offset
"auto.offset.reset" -> "latest",
// if true, the consumer's offset is committed automatically in the background, but data may be lost if Kafka goes down;
// if false, the Kafka offset must be maintained manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// Create a DStream that returns the received input data
// LocationStrategies: create consumers from the given topics and cluster addresses
// LocationStrategies.PreferConsistent: distribute partitions consistently across all Executors
// ConsumerStrategies: choose how the Kafka Consumer is created and configured on the Driver and Executors
// ConsumerStrategies.Subscribe: subscribe to a collection of topics
def getKafkaStream(topic: String,ssc:StreamingContext ): InputDStream[ConsumerRecord[String,String]]={
val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam ))
dStream
}
def getKafkaStream(topic: String,ssc:StreamingContext,groupId:String): InputDStream[ConsumerRecord[String,String]]={
kafkaParam("group.id")=groupId
val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam ))
dStream
}
def getKafkaStream(topic: String,ssc:StreamingContext,offsets:Map[TopicPartition,Long],groupId:String): InputDStream[ConsumerRecord[String,String]]={
kafkaParam("group.id")=groupId
val dStream = KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent,ConsumerStrategies.Subscribe[String,String](Array(topic),kafkaParam,offsets))
dStream
}
}
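A quick usage sketch of the simplest overload, reading the topic with the default consumer group and printing the raw values; the object name is made up for illustration, and it is assumed to sit in the same package as MyKafkaUtil:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object KafkaStreamTest {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("kafka_test").setMaster("local[*]"), Seconds(5))
    // create the direct stream and print each record's value
    val stream = MyKafkaUtil.getKafkaStream("GMALL_START", ssc)
    stream.map(_.value()).print(100)
    ssc.start()
    ssc.awaitTermination()
  }
}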
Create RedisUtil
Under scala, create com/atguigu/gmall/realtime/utils/RedisUtil
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
object RedisUtil {
var jedisPool:JedisPool=null
def getJedisClient: Jedis = {
if(jedisPool==null){
// println("開闢一個連接池")
val config = PropertiesUtil.load("config.properties")
val host = config.getProperty("redis.host")
val port = config.getProperty("redis.port")
val jedisPoolConfig = new JedisPoolConfig()
jedisPoolConfig.setMaxTotal(100) // maximum number of connections
jedisPoolConfig.setMaxIdle(20) // maximum idle connections
jedisPoolConfig.setMinIdle(20) // minimum idle connections
jedisPoolConfig.setBlockWhenExhausted(true) // whether to block and wait when the pool is exhausted
jedisPoolConfig.setMaxWaitMillis(500) // maximum wait time in milliseconds when exhausted
jedisPoolConfig.setTestOnBorrow(true) // test each connection when it is borrowed
jedisPool=new JedisPool(jedisPoolConfig,host,port.toInt)
}
// println(s"jedisPool.getNumActive = ${jedisPool.getNumActive}")
// println("獲得一個連接")
jedisPool.getResource
}
}
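A quick smoke test, assuming Redis is reachable at the host and port in config.properties (the key name is made up for illustration):
object RedisUtilTest {
  def main(args: Array[String]): Unit = {
    val jedis = RedisUtil.getJedisClient
    jedis.set("test:key", "hello") // write a throwaway value
    println(jedis.get("test:key")) // should print "hello"
    jedis.close() // return the connection to the pool
  }
}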
Create MyEsUtil
Under scala, create com/atguigu/gmall/realtime/utils/MyEsUtil
import java.util
import io.searchbox.client.config.HttpClientConfig
import io.searchbox.client.{JestClient, JestClientFactory}
import io.searchbox.core.{Bulk, BulkResult, Index, Search, SearchResult}
import org.elasticsearch.index.query.MatchQueryBuilder
import org.elasticsearch.search.builder.SearchSourceBuilder
import org.elasticsearch.search.sort.SortOrder
import scala.collection.mutable.ListBuffer
object MyEsUtil {
private var factory: JestClientFactory = null
def getClient: JestClient = {
if (factory == null) build();
factory.getObject
}
def build(): Unit = {
factory = new JestClientFactory
factory.setHttpClientConfig(new HttpClientConfig.Builder("http://hadoop102:9200")
.multiThreaded(true)
.maxTotalConnection(20)
.connTimeout(10000).readTimeout(10000).build())
}
// bulk-write a list of (id, document) pairs into the given index
def saveBulk(dataList: List[(String, AnyRef)], indexName: String): Unit = {
if (dataList != null && dataList.size > 0) {
val jest: JestClient = getClient
val bulkBuilder = new Bulk.Builder()
bulkBuilder.defaultIndex(indexName).defaultType("_doc")
for ((id, data) <- dataList) {
val index: Index = new Index.Builder(data).id(id).build()
bulkBuilder.addAction(index)
}
val bulk: Bulk = bulkBuilder.build()
val items: util.List[BulkResult#BulkResultItem] = jest.execute(bulk).getItems
println("已保存:" + items.size() + "條數據!")
jest.close()
}
}
def main(args: Array[String]): Unit = {
val jest: JestClient = getClient
// any ==> case class
//val index = new Index.Builder(Movie(4,"紅海戰役",9.0)).index("movie_chn1122").`type`("movie").id("4").build()
val query = "{\n \"query\": {\n \"match\": {\n \"name\": \"紅海戰役\"\n }\n }\n}"
val sourceBuilder = new SearchSourceBuilder
sourceBuilder.query(new MatchQueryBuilder("name", "紅海戰役"))
sourceBuilder.sort("doubanScore", SortOrder.ASC)
val query2: String = sourceBuilder.toString
println(query2)
val search = new Search.Builder(query2).addIndex("movie_chn1122").addType("movie").build()
val result: SearchResult = jest.execute(search)
val movieRsList: util.List[SearchResult#Hit[Movie, Void]] = result.getHits(classOf[Movie])
import scala.collection.JavaConversions._
val movieList = ListBuffer[Movie]()
for (hit <- movieRsList) {
val movie: Movie = hit.source
movieList += movie
}
println(movieList.mkString("\n"))
jest.close()
}
case class Movie(id: Long, name: String, doubanScore: Double)
}
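A minimal saveBulk usage sketch; the index name and documents below are made up for illustration:
val docs: List[(String, AnyRef)] = List(
  ("1", MyEsUtil.Movie(1, "test movie", 8.5)),
  ("2", MyEsUtil.Movie(2, "demo movie", 7.0)))
MyEsUtil.saveBulk(docs, "movie_test1122") // bulk-writes two documents, using the tuple's first element as the document id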
Create DauInfo
Under scala, create com/atguigu/gmall/realtime/bean/DauInfo
case class DauInfo(
mid:String,
uid:String,
ar:String,
ch:String,
vc:String,
var dt:String,
var hr:String,
var mi:String,
ts:Long)
Create DauApp
Under scala, create com/atguigu/gmall/realtime/app/DauApp
import com.atguigu.gmall.realtime.utils.{MyEsUtil, MyKafkaUtil, OffsetManagerUtil, RedisUtil}
import java.lang
import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.{JSON, JSONObject}
import com.atguigu.gmall.realtime.bean.DauInfo
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
object DauApp {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("dau_app").setMaster("local[*]")
val ssc = new StreamingContext(sparkConf,Seconds(5))
// topic and groupId to consume
val topic="GMALL_START"
val groupId="GMALL_DAU_CONSUMER"
// fetch the saved offsets via OffsetManagerUtil.getOffset(), laying the groundwork for committing offsets manually (a sketch of this utility follows DauApp below)
val startOffset: Map[TopicPartition, Long] = OffsetManagerUtil.getOffset(groupId,topic)
// consume data from Kafka
val startInputDstream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(topic,ssc,startOffset,groupId)
//startInputDstream.map(_.value).print(1000)
// parse each record value into a standard JSON object
val startJsonObjDstream: DStream[JSONObject] = startInputDstream.map { record =>
val jsonString: String = record.value()
val jSONObject: JSONObject = JSON.parseObject(jsonString)
jSONObject
}
//write to the daily-active dedup set: one set per day, one key per day //not optimal: opens a new connection per record
// startJsonObjDstream.map{jsonObj=>
//
//   //Redis write -> type? set  key? dau:2020-05-12  value? mid
//   val dateStr: String = new SimpleDateFormat("yyyyMMdd").format(new Date(jsonObj.getLong("ts")))
//
//   val dauKey="dau:"+dateStr
//   val jedis = new Jedis("hadoop102",6379)
//   val mid: String = jsonObj.getJSONObject("common").getString("mid")
//   jedis.sadd(dauKey,mid)
//   jedis.close()
// }
// deduplicate via Redis
val startJsonObjWithDauDstream: DStream[JSONObject] = startJsonObjDstream.mapPartitions { jsonObjItr =>
// get a connection from the pool
val jedis = RedisUtil.getJedisClient
// materialize the iterator into a List of JSONObject
val jsonObjList: List[JSONObject] = jsonObjItr.toList
println("before filtering: " + jsonObjList.size)
// buffer for the records that survive deduplication
val jsonObjFilteredList = new ListBuffer[JSONObject]()
// iterate over the records
for (jsonObj <- jsonObjList) {
// format the log's ts timestamp into a date string
val dateStr: String = new SimpleDateFormat("yyyyMMdd").format(new Date(jsonObj.getLong("ts")))
// the daily key is prefixed, not the bare date -> dau:2020-05-12
val dauKey = "dau:" + dateStr
// extract mid from the log
val mid: String = jsonObj.getJSONObject("common").getString("mid")
// Redis SADD returns 0 or 1
val isFirstFlag: lang.Long = jedis.sadd(dauKey, mid)
// 1 means this mid was inserted for the first time, 0 means it was already present; this achieves the deduplication
if(isFirstFlag==1L){
jsonObjFilteredList+=jsonObj
}
}
jedis.close()
println("after filtering: " + jsonObjFilteredList.size)
// return the deduplicated records as an iterator
jsonObjFilteredList.toIterator
}
// startJsonObjWithDauDstream.print(1000)
// reshape the records
val dauInfoDstream: DStream[DauInfo] = startJsonObjWithDauDstream.map { jsonObj =>
val commonJsonObj: JSONObject = jsonObj.getJSONObject("common")
val dateTimeStr: String = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(jsonObj.getLong("ts")))
// split the datetime string on " "
val dateTimeArr: Array[String] = dateTimeStr.split(" ")
// take yyyy-MM-dd
val dt: String = dateTimeArr(0)
// split HH:mm on ":"
val timeArr: Array[String] = dateTimeArr(1).split(":")
// extract the hour and minute
val hr = timeArr(0)
val mi = timeArr(1)
// wrap the fields into a DauInfo
DauInfo(commonJsonObj.getString("mid"),
commonJsonObj.getString("uid"),
commonJsonObj.getString("ar"),
commonJsonObj.getString("ch"),
commonJsonObj.getString("vc"),
dt, hr, mi, jsonObj.getLong("ts")
)
}
// write into the gmall1122_dau_info_2020xxxxxx index
dauInfoDstream.foreachRDD {rdd=>
// iterate within each partition
rdd.foreachPartition { dauInfoItr =>
// build a List of (mid, DauInfo) pairs
val dataList: List[(String, DauInfo)] = dauInfoItr.toList.map { dauInfo => (dauInfo.mid, dauInfo) }
// current date, used to build the index name
val dt = new SimpleDateFormat("yyyyMMdd").format(new Date())
val indexName = "gmall1122_dau_info_" + dt
MyEsUtil.saveBulk(dataList, indexName)
}
}
ssc.start()
ssc.awaitTermination()
}
}
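Note that DauApp imports and calls OffsetManagerUtil, which is not shown in this section; as written, the app only reads saved offsets and does not yet commit them back. Below is a minimal sketch assuming offsets are kept in a Redis hash whose key is offset:topic:groupId, with the partition number as the field and the offset as the value; this key layout and the saveOffset counterpart are assumptions, not the definitive implementation:
import java.util
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
object OffsetManagerUtil {
  // read the saved offsets for a topic/groupId from Redis; an empty map means "start per auto.offset.reset"
  def getOffset(groupId: String, topic: String): Map[TopicPartition, Long] = {
    val jedis = RedisUtil.getJedisClient
    val offsetKey = "offset:" + topic + ":" + groupId
    val offsetMap: util.Map[String, String] = jedis.hgetAll(offsetKey)
    jedis.close()
    import scala.collection.JavaConversions._
    offsetMap.map { case (partition, offset) =>
      (new TopicPartition(topic, partition.toInt), offset.toLong)
    }.toMap
  }
  // write the offsets of a just-processed batch back to Redis
  def saveOffset(topic: String, groupId: String, offsetRanges: Array[OffsetRange]): Unit = {
    val jedis = RedisUtil.getJedisClient
    val offsetKey = "offset:" + topic + ":" + groupId
    val offsetMap = new util.HashMap[String, String]()
    for (offsetRange <- offsetRanges) {
      offsetMap.put(offsetRange.partition.toString, offsetRange.untilOffset.toString)
    }
    if (!offsetMap.isEmpty) jedis.hmset(offsetKey, offsetMap)
    jedis.close()
  }
}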
Create the index template
PUT _template/gmall_dau_info_template
{
"index_patterns": ["gmall_dau_info*"],
"settings": {
"number_of_shards": 3
},
"aliases" : {
"{index}-query": {},
"gmall_dau_info-query":{}
},
"mappings": {
"_doc":{
"properties":{
"mid":{
"type":"keyword"
},
"uid":{
"type":"keyword"
},
"ar":{
"type":"keyword"
},
"ch":{
"type":"keyword"
},
"vc":{
"type":"keyword"
},
"dt":{
"type":"keyword"
},
"hr":{
"type":"keyword"
},
"mi":{
"type":"keyword"
},
"ts":{
"type":"date"
}
}
}
}
}
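After the PUT succeeds, the registered template can be verified with a standard request:
GET _template/gmall_dau_info_template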
Start the required processes
This includes starting Redis.
Fixing a Redis startup error
If the following error is reported:
Exception in thread "main" redis.clients.jedis.exceptions.JedisConnectionException: Could not get a resource from the pool
Fix (note that the configuration file must be passed when running the command):
sudo bin/redis-server ./redis.conf
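Once the server is up, a quick connectivity check from any node (host and port as in config.properties):
bin/redis-cli -h hadoop102 -p 6379 ping   # should reply PONG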