一、Overview
1、A simulator program generates, for each point in time, the place a user has reached and the number of steps taken, and writes these records to a Kafka topic in real time; Spark Streaming consumes the records from Kafka in real time, analyzes them, and persists the results. Here the results are stored directly in MySQL.
2、The recommended pattern for writing from Spark Streaming to MySQL, and the one used below: inside foreachRDD, iterate with foreachPartition and borrow a single pooled connection per partition, rather than opening a connection per record (see the sketch after this list).
3、Create the MySQL table in advance:
create table walk_info(user varchar(20),counttime varchar(40),walkplace nvarchar(100),newwalknum int(20));
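A minimal sketch of the pattern from item 2, assuming a DStream named stream and the MySqlConnectionPool class defined in section 二 (the try/finally guard is an illustrative addition; the full program appears in part 5):

stream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // one pooled connection per partition, never one per record
    val conn = MySqlConnectionPool.getConnection
    try {
      partition.foreach { record =>
        // write `record` to MySQL over `conn`
      }
    } finally {
      MySqlConnectionPool.returnConnection(conn)
    }
  }
}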
二、Implementation
1、Maven dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.spark.demo</groupId>
<artifactId>sparkProgram</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-compiler</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-reflect</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.1</version>
<!--<scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.gavaghan</groupId>
<artifactId>geodesy</artifactId>
<version>1.1.3</version>
</dependency>
<dependency>
<groupId>com.github.scopt</groupId>
<artifactId>scopt_2.11</artifactId>
<version>3.7.0</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.codehaus.jettison/jettison -->
<dependency>
<groupId>org.codehaus.jettison</groupId>
<artifactId>jettison</artifactId>
<version>1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sf.json-lib/json-lib -->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-pool2 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
<version>2.4.2</version>
</dependency>
<!-- Oracle JDBC driver -->
<dependency>
<groupId>com.oracle</groupId>
<artifactId>ojdbc6</artifactId>
<version>12.1.0.1-atlassian-hosted</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-server -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive-thriftserver -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive-thriftserver_2.11</artifactId>
<version>2.1.1</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.11.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-streams</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>1.0.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-streams-scala -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-streams-scala_2.11</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<dependency>
<groupId>commons-dbcp</groupId>
<artifactId>commons-dbcp</artifactId>
<version>1.4</version>
</dependency>
</dependencies>
<build>
<finalName>CountWalk-1.0.0</finalName>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.12.4</version>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<!-- <descriptor>src/main/resources/assembly.xml</descriptor> -->
<appendAssemblyId>false</appendAssemblyId>
</configuration>
</plugin>
<!-- copy dependency jars into the lib directory -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>
${project.build.directory}/lib
</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2、Data simulator
package com.hyj.util

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.codehaus.jettison.json.JSONObject
import scala.util.Random

/**
 * A producer that publishes simulated data to a Kafka cluster.
 * Scenario: track users' walking in real time, emitting one record per second
 * with the user, the sample time, the current location, and the steps added.
 */
object KafkaEventProducer {
  // the simulated users
  private val users = Array(
    "zhangSan", "liSi",
    "wangWu", "xiaoQiang"
  )
  private var pointer = -1

  // return the next user, round-robin
  def getUser(): String = {
    pointer = (pointer + 1) % users.length
    users(pointer)
  }

  val random = new Random()

  // random number of new steps (1 to users.length)
  def getNewStepNum(): Int = {
    random.nextInt(users.length) + 1
  }

  // timestamp of the sample
  def getTime(): Long = {
    System.currentTimeMillis()
  }

  // candidate walking locations
  val walkPlace = Array(
    "操場南門", "操場東門", "操場北門", "操場西門", "操場東南門", "操場西北門", "操場西南門", "操場東北門"
  )

  def getWalkPlace(): String = {
    walkPlace(random.nextInt(walkPlace.length))
  }

  def main(args: Array[String]): Unit = {
    val topic = "topic_walkCount"
    val brokers = "192.168.230.21:6667,192.168.230.22:6667,192.168.230.23:6667"
    // producer configuration
    val props = new Properties()
    props.setProperty("bootstrap.servers", brokers)
    props.setProperty("metadata.broker.list", brokers) // legacy old-producer property; unused by KafkaProducer
    props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    // create the producer
    val producer = new KafkaProducer[String, String](props)
    // emit one record per second
    while (true) {
      val event = new JSONObject()
      event.put("user", getUser())
        //.put("count_time", getTime())
        .put("count_time", TimeUtil.tranTimeToString(getTime().toString))
        .put("walk_place", getWalkPlace())
        .put("new_walkNum", getNewStepNum())
      println(event.toString())
      // send to Kafka
      producer.send(new ProducerRecord[String, String](topic, event.toString))
      Thread.sleep(1000)
    }
  }
}
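Each record is a small JSON document. An illustrative line of the producer's output (field names match the code above; the values shown are made up for illustration):

{"user":"zhangSan","count_time":"2020-06-30 16:45:52","walk_place":"操場西門","new_walkNum":3}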
3、Database connection pool
package com.hyj.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.util.LinkedList;

// A minimal hand-rolled pool: five connections are created lazily on first use.
// poll() returns null once the pool is exhausted, so callers must always
// return their connection when done.
public class MySqlConnectionPool {
    private static LinkedList<Connection> connectionQueue;

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    public synchronized static Connection getConnection() {
        try {
            if (connectionQueue == null) {
                connectionQueue = new LinkedList<Connection>();
                for (int i = 0; i < 5; i++) {
                    Connection conn = DriverManager.getConnection(
                        "jdbc:mysql://192.168.230.21:3306/test?characterEncoding=utf8&useSSL=true",
                        "root",
                        "123456"
                    );
                    connectionQueue.push(conn);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return connectionQueue.poll();
    }

    public synchronized static void returnConnection(Connection conn) {
        connectionQueue.push(conn);
    }
}
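Callers are responsible for handing connections back. A minimal usage sketch from Scala (the try/finally is an addition for illustration; the main program in part 5 returns the connection without one):

val conn = MySqlConnectionPool.getConnection
try {
  // use the connection ...
} finally {
  MySqlConnectionPool.returnConnection(conn)
}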
4、Utility class
package com.hyj.util

import java.text.SimpleDateFormat
import java.util.{Calendar, Date}

object TimeUtil {
  // parse a "yyyy-MM-dd HH:mm:ss" string into epoch milliseconds
  def tranTimeToLong(tm: String): Long = {
    val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    fm.parse(tm).getTime()
  }

  // format an epoch-millisecond string as "yyyy-MM-dd HH:mm:ss"
  def tranTimeToString(tm: String): String = {
    val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    fm.format(new Date(tm.toLong))
  }

  // last day of the current month, as "yyyy-MM-dd 23:59:59"
  def getLastDateOfMonth(): String = {
    val now: Date = new Date()
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
    val dateNow = dateFormat.format(now)
    val year = dateNow.substring(0, 4).toInt
    val month = dateNow.substring(5, 7).toInt
    val cal = Calendar.getInstance()
    cal.set(Calendar.YEAR, year)
    //cal.set(Calendar.MONTH, month - 2) // last day of the previous month
    cal.set(Calendar.MONTH, month - 1) // Calendar.MONTH is zero-based
    cal.set(Calendar.DAY_OF_MONTH, cal.getActualMaximum(Calendar.DATE))
    new SimpleDateFormat("yyyy-MM-dd").format(cal.getTime()) + " 23:59:59"
  }

  // current time as "yyyy-MM-dd HH:mm:ss"
  def getCurrentTime(): String = {
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())
  }

  // seconds between two date-time strings
  def getTimeDifferenceOfTwoTime(startTime: String, endTime: String): Int = {
    ((tranTimeToLong(endTime) - tranTimeToLong(startTime)) / 1000).toInt
  }

  def main(args: Array[String]): Unit = {
    println(getCurrentTime())
    println(getTimeDifferenceOfTwoTime(getCurrentTime(), "2020-03-11 14:35:40"))
  }
}
5、Main program
package com.hyj.main

import com.hyj.util.MySqlConnectionPool
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.codehaus.jettison.json.JSONObject

/**
 * Tracks users' real-time walking records (user, sample time, location, new steps)
 * and saves each user's continuously updated step information to MySQL.
 */
object kafka2sparkStreaming2mySql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kafka2sparkStreaming2mySql")
      .setMaster("local[1]")
      //.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // batch interval of 1s
    val ssc = new StreamingContext(conf, Seconds(1))
    // control the log level
    ssc.sparkContext.setLogLevel("WARN") //WARN,INFO,DEBUG
    ssc.checkpoint("checkpoint")
    val topic = "topic_walkCount"
    val groupId = "t03"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.230.21:6667,192.168.230.22:6667,192.168.230.23:6667",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "earliest", // on first start, consume from the earliest offset
      "enable.auto.commit" -> (false: java.lang.Boolean) // disable auto-commit; offsets are committed manually below
    )
    val topics = Array(topic)
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, // distribute partitions evenly across executors
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD(rdd => {
      // record the offset range consumed from each partition
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition(partitions => {
        val conn = MySqlConnectionPool.getConnection
        partitions.foreach(records => {
          val record = new JSONObject(records.value())
          val user = record.getString("user")
          val countTime = record.getString("count_time")
          val walkPlace = record.getString("walk_place")
          val newWalkNum = record.getInt("new_walkNum")
          println(record.toString)
          val sql = "insert into walk_info(user,counttime,walkplace,newwalknum) values('" + user + "','" + countTime + "','" + walkPlace + "'," + newWalkNum + ")"
          println(sql)
          val stmt = conn.createStatement
          stmt.executeUpdate(sql)
          stmt.close()
        })
        MySqlConnectionPool.returnConnection(conn)
      })
      // commit the consumed offsets manually once the batch has been written
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
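The insert above is built by string concatenation, which breaks on values containing quotes and is open to SQL injection. A sketch of a safer body for foreachPartition using a PreparedStatement (names match the code above; this is an illustrative variant, not what was run):

val conn = MySqlConnectionPool.getConnection
val ps = conn.prepareStatement(
  "insert into walk_info(user,counttime,walkplace,newwalknum) values(?,?,?,?)")
partitions.foreach(records => {
  val record = new JSONObject(records.value())
  ps.setString(1, record.getString("user"))
  ps.setString(2, record.getString("count_time"))
  ps.setString(3, record.getString("walk_place"))
  ps.setInt(4, record.getInt("new_walkNum"))
  ps.executeUpdate()
})
ps.close()
MySqlConnectionPool.returnConnection(conn)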
三、A problem encountered
java.sql.SQLException: Incorrect string value: '\xE6\x93\x8D\xE5\x9C\xBA...' for column walkplace
The column's character set cannot store the UTF-8 Chinese place names.
Fix: in the CREATE TABLE statement, change the column type from varchar to nvarchar (in MySQL, NVARCHAR is a VARCHAR that uses the national character set, utf8, so it can hold the Chinese text); declaring the table with DEFAULT CHARSET=utf8mb4 would work equally well.
四、Results
mysql> select * from walk_info;
+-----------+---------------------+-----------------+------------+
| user | counttime | walkplace | newwalknum |
+-----------+---------------------+-----------------+------------+
| zhangSan | 2020-06-30 16:45:52 | 操場西門 | 1 |
| liSi | 2020-06-30 16:45:54 | 操場西南門 | 3 |
| wangWu | 2020-06-30 16:45:55 | 操場北門 | 3 |
| xiaoQiang | 2020-06-30 16:45:56 | 操場東南門 | 3 |
| zhangSan | 2020-06-30 16:45:57 | 操場北門 | 4 |
| liSi | 2020-06-30 16:45:58 | 操場西北門 | 1 |
| zhangSan | 2020-06-30 16:51:51 | 操場西南門 | 4 |
| liSi | 2020-06-30 16:51:53 | 操場西北門 | 3 |
| wangWu | 2020-06-30 16:51:54 | 操場北門 | 3 |
| xiaoQiang | 2020-06-30 16:51:55 | 操場西北門 | 3 |
| zhangSan | 2020-06-30 16:51:56 | 操場南門 | 2 |
| liSi | 2020-06-30 16:51:57 | 操場西南門 | 3 |
| wangWu | 2020-06-30 16:51:58 | 操場東門 | 3 |
| xiaoQiang | 2020-06-30 16:51:59 | 操場東北門 | 2 |
| zhangSan | 2020-06-30 16:52:00 | 操場東南門 | 4 |
+-----------+---------------------+-----------------+------------+
15 rows in set (0.00 sec)
mysql>