Code on GitHub: https://github.com/SmallScorpion/flink-tutorial.git
Sink
Flink has no foreach-style method like Spark's that lets users iterate over records arbitrarily; all output to external systems has to go through a Sink. The final output step of a job is wired up roughly like this:
stream.addSink(new MySink(xxxx))
The official distribution ships sinks for a number of frameworks; anything beyond those requires a user-defined sink.
Apache Bahir contributes additional sinks such as Redis.
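As a minimal sketch of such a custom sink (the MySink name and the println body are purely illustrative), it is enough to extend SinkFunction and override invoke, which Flink calls once per record:

import org.apache.flink.streaming.api.functions.sink.SinkFunction

// Illustrative custom sink: prints every record with a prefix
class MySink(prefix: String) extends SinkFunction[String] {

  // invoke is called once for every element of the stream
  override def invoke(value: String, context: SinkFunction.Context[_]): Unit = {
    println(prefix + value)
  }
}

It would then be attached with stream.addSink(new MySink("out> ")).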
Output to a file
import com.atguigu.bean.SensorReading
import org.apache.flink.api.common.serialization.SimpleStringEncoder
import org.apache.flink.core.fs.Path
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.scala._

object SinkToFile {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val inputDStream: DataStream[String] = env.readTextFile("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\SensorReading.txt")

    val dataDstream: DataStream[String] = inputDStream.map(
      data => {
        val dataArray: Array[String] = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble).toString
      })

    // Write directly to a text file
    //dataDstream.writeAsText("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\out")

    // writeAsText is deprecated; the newer approach is addSink with a StreamingFileSink,
    // which writes the output into time-based bucket directories
    dataDstream.addSink( StreamingFileSink.forRowFormat[String](
      new Path("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\out"),
      new SimpleStringEncoder[String]("UTF-8")
    ).build() )

    env.execute("sink test job")
  }
}
Output to Kafka
pom.xml
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-kafka-0.11 -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
SinkToKafka.scala
import java.util.Properties

import com.atguigu.bean.SensorReading
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer011, FlinkKafkaProducer011}

/**
 * End-to-end pipeline: a Kafka source produces the data, Flink transforms it,
 * and the result is written back out through a Kafka sink
 */
object SinkToKafka {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Read data from Kafka
    val properties: Properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop102:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    // Messages sent to the Kafka topic "sensor" are consumed into Flink
    val inputDStream: DataStream[String] = env.addSource(
      new FlinkKafkaConsumer011[String]("sensor", new SimpleStringSchema(), properties)
    )

    // Transformation
    val dataDstream: DataStream[String] = inputDStream.map(
      data => {
        val dataArray: Array[String] = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble).toString
      })

    // Write the result to the Kafka topic "sink_test"
    dataDstream.addSink(
      new FlinkKafkaProducer011[String]("hadoop102:9092", "sink_test", new SimpleStringSchema())
    )

    env.execute("sink test job")
  }
}
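The three-argument producer constructor above gives at-least-once delivery at best. The 0.11 connector also has a constructor that takes producer Properties and a Semantic for transactional, exactly-once writes; a sketch of swapping it in for the addSink call above (the extra properties object and the timeout value are assumptions for illustration):

import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper

// Producer properties; keep the transaction timeout below the broker's
// transaction.max.timeout.ms (15 minutes by default)
val producerProps = new Properties()
producerProps.setProperty("bootstrap.servers", "hadoop102:9092")
producerProps.setProperty("transaction.timeout.ms", "300000")

// Transactional producer writing to "sink_test" with exactly-once semantics
dataDstream.addSink(
  new FlinkKafkaProducer011[String](
    "sink_test",
    new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()),
    producerProps,
    FlinkKafkaProducer011.Semantic.EXACTLY_ONCE
  )
)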
Output to Redis
pom.xml
<!-- https://mvnrepository.com/artifact/org.apache.bahir/flink-connector-redis -->
<dependency>
    <groupId>org.apache.bahir</groupId>
    <artifactId>flink-connector-redis_2.11</artifactId>
    <version>1.0</version>
</dependency>
RedisSink takes a FlinkJedisConfigBase argument, which is an abstract class; its subclass FlinkJedisPoolConfig can be used here.
FlinkJedisPoolConfig's constructor is private, so build it through its Builder.
Finally, define a custom RedisMapper.
import com.atguigu.bean.SensorReading
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.redis.RedisSink
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig
import org.apache.flink.streaming.connectors.redis.common.mapper.{RedisCommand, RedisCommandDescription, RedisMapper}

object SinkToRedis {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val inputDStream: DataStream[String] = env.readTextFile("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\SensorReading.txt")

    val dataDstream: DataStream[SensorReading] = inputDStream.map(
      data => {
        val dataArray: Array[String] = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })

    // Jedis connection-pool configuration
    val config: FlinkJedisPoolConfig = new FlinkJedisPoolConfig.Builder()
      .setHost("hadoop102")
      .setPort(6379)
      .build() // build the config object

    dataDstream.addSink( new RedisSink[SensorReading](config, MyRedisMapper()) )

    env.execute("sink test job")
  }
}

// Custom RedisMapper
case class MyRedisMapper() extends RedisMapper[SensorReading]{

  // The Redis command to issue for each record: HSET sensor_temp key value
  override def getCommandDescription: RedisCommandDescription =
    new RedisCommandDescription(RedisCommand.HSET, "sensor_temp")

  override def getKeyFromData(data: SensorReading): String = data.id

  override def getValueFromData(data: SensorReading): String = data.temperature.toString
}
Output to Elasticsearch
pom.xml
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-elasticsearch6_2.11</artifactId>
    <version>1.10.0</version>
</dependency>
Define a list of httpHosts and an ElasticsearchSinkFunction.
import java.util

import com.atguigu.bean.SensorReading
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
import org.apache.http.HttpHost
import org.elasticsearch.action.index.IndexRequest
import org.elasticsearch.client.Requests

/**
 * Sink to Elasticsearch
 */
object SinkToES {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val inputDStream: DataStream[String] = env.readTextFile("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\SensorReading.txt")

    val dataDstream: DataStream[SensorReading] = inputDStream.map(
      data => {
        val dataArray: Array[String] = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })

    // Define the Elasticsearch http hosts
    val httpHosts: util.ArrayList[HttpHost] = new util.ArrayList[HttpHost]()
    // add host and port
    httpHosts.add( new HttpHost("hadoop102", 9200) )

    // Define the ElasticsearchSinkFunction
    val esSinkFunc: ElasticsearchSinkFunction[SensorReading] = new ElasticsearchSinkFunction[SensorReading]() {
      override def process(element: SensorReading, ctx: RuntimeContext, indexer: RequestIndexer): Unit = {
        // First build the document source to write to ES
        val dataSource = new util.HashMap[String, String]()
        dataSource.put("sensor_id", element.id)
        dataSource.put("temp", element.temperature.toString)
        dataSource.put("ts", element.timestamp.toString)

        // Build the index request ("sensor" is the index, roughly analogous to a table)
        val indexRequest: IndexRequest = Requests.indexRequest()
          .index("sensor")
          .`type`("data")
          .source(dataSource)

        // Hand the request to the RequestIndexer, which issues the HTTP call
        indexer.add(indexRequest)

        println("data " + element + " saved successfully")
      }
    }

    dataDstream.addSink( new ElasticsearchSink.Builder[SensorReading](httpHosts, esSinkFunc).build() )

    env.execute("sink test job")
  }
}
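One practical note: the Elasticsearch sink buffers actions and sends them to the cluster in bulk, so with a small test file nothing may show up in the index right away. The connector's Builder exposes bulk-flush settings; a sketch that flushes after every element, used in place of the addSink line above (the esSinkBuilder variable name is just illustrative):

// Flush after every single action so test records become visible immediately
val esSinkBuilder = new ElasticsearchSink.Builder[SensorReading](httpHosts, esSinkFunc)
esSinkBuilder.setBulkFlushMaxActions(1)
dataDstream.addSink( esSinkBuilder.build() )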
Output to MySQL
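pom.xml (a MySQL JDBC driver is also needed on the classpath; the version below is only illustrative, match it to your MySQL server)
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.44</version>
</dependency>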
import java.sql.{Connection, DriverManager, PreparedStatement}

import com.atguigu.bean.SensorReading
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.api.scala._

object SinkToJDBC {

  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val inputDStream: DataStream[String] = env.readTextFile("D:\\MyWork\\WorkSpaceIDEA\\flink-tutorial\\src\\main\\resources\\SensorReading.txt")

    val dataDstream: DataStream[SensorReading] = inputDStream.map(
      data => {
        val dataArray: Array[String] = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })

    dataDstream.addSink( MyJdbcSink() )
    dataDstream.print("mysql")

    env.execute("sink test job")
  }
}

case class MyJdbcSink() extends RichSinkFunction[SensorReading]{

  // Connection and prepared-statement handles
  var conn: Connection = _
  var insertStmt: PreparedStatement = _
  var updateStmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    // Create the connection and the prepared statements
    conn = DriverManager.getConnection("jdbc:mysql://hadoop102:3306/flink", "root", "000000")
    insertStmt = conn.prepareStatement("insert into sensor_temp(id, temperature) values(?, ?)")
    updateStmt = conn.prepareStatement("update sensor_temp set temperature = ? where id = ?")
  }

  // invoke is called once per record and runs SQL over the open connection
  override def invoke(value: SensorReading, context: SinkFunction.Context[_]): Unit = {
    // Try the UPDATE first; if it touched no row, fall back to INSERT
    updateStmt.setDouble(1, value.temperature)
    updateStmt.setString(2, value.id)
    updateStmt.execute()
    if (updateStmt.getUpdateCount == 0) {
      insertStmt.setString(1, value.id)
      insertStmt.setDouble(2, value.temperature)
      insertStmt.execute()
    }
  }

  override def close(): Unit = {
    insertStmt.close()
    updateStmt.close()
    conn.close()
  }
}