Reference article: an introduction to Kudu and how to work with it
1. Connecting to Kudu from pyspark
pyspark --jars /home/oicq/guomm/kudu-spark2_2.11-1.6.0.jar  # start the shell with the Kudu-Spark jar
sqlContext = pyspark.sql.SQLContext(sc)  # build a SQL context from the shell's SparkContext
df = sqlContext.read.format('org.apache.kudu.spark.kudu').options(**{"kudu.master": "127.0.0.1:7051", "kudu.table": "python-example"}).load()  # read a Kudu table
df.write.format('org.apache.kudu.spark.kudu').option('kudu.master', '127.0.0.1:7051').option('kudu.table', 'python-example1').mode('append').save()  # append to a Kudu table
2. Connecting to Kudu from Scala Spark (remember to add the jars)
Jars:
kudu-client-1.6.0.jar
kudu-spark2_2.11-1.6.0.jar
pom.xml
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-client</artifactId>
    <version>1.6.0</version>
</dependency>
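The Spark examples below also need the kudu-spark2 integration on the classpath; a minimal sketch of the matching dependency, assuming the same 1.6.0 version as the jars listed above:
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-spark2_2.11</artifactId>
    <version>1.6.0</version>
</dependency>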
Creating a table
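The client snippets in this and the following sections assume these imports (a sketch; class locations as in the Kudu 1.6 Java client API):
import java.util
import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client._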
def createTable(client: KuduClient, tableName: String): Unit = {
  // Bring in the Java collection converters
  import scala.collection.JavaConverters._
  // Build a Java list of ColumnSchema objects describing the columns.
  // ColumnSchemaBuilder takes the column name and its Type;
  // chaining .key(true) marks the column as part of the primary key.
  val columns = List(
    new ColumnSchema.ColumnSchemaBuilder("word", Type.STRING).key(true).build(),
    new ColumnSchema.ColumnSchemaBuilder("cnt", Type.INT32).build()
  ).asJava
  // Wrap the column definitions in a Schema
  val schema = new Schema(columns)
  // CreateTableOptions holds the table-creation options
  val options: CreateTableOptions = new CreateTableOptions()
  // Set the replication factor to 1
  options.setNumReplicas(1)
  // The partition columns go into a Java list; here we distribute rows by "word".
  // (The list must be populated before it is passed to addHashPartitions.)
  val parcols: util.LinkedList[String] = new util.LinkedList[String]()
  parcols.add("word")
  // addHashPartitions is required; its two arguments are the list of
  // columns to hash on and the number of buckets
  options.addHashPartitions(parcols, 3)
  // client.createTable takes three arguments:
  // 1. the table name, 2. the schema, 3. the creation options
  client.createTable(tableName, schema, options)
}
Writing the main method
def main(args: Array[String]): Unit = {
  // Address of the Kudu master
  val KUDU_MASTERS = "localhost"
  // Build a KuduClient via KuduClient.KuduClientBuilder, passing in the master address
  val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
  // Table name
  val tableName = "ods"
  // Call createTable with the client and the table name
  createTable(client, tableName)
  // Close the client
  client.close()
}
Run it and check the web UI: the table has been created successfully.
Insert
def insertRows(client: KuduClient, tableName: String) = {
  // Open the table through client.openTable
  val table: KuduTable = client.openTable(tableName)
  // Every table operation goes through a session
  val session: KuduSession = client.newSession()
  // Insert some rows in a loop
  for (i <- 1 to 10) {
    // table.newInsert creates a new insert operation
    val insert: Insert = table.newInsert()
    // insert.getRow exposes the row to fill in
    val row: PartialRow = insert.getRow
    // Set the column values
    row.addString("word", s"dam-$i")
    row.addInt("cnt", 100 + i)
    // session.apply executes the operation
    session.apply(insert)
  }
}
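By default a session flushes each operation synchronously. For larger loads the writes can be batched with manual flushing; a sketch, assuming the same table schema as above:
def insertRowsBatched(client: KuduClient, tableName: String): Unit = {
  val table: KuduTable = client.openTable(tableName)
  val session: KuduSession = client.newSession()
  // Buffer operations locally and send them in one round trip
  session.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH)
  for (i <- 1 to 10) {
    val insert: Insert = table.newInsert()
    val row: PartialRow = insert.getRow
    row.addString("word", s"batch-$i")
    row.addInt("cnt", 200 + i)
    session.apply(insert)
  }
  session.flush()  // push the buffered operations to the servers
  session.close()  // closing also flushes any remaining operations
}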
Select
def query(client: KuduClient, tableName: String) = {
  // Open the table
  val table: KuduTable = client.openTable(tableName)
  // client.newScannerBuilder(table).build() produces a KuduScanner
  val scanner: KuduScanner = client.newScannerBuilder(table).build()
  // Loop while the scanner has more row batches
  while (scanner.hasMoreRows) {
    // scanner.nextRows() returns an iterator over the next batch of rows
    val iterator: RowResultIterator = scanner.nextRows()
    // Iterate over the rows in the batch
    while (iterator.hasNext) {
      val result: RowResult = iterator.next()
      println(result.getString("word") + " => " + result.getInt("cnt"))
    }
  }
}
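Scans can also push filters and column projections down to the servers instead of fetching the whole table; a sketch using KuduPredicate, assuming the schema created above:
def queryFiltered(client: KuduClient, tableName: String): Unit = {
  import scala.collection.JavaConverters._
  val table: KuduTable = client.openTable(tableName)
  // Push the filter cnt > 105 and a two-column projection to the servers
  val predicate = KuduPredicate.newComparisonPredicate(
    table.getSchema.getColumn("cnt"), KuduPredicate.ComparisonOp.GREATER, 105L)
  val scanner: KuduScanner = client.newScannerBuilder(table)
    .addPredicate(predicate)
    .setProjectedColumnNames(List("word", "cnt").asJava)
    .build()
  while (scanner.hasMoreRows) {
    val iterator: RowResultIterator = scanner.nextRows()
    while (iterator.hasNext) {
      val result: RowResult = iterator.next()
      println(result.getString("word") + " => " + result.getInt("cnt"))
    }
  }
}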
Check the console output: this confirms that both the insert and the query work.
Update
def alterRow(client: KuduClient, tableName: String) = {
  // Open the table
  val table: KuduTable = client.openTable(tableName)
  // Create a session
  val session: KuduSession = client.newSession()
  // table.newUpdate creates an update operation
  val update: Update = table.newUpdate()
  // Get the row to fill in
  val row: PartialRow = update.getRow
  // Identify the row by its primary key ("word" = "dam-10")
  // and set its "cnt" column to 8888
  row.addString("word", "dam-10")
  row.addInt("cnt", 8888)
  session.apply(update)
}
Check the console output.
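Deleting a single row follows the same pattern: create a Delete operation, set the primary key of the row to remove, and apply it through a session (a sketch, assuming the schema above):
def deleteRow(client: KuduClient, tableName: String) = {
  val table: KuduTable = client.openTable(tableName)
  val session: KuduSession = client.newSession()
  // A Delete only needs the primary key of the row to remove
  val delete: Delete = table.newDelete()
  delete.getRow.addString("word", "dam-10")
  session.apply(delete)
}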
Renaming a table
def renameTable(client: KuduClient, tableName: String, newTableName: String) = {
  // Build an AlterTableOptions
  val options: AlterTableOptions = new AlterTableOptions()
  // Register the new name through options.renameTable
  options.renameTable(newTableName)
  // client.alterTable applies the change; its two arguments are
  // the old table name and the AlterTableOptions built above
  client.alterTable(tableName, options)
}
Check the web UI.
Deleting a table
def deleteTable(client: KuduClient, tableName: String) = {
  // A single client.deleteTable call is all it takes
  client.deleteTable(tableName)
}
Complete example
package com.is

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.kudu.spark.kudu._

object SparkKuduWrite {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: SparkKuduWrite <data_path> <kudu_table_name> <kudu_master_hosts>")
      System.exit(1)
    }
    val data_path = args(0)
    val kudu_table_name = args(1)
    val kudu_master_hosts = args(2)
    println(data_path)
    println(kudu_table_name)
    println(kudu_master_hosts)

    val conf = new SparkConf().setAppName("stra_platform_test")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext

    val kuduContext = new KuduContext(kudu_master_hosts, sc)
    val df = spark.read.load(data_path)
    // All Kudu operations are available through the KuduContext
    kuduContext.upsertRows(df, kudu_table_name)
  }
}
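Beyond upsertRows, the KuduContext exposes the other table operations, and the table can be read back as a DataFrame; a sketch of both, reusing the variables from the example above (and assuming the key column is "word"):
// Other KuduContext operations from the kudu-spark API
if (kuduContext.tableExists(kudu_table_name)) {
  kuduContext.insertRows(df, kudu_table_name)                  // fails on duplicate keys
  kuduContext.deleteRows(df.select("word"), kudu_table_name)   // needs only the key columns
}

// Read the Kudu table back into a DataFrame
val kuduDf = spark.read
  .options(Map("kudu.master" -> kudu_master_hosts, "kudu.table" -> kudu_table_name))
  .format("org.apache.kudu.spark.kudu")
  .load()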
3. Useful links
Kudu homepage: https://kudu.apache.org/docs/index.html
Kudu partitioning details: https://kudu.apache.org/docs/schema_design.html
Ways of working with Kudu: https://kudu.apache.org/docs/developing.html#_viewing_the_api_documentation
Kudu Python client source: https://github.com/apache/kudu/blob/master/python/kudu/client.pyx
Detailed Kudu + Scala Spark examples: https://blog.cloudera.com/blog/2017/02/up-and-running-with-apache-spark-on-apache-kudu/