RDD 與 MySQL 交互:
pom.xml 文件依賴:
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.27</version>
</dependency>
插入數據不能保證有序,因爲是每個分區在 executor 上執行操作。
初始化對象(如連接)要在插入數據的 sql 代碼塊外部初始化,並用 foreachPartition / mapPartitions 等按分區操作的方法,避免每條記錄都新建連接。
// Read from the database via the Spark JDBC data source.
import java.util.Properties
val prop = {
  val p = new Properties()
  p.put("driver", "com.mysql.jdbc.Driver")
  p.put("url", "jdbc:mysql://10.18.2.3:3306/dbname?"+
    "useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true")
  p.put("user", "username")
  p.put("password", "****")
  p
}
// "dbtable" accepts either a plain table name or an aliased subquery.
// NOTE: the two puts below use the same key, so only the SECOND value
// (the subquery form) takes effect — keep whichever variant you need.
prop.put("dbtable", "tablename")
prop.put("dbtable", "(select * from tablename) tablename_")
// Explicit JavaConverters (.asScala) instead of the deprecated implicit
// scala.collection.JavaConversions conversion.
import scala.collection.JavaConverters._
spark.read.format("jdbc").options(prop.asScala).load()
// Write back to the database: append the DataFrame rows to an existing table.
import org.apache.spark.sql.SaveMode
val targetUrl = prop.getProperty("url")
df.write
  .mode(SaveMode.Append) // never truncates; only adds rows
  .jdbc(targetUrl, "tablename", prop)
不用spark API 的話自己寫:
import java.sql.DriverManager
import java.util.Properties
object JDBCUtil extends Serializable {
  /** Sole factory for [[JDBCUtil]] — the class constructor is private. */
  def apply(prop: Properties) = {
    new JDBCUtil(prop)
  }
}

/**
 * Small serializable JDBC helper carrying connection settings.
 * Expects the keys "driver", "url", "user" and "password" in `prop`.
 */
class JDBCUtil private (prop: Properties) extends Serializable {
  // Eagerly load the driver class so connection attempts fail fast.
  Class.forName(prop.getProperty("driver"))

  val url = prop.getProperty("url")
  private[this] val user = prop.getProperty("user")
  private[this] val password = prop.getProperty("password")

  /** Opens a brand-new connection on every call; the caller must close it. */
  def getConnect() = DriverManager.getConnection(url, user, password)

  /** Returns a defensive copy of the settings so callers cannot mutate ours. */
  def getJDBCProp() = {
    val snapshot = new Properties()
    snapshot.putAll(prop)
    snapshot
  }
}
// import org.apache.spark.SparkFiles
/**
 * Application-wide database access point built on [[JDBCUtil]].
 * NOTE(review): credentials are hard-coded here; consider loading them from a
 * properties file distributed via SparkFiles (see the commented-out line).
 */
object DBUtil extends Serializable {
  private[this] val jdbc_conf = {
    val p = new Properties()
    p.put("driver", "com.mysql.jdbc.Driver")
    p.put("url", "jdbc:mysql://10.18.2.3:3306/dbname?" +
      "useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true")
    p.put("user", "username")
    p.put("password", "****")
    p
  }
  //private[this] val jdbc_file = SparkFiles.get("route_analysis.properties")
  private[this] val jdbcUtil = JDBCUtil(jdbc_conf)
  val url = jdbcUtil.url
  /** A fresh connection per call; the caller is responsible for closing it. */
  def conn = jdbcUtil.getConnect
  /** A defensive copy of the JDBC settings. */
  def prop = jdbcUtil.getJDBCProp
  // Explicit .asScala (JavaConverters) replaces the deprecated implicit
  // scala.collection.JavaConversions conversion used originally.
  import scala.collection.JavaConverters._
  def options: scala.collection.mutable.Map[String, String] = prop.asScala
}
// 使用:
/**
 * Upserts the "col" column of every row in `df` into the target table:
 * inserts a new record when no row with that value exists, otherwise
 * updates the existing row (matched by its primary key `id`).
 */
def update(df: DataFrame): Unit = {
  val table = "tablename"
  df.foreachPartition { part =>
    // One connection per partition (runs on the executor); statements are
    // prepared once and reused per row instead of being re-created each time.
    val conn = DBUtil.conn
    try {
      // Parameterized SQL — fixes the SQL-injection risk of the original
      // string-interpolated queries. The query also selects `id`, which the
      // update branch needs (the original referenced an undefined `id`).
      val query  = conn.prepareStatement(s"select id, col from ${table} where col = ?")
      val insert = conn.prepareStatement(s"insert into ${table} (col) values (?)")
      val change = conn.prepareStatement(s"update ${table} set col = ? where id = ?")
      try {
        part.foreach { r =>
          val col = r.getAs[String]("col")
          query.setString(1, col)
          val res = query.executeQuery()
          try {
            if (!res.next()) {
              // No existing record: insert one.
              insert.setString(1, col)
              insert.executeUpdate()
            } else {
              // Record found: refresh it, keyed by its id.
              val id = res.getLong("id")
              change.setString(1, col)
              change.setLong(2, id)
              change.executeUpdate()
            }
          } finally res.close()
        }
      } finally {
        query.close(); insert.close(); change.close()
      }
    } finally conn.close() // fix: the original leaked the connection
  }
  println(s"update表(${table})完成...")
}
Hbase 交互
在resource 目錄下放 hbase-site.xml 文件
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.3.1</version>
</dependency>
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
object SparkOperateHbase {
  /**
   * Scans the HBase table "student" into an RDD, prints the row count and
   * then every row's key plus the info:name/gender/age columns.
   */
  def main(args: Array[String]): Unit = {
    // HBase connection settings come from hbase-site.xml on the classpath.
    val conf = HBaseConfiguration.create()
    // Table to read.
    conf.set(TableInputFormat.INPUT_TABLE, "student")
    val sc = new SparkContext(new SparkConf())
    try {
      // Each element is (row key writable, Result); Result holds the cells.
      val stuRdd = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      stuRdd.cache() // reused by both count() and foreach() below
      val count = stuRdd.count()
      println("Student rdd count:" + count)
      stuRdd.foreach { case (_, result) =>
        // Row key, then result.getValue(family, qualifier) for each column.
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
        val gender = Bytes.toString(result.getValue("info".getBytes, "gender".getBytes))
        val age = Bytes.toString(result.getValue("info".getBytes, "age".getBytes))
        println("Row key:" + key + " Name:" + name + " Gender:" + gender + " Age:" + age)
      }
    } finally sc.stop() // fix: the original never stopped the SparkContext
  }
}
插入數據
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}
object HbasePut {
  /**
   * Writes two hard-coded rows into the HBase table "student" through
   * TableOutputFormat: the first CSV field is the row key, the remaining
   * three populate the info:name/gender/age columns.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HbasePut").setMaster("local")
    val sc = new SparkContext(sparkConf)
    try {
      val tableName = "student"
      sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName)
      // Job.getInstance replaces the deprecated `new Job(conf)` constructor.
      val job = Job.getInstance(sc.hadoopConfiguration)
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      job.setOutputValueClass(classOf[Result])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
      // makeRDD is parallelize with optional per-partition location hints.
      val indataRdd = sc.makeRDD(Array("3,zhang,M,26", "4,yue,M,27"))
      val rdd = indataRdd.map(_.split(",")).map { arr =>
        val put = new Put(Bytes.toBytes(arr(0))) // row key
        // addColumn replaces the deprecated Put.add(family, qualifier, value).
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(arr(3)))
        // TableOutputFormat consumes (writable key, Put) pairs; the key is ignored.
        (new ImmutableBytesWritable, put)
      }
      rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
    } finally sc.stop() // fix: the original never stopped the SparkContext
  }
}