RDD 與 MySQL 和 HBase 交互

RDD 與 MYSQL 交互:

pom.xml 文件依賴:

<dependency>
     <groupId>mysql</groupId>
     <artifactId>mysql-connector-java</artifactId>
     <version>5.1.27</version>
</dependency>
注意:插入數據時不能保證順序,因爲各分區是在 executor 上並行執行寫入操作的。
數據庫連接等對象應在逐條插入數據的 SQL 代碼塊之外初始化,並使用按分區處理的方法(如 foreachPartition、mapPartitions),使每個分區只初始化一次。
//讀數據庫
import java.util.Properties
// JDBC connection settings (driver class, URL, user, password) collected in a
// java.util.Properties object so the read and write examples can share them.
val prop = {
  val settings = new Properties()
  val jdbcUrl = "jdbc:mysql://10.18.2.3:3306/dbname?" +
    "useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true"
  Seq(
    "driver" -> "com.mysql.jdbc.Driver",
    "url" -> jdbcUrl,
    "user" -> "username",
    "password" -> "****"
  ).foreach { case (key, value) => settings.put(key, value) }
  settings
}
// "dbtable" may be a plain table name or a parenthesised subquery with an alias.
// Both forms are shown; the second put overwrites the first, so the subquery
// form is the one actually used by the read below.
prop.put("dbtable", "tablename")
prop.put("dbtable", "(select * from tablename) tablename_")
// Convert the Properties to a Scala Map[String, String] explicitly with `.asScala`
// instead of relying on the deprecated implicit scala.collection.JavaConversions
// (removed in Scala 2.13).
import scala.collection.JavaConverters._
spark.read.format("jdbc").options(prop.asScala).load()

//寫數據庫:
import org.apache.spark.sql.SaveMode
// Append the rows of `df` to table "tablename" over JDBC, reusing the URL and
// connection settings stored in `prop`.
df.write.mode(SaveMode.Append).jdbc(
  url = prop.getProperty("url"),
  table = "tablename",
  connectionProperties = prop
)

若不使用 Spark 自帶的 JDBC API,也可以自己封裝 JDBC 連接:

import java.sql.DriverManager
import java.util.Properties

/**
  * Factory for [[JDBCUtil]]. The class constructor is private, so this is the
  * only way to create an instance.
  */
object JDBCUtil extends Serializable {
  /**
    * Creates a JDBC helper from the given settings.
    *
    * @param prop settings read via the keys "driver", "url", "user" and "password"
    * @return a helper bound to those settings
    */
  def apply(prop: Properties): JDBCUtil = new JDBCUtil(prop)
}

/**
  * Small JDBC helper: loads the driver class named in the settings and hands out
  * connections and copies of the settings.
  *
  * @param prop settings read via the keys "driver", "url", "user" and "password"
  */
class JDBCUtil private (prop: Properties) extends Serializable {
  // Load the driver class eagerly so a missing or misspelled driver fails at
  // construction time instead of on the first connection attempt.
  Class.forName(prop.getProperty("driver"))

  /** JDBC URL taken from the "url" property. */
  val url: String = prop.getProperty("url")
  private[this] val user: String = prop.getProperty("user")
  private[this] val password: String = prop.getProperty("password")

  /**
    * Opens a new connection on every call; the caller is responsible for closing it.
    *
    * @return a connection obtained from `DriverManager` for the configured URL and credentials
    */
  def getConnect(): java.sql.Connection =
    DriverManager.getConnection(url, user, password)

  /**
    * Returns an independent copy of the settings, so callers can add or change
    * entries without affecting this helper.
    *
    * @return a new `Properties` holding the same entries as the original settings
    */
  def getJDBCProp(): Properties = {
    val copied = new Properties()
    // Copy entry by entry instead of calling putAll: under Scala 2.12 on JDK 9+
    // `Properties.putAll` is an ambiguous overloaded reference and does not compile.
    val entries = prop.entrySet().iterator()
    while (entries.hasNext) {
      val entry = entries.next()
      copied.put(entry.getKey, entry.getValue)
    }
    copied
  }
}

// import org.apache.spark.SparkFiles
/**
  * Singleton access point to the MySQL database used by the examples.
  * It wraps a [[JDBCUtil]] built from the settings defined below.
  */
object DBUtil extends Serializable {
  // NOTE(review): the connection settings, including credentials, are hard-coded
  // here; consider loading them from an external properties file instead.
  private[this] val jdbc_conf = {
    val p = new Properties()
    p.put("driver", "com.mysql.jdbc.Driver")
    p.put("url", "jdbc:mysql://10.18.2.3:3306/dbname?" +
      "useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true")
    p.put("user", "username")
    p.put("password", "****")
    p
  }
  //private[this] val jdbc_file = SparkFiles.get("route_analysis.properties")
  private[this] val jdbcUtil = JDBCUtil(jdbc_conf)

  /** JDBC URL of the configured database. */
  val url: String = jdbcUtil.url

  /** Opens a new connection on every access; the caller must close it. */
  def conn: java.sql.Connection = jdbcUtil.getConnect()

  /** A fresh copy of the JDBC settings on every access. */
  def prop: Properties = jdbcUtil.getJDBCProp()

  /**
    * The JDBC settings as a Scala map of strings.
    *
    * @return a mutable map view over a fresh copy of the settings
    */
  def options: scala.collection.mutable.Map[String, String] = {
    // Explicit `.asScala` replaces the deprecated implicit
    // scala.collection.JavaConversions (removed in Scala 2.13).
    import scala.collection.JavaConverters._
    prop.asScala
  }
}

// 使用:
/**
  * Writes the "col" value of every row of `df` into table "tablename": a row is
  * inserted when no record with that value exists, otherwise the matching
  * record is updated.
  *
  * One connection and one set of prepared statements is used per partition, and
  * all of them are closed even when a statement fails.
  *
  * @param df data frame with a string column named "col"
  */
def update(df: DataFrame): Unit = {
  val table = "tablename"
  // The explicit parameter type keeps the call unambiguous between the Scala and
  // Java overloads of foreachPartition when compiling with Scala 2.12+.
  df.foreachPartition { part: Iterator[org.apache.spark.sql.Row] =>
    // Skip empty partitions so no connection is opened for nothing.
    if (part.hasNext) {
      val conn = DBUtil.conn
      try {
        // `table` is a local constant, so interpolating it is safe; row values
        // are bound as parameters instead of being spliced into the SQL text.
        // NOTE(review): the original referenced an undefined `id`; this assumes
        // the table has an `id` key column that identifies the found record — confirm.
        val selectStmt = conn.prepareStatement(s"select id, col from ${table} where col = ?")
        val insertStmt = conn.prepareStatement(s"insert into ${table} (col) values (?)")
        val updateStmt = conn.prepareStatement(s"update ${table} set col = ? where id = ?")
        try {
          part.foreach { r =>
            val col = r.getAs[String]("col")
            // Look up an existing record for this value.
            selectStmt.setString(1, col)
            val res = selectStmt.executeQuery()
            try {
              // next() works on forward-only result sets, unlike first().
              if (!res.next()) {
                // No record found: insert a new one.
                insertStmt.setString(1, col)
                insertStmt.executeUpdate()
              } else {
                // Record found: update it (placeholder for updating other fields).
                updateStmt.setString(1, col)
                updateStmt.setObject(2, res.getObject("id"))
                updateStmt.executeUpdate()
              }
            } finally {
              res.close()
            }
          }
        } finally {
          selectStmt.close()
          insertStmt.close()
          updateStmt.close()
        }
      } finally {
        conn.close()
      }
    }
  }
  println(s"update表(${table})完成...")
}

HBase 交互

在 resources 目錄下放置 hbase-site.xml 文件

 <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.3.1</version>
  </dependency>
  <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.3.1</version>
  </dependency>
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Reads the HBase table "student" into an RDD, prints the number of rows and
  * then prints the row key and the info:name / info:gender / info:age cells of
  * every row.
  */
object SparkOperateHbase {
  def main(args: Array[String]): Unit = {
    // Build the HBase configuration
    // (presumably picks up hbase-site.xml from the classpath — confirm).
    val conf = HBaseConfiguration.create()
    // Name of the table to scan.
    conf.set(TableInputFormat.INPUT_TABLE, "student")
    val sc = new SparkContext(new SparkConf())
    try {
      // Each RDD element is a (row key, Result) pair; the Result holds the cells.
      val stuRdd = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      // Cache because the RDD is traversed twice (count + foreach).
      stuRdd.cache()
      val count = stuRdd.count()
      println("Student rdd count:" + count)
      stuRdd.foreach { case (_, result) =>
        // Bytes.toBytes always encodes as UTF-8, whereas String.getBytes uses
        // the platform default charset and can differ between machines.
        val family = Bytes.toBytes("info")
        def cell(qualifier: String): String =
          Bytes.toString(result.getValue(family, Bytes.toBytes(qualifier)))

        val key = Bytes.toString(result.getRow)
        val name = cell("name")
        val gender = cell("gender")
        val age = cell("age")
        println(s"Row key:$key Name:$name Gender:$gender Age:$age")
      }
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}

插入數據

import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Writes two sample records into the HBase table "student" (column family
  * "info", columns name / gender / age) through TableOutputFormat.
  */
object HbasePut {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HbasePut").setMaster("local")
    val sc = new SparkContext(sparkConf)
    try {
      // Target table for TableOutputFormat.
      val tableName = "student"
      sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tableName)
      // Job.getInstance replaces the deprecated `new Job(conf)` constructor.
      val job = Job.getInstance(sc.hadoopConfiguration)
      job.setOutputKeyClass(classOf[ImmutableBytesWritable])
      // The RDD below emits Put values, so declare Put (not Result) as the value class.
      job.setOutputValueClass(classOf[Put])
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
      // Sample input in the form "rowKey,name,gender,age"; makeRDD is a variant of parallelize.
      val indataRdd = sc.makeRDD(Array("3,zhang,M,26", "4,yue,M,27"))
      val rdd = indataRdd.map(_.split(",")).map { arr =>
        val family = Bytes.toBytes("info")
        // The first field is the row key.
        val put = new Put(Bytes.toBytes(arr(0)))
        // addColumn replaces the deprecated Put.add, which was removed in HBase 2.
        put.addColumn(family, Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
        put.addColumn(family, Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))
        put.addColumn(family, Bytes.toBytes("age"), Bytes.toBytes(arr(3)))
        // A (key, value) pair is required; the Put carries the data to write.
        (new ImmutableBytesWritable, put)
      }
      rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
    } finally {
      // Always release the SparkContext, even if the write fails.
      sc.stop()
    }
  }
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章