Spark operations on a MySQL database

1. Store the computed results in the database.

Source of the data being processed: https://blog.csdn.net/erchouchou/article/details/99766584

Write the results from the article above into the database:

The code is as follows:

import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.sql.{Connection, Date, DriverManager, PreparedStatement}

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object ipLocation {

    // Writes one partition's (province, count) pairs into MySQL.
    // The PreparedStatement is created once and reused for every row.
    val data2MySQL = (iterator: Iterator[(String, Int)]) => {
      var conn: Connection = null
      var ps: PreparedStatement = null
      val sql = "INSERT INTO location_info (location, counts, aaccesse_date) VALUES (?, ?, ?)"
      try {
        conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
        ps = conn.prepareStatement(sql)
        iterator.foreach(line => {
          ps.setString(1, line._1)
          ps.setInt(2, line._2)
          ps.setDate(3, new Date(System.currentTimeMillis()))
          ps.executeUpdate()
        })
      } catch {
        case e: Exception => println(e)
      } finally {
        if (ps != null)
          ps.close()
        if (conn != null)
          conn.close()
      }
    }

  // Converts a dotted-quad IPv4 address string to its numeric Long value.
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length){
      ipNum =  fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }

  // Binary search over the sorted IP rules; returns the index of the rule whose
  // [start, end] range contains ip, or -1 if no rule matches.
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1.toLong) && (ip <= lines(middle)._2.toLong))
        return middle
      if (ip < lines(middle)._1.toLong)
        high = middle - 1
      else {
        low = middle + 1
      }
    }
    -1
  }

  def main(args: Array[String]){
    val conf = new SparkConf().setMaster("local").setAppName("IpLocation")
    val sc = new SparkContext(conf)
    // load the IP rule file and keep (start_num, end_num, province) for each rule
    val ipRulesRdd = sc.textFile("D:/logs/ip.txt").map(line => {
      val fields = line.split("\\|")
      val start_num = fields(2)
      val end_num = fields(3)
      val province = fields(6)
      (start_num, end_num, province)
    })
    // collect all of the IP mapping rules to the driver
    val ipRulesArray = ipRulesRdd.collect()
    // broadcast the rules to every executor
    val ipRulesBroadcast = sc.broadcast(ipRulesArray)
    // load the access log data to be processed and keep only the IP field
    val ipsRdd = sc.textFile("D:/logs/input").map(line=>{
      val fields = line.split("\\|")
      fields(1)
    })
    // look up the province for every IP, then count hits per province
    val rdd = ipsRdd.map(ip => {
      val ipNum = ip2Long(ip)
      val index = binarySearch(ipRulesBroadcast.value, ipNum)
      // assumes every IP matches a rule; an index of -1 would throw here
      val info = ipRulesBroadcast.value(index)
      info
    }).map(x => (x._3, 1)).reduceByKey(_ + _)
    // write each partition's (province, count) results to MySQL
    rdd.foreachPartition(data2MySQL)

    //println(rdd.collect().toBuffer)
    sc.stop()
  }
}
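
The INSERT above assumes that a location_info table already exists in mydb. Below is a minimal one-time setup sketch; the column names are copied from the INSERT statement (including the spelling of aaccesse_date), while the id column and all of the types are assumptions, so adjust them to your real schema.

import java.sql.{Connection, DriverManager}

object CreateLocationTable {
  def main(args: Array[String]): Unit = {
    var conn: Connection = null
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
      val stmt = conn.createStatement()
      // column names follow the INSERT in ipLocation; the types and the id column are assumptions
      stmt.executeUpdate(
        "CREATE TABLE IF NOT EXISTS location_info (" +
          "id INT PRIMARY KEY AUTO_INCREMENT, " +
          "location VARCHAR(100), " +
          "counts INT, " +
          "aaccesse_date DATE)")
      stmt.close()
    } finally {
      if (conn != null) conn.close()
    }
  }
}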

2. Read the data back out of the database

The code is as follows:

import java.sql.DriverManager

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo").setMaster("local[2]")
    // factory that opens a new JDBC connection; JdbcRDD calls it once per partition
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
    }
    val sc = new SparkContext(conf)
    /*
     sc: SparkContext,
    getConnection: () => Connection,
    sql: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int,
    mapRow: (ResultSet) => T = JdbcRDD.resultSetToObjectArray _)*/
    val jdbcRdd = new JdbcRDD(
      sc,
      connection,
      "select * from ta where id >= ? and id <= ?", // the two ? receive each partition's inclusive bounds
      1,   // lowerBound
      5,   // upperBound
      2,   // numPartitions
      rs => {
        // map each ResultSet row to an (id, code) tuple
        val id = rs.getInt(1)
        val code = rs.getString(2)
        (id, code)
      }
    )
    println(jdbcRdd.collect().toBuffer)
    sc.stop()
  }
}
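
The query assumes a table ta in mydb whose first column is an integer id (the bounds 1 and 5 suggest ids 1 through 5 exist) and whose second column is a string. A minimal setup sketch, with assumed column types and sample rows:

import java.sql.{Connection, DriverManager}

object CreateTaTable {
  def main(args: Array[String]): Unit = {
    var conn: Connection = null
    try {
      conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
      val stmt = conn.createStatement()
      // table and column layout inferred from Demo1's query and mapRow function; types are assumptions
      stmt.executeUpdate("CREATE TABLE IF NOT EXISTS ta (id INT PRIMARY KEY, code VARCHAR(20))")
      stmt.close()
      val ps = conn.prepareStatement("INSERT INTO ta (id, code) VALUES (?, ?)")
      for (i <- 1 to 5) { // sample rows with ids 1..5 so the bounds used in Demo1 make sense
        ps.setInt(1, i)
        ps.setString(2, "code" + i)
        ps.executeUpdate()
      }
      ps.close()
    } finally {
      if (conn != null) conn.close()
    }
  }
}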

Notes:

If the query SQL is changed to "select * from ta where id >= ? and id < ?", the number of rows returned is wrong.

Reason:

The two ? placeholders receive each partition's bounds, and JdbcRDD treats lowerBound and upperBound as inclusive. With lowerBound = 1, upperBound = 5 and 2 partitions, the five ids are split into the inclusive ranges [1, 2] and [3, 5]; if the SQL uses id < ?, the row at the upper end of each partition (here ids 2 and 5) is never queried, so data is lost. When writing the SQL, the open/closed ends of the interval must match how the partitions are computed.

Solution:

To avoid losing data, make both ends of each partition's interval inclusive when reading, i.e. id >= ? and id <= ?; with bounds 1 and 5 this covers every id from 1 to 5.
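
To make the partition arithmetic concrete, the following standalone sketch mirrors the way JdbcRDD splits the inclusive range [lowerBound, upperBound] across partitions and shows which ids each SQL variant would return. It is only an illustration of the reasoning above; the bound formula is a simplified restatement, not code taken from Spark.

object PartitionBoundsSketch {
  def main(args: Array[String]): Unit = {
    val lowerBound = 1L
    val upperBound = 5L
    val numPartitions = 2
    val ids = 1L to 5L                           // the five rows assumed to be in table ta

    // inclusive [start, end] per partition, mirroring how JdbcRDD splits [lowerBound, upperBound]
    val bounds = (0 until numPartitions).map { i =>
      val length = upperBound - lowerBound + 1
      val start = lowerBound + i * length / numPartitions
      val end = lowerBound + (i + 1) * length / numPartitions - 1
      (start, end)
    }
    println(bounds)                              // Vector((1,2), (3,5))

    // "id >= ? and id <  ?" drops the end row of every partition
    val halfOpen = bounds.flatMap { case (s, e) => ids.filter(id => id >= s && id < e) }
    // "id >= ? and id <= ?" keeps every row
    val inclusive = bounds.flatMap { case (s, e) => ids.filter(id => id >= s && id <= e) }
    println(halfOpen)                            // Vector(1, 3, 4)  -> ids 2 and 5 are lost
    println(inclusive)                           // Vector(1, 2, 3, 4, 5)
  }
}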

 
