1. 將運算數據存儲到數據庫中。
操作數據來源:https://blog.csdn.net/erchouchou/article/details/99766584
將上面的結果輸出到數據庫中:
代碼如下:
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.sql.{Connection, Date, DriverManager, PreparedStatement}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Spark job that maps access-log IP addresses to a location using a table of
 * numeric IP ranges, counts the hits per location and writes the counts to MySQL.
 */
object ipLocation {
  import scala.util.control.NonFatal

  /**
   * Writes one partition of `(location, count)` pairs into the `location_info` table.
   *
   * One connection and one prepared statement are opened per partition and reused
   * for every row; both are closed when the partition is finished, even on failure.
   * Failures are printed and not rethrown, so a failing partition does not fail the
   * Spark task (best-effort behaviour kept from the original implementation).
   */
  val data2MySQL: Iterator[(String, Int)] => Unit = (iterator: Iterator[(String, Int)]) => {
    val sql = "INSERT INTO location_info (location, counts, aaccesse_date) VALUES (?, ?, ?)"
    try {
      val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
      try {
        // Prepare once and rebind per row: preparing inside the loop leaked every
        // statement except the last one.
        val ps = conn.prepareStatement(sql)
        try {
          iterator.foreach { case (location, count) =>
            ps.setString(1, location)
            ps.setInt(2, count)
            ps.setDate(3, new Date(System.currentTimeMillis()))
            ps.executeUpdate()
          }
        } finally {
          ps.close()
        }
      } finally {
        conn.close()
      }
    } catch {
      case NonFatal(e) => println(e)
    }
  }

  /**
   * Converts a dotted IP string into a number by folding its fragments left to
   * right, shifting the accumulator 8 bits for each fragment.
   *
   * @param ip dotted address such as "1.2.3.4"
   * @return the numeric form, e.g. 16909060 for "1.2.3.4"
   * @throws NumberFormatException if a fragment is not a valid number
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L)((acc, fragment) => (acc << 8) | fragment.toLong)

  /**
   * Binary search for the rule whose inclusive `[start, end]` range contains `ip`.
   *
   * NOTE(review): this only works if `lines` is sorted ascending by range start and
   * the ranges do not overlap — nothing here checks that; confirm against the rule file.
   *
   * @param lines rules as `(start, end, province)`, with start/end as decimal strings
   * @param ip    numeric IP to look up
   * @return index of the matching rule, or -1 when no rule contains `ip`
   */
  def binarySearch(lines: Array[(String, String, String)], ip: Long): Int = {
    @scala.annotation.tailrec
    def search(low: Int, high: Int): Int =
      if (low > high) -1
      else {
        // low + (high - low) / 2 cannot overflow, unlike (low + high) / 2.
        val middle = low + (high - low) / 2
        val (start, end, _) = lines(middle)
        if (ip < start.toLong) search(low, middle - 1)
        else if (ip <= end.toLong) middle
        else search(middle + 1, high)
      }

    search(0, lines.length - 1)
  }

  /**
   * Looks up the province for a dotted IP string.
   *
   * @param rules rules as accepted by [[binarySearch]]
   * @param ip    dotted address such as "1.2.3.4"
   * @return the province of the matching rule, or `None` when no rule matches
   */
  def findProvince(rules: Array[(String, String, String)], ip: String): Option[String] = {
    val index = binarySearch(rules, ip2Long(ip))
    if (index >= 0) Some(rules(index)._3) else None
  }

  /**
   * Entry point: loads the IP rules, broadcasts them, resolves each logged IP to a
   * province, counts hits per province and stores the counts through [[data2MySQL]].
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("IpLocation")
    val sc = new SparkContext(conf)
    try {
      // All IP mapping rules: (range start, range end, province).
      val ipRulesArray = sc.textFile("D:/logs/ip.txt").map { line =>
        val fields = line.split("\\|")
        (fields(2), fields(3), fields(6))
      }.collect()

      // Broadcast the rules so every task reads the same copy.
      val ipRulesBroadcast = sc.broadcast(ipRulesArray)

      // Load the data to process; the IP is the second '|'-separated field.
      val ipsRdd = sc.textFile("D:/logs/input").map(line => line.split("\\|")(1))

      val rdd = ipsRdd
        // IPs that match no rule are skipped; previously the -1 result was used as
        // an array index and crashed the job.
        .flatMap(ip => findProvince(ipRulesBroadcast.value, ip).iterator)
        .map(province => (province, 1))
        .reduceByKey(_ + _)

      rdd.foreachPartition(data2MySQL)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}
2. 將數據庫中的數據讀取出來。
代碼如下:
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demo that reads rows from the MySQL table `ta` into an RDD with `JdbcRDD`
 * and prints them.
 */
object Demo1 {
  /**
   * Entry point: runs the range query over ids 1 to 5 in two partitions and prints
   * the resulting `(id, code)` pairs.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo").setMaster("local[2]")

    // Connection factory handed to JdbcRDD. Loading the class is enough to register
    // the driver; the deprecated reflective newInstance() call is not needed.
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver")
      DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "root", "root")
    }

    val sc = new SparkContext(conf)
    try {
      // JdbcRDD constructor parameters, in order:
      //   sc, getConnection, sql, lowerBound, upperBound, numPartitions, mapRow
      // The SQL must contain exactly two '?' placeholders; each partition binds its
      // own lower and upper bound to them.
      val jdbcRdd = new JdbcRDD(
        sc,
        connection,
        "select * from ta where id >=? and id <= ?",
        lowerBound = 1,
        upperBound = 5,
        numPartitions = 2,
        mapRow = rs => {
          val id = rs.getInt(1)
          val code = rs.getString(2)
          (id, code)
        }
      )
      println(jdbcRdd.collect().toBuffer)
    } finally {
      // The original never stopped the context.
      sc.stop()
    }
  }
}
注意點:
如果把查詢的 SQL 改爲 "select * from ta where id >= ? and id < ?"(上界不包含),返回的數據量就不對了。
原因:
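JdbcRDD 按閉區間劃分分區:一共五條數據(lowerBound=1、upperBound=5、numPartitions=2)時,兩個分區綁定到 SQL 佔位符的邊界分別是 1、2 和 3、5。如果條件寫成 id >= ? and id < ?,實際查詢的範圍就變成 [1,2) 和 [3,5),每個分區上界處的記錄(id 爲 2 和 5)都沒有被查詢,所以出現了丟失數據的情況。在分區查詢時,要考慮區間的開閉。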
解決方案:
爲了避免丟失數據,查詢 SQL 的區間兩端都要包含,即 id >= ? and id <= ?(例如 id >= 1 and id <= 5)。