內容
本文講述如何使用 SparkCore 和 SparkSQL 實現統計每個省份點擊量最多的前三個廣告 id。測試數據如下(各列以製表符 \t 分隔):
省份id 廣告id
1 100
1 100
1 100
1 112
1 101
1 112
1 102
1 102
1 103
1 112
1 112
1 101
1 112
2 100
2 121
2 101
2 121
2 104
2 121
2 111
2 104
2 103
2 111
2 121
2 104
3 121
3 112
3 112
3 121
3 100
SparkCore
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Program: fastspark
 * Description: Created by felahong on 2020/4/15 12:03
 * Computes the top-3 most-clicked ads for each province (SparkCore version).
 */
/** One ad-click record: the province it came from and the ad that was clicked. */
final case class AdClick(province: Int, ad: Int)
/**
 * Prints the top-3 most-clicked ads per province.
 *
 * Input: a tab-separated text file with lines "provinceId<TAB>adId".
 * Output: one "province ad clickCount" line per result, three per province,
 * ordered by click count descending within each province.
 */
object ProvinceAdTopThree {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("province-ad-count").setMaster("local[*]")
    val sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("warn")

    // Read the raw click log (one "province<TAB>ad" pair per line).
    val logRdd = sc.textFile("hdfs://felahong:9000/test/pro_ad_tmp.txt", 2)

    // Parse each line into an AdClick; drop empty tokens produced by
    // repeated separators so malformed spacing does not break toInt.
    val adClickRdd: RDD[AdClick] = logRdd.map { line =>
      val arr = line.split("\t").filter(_.nonEmpty)
      AdClick(arr(0).toInt, arr(1).toInt)
    }

    // Count clicks per (province, ad). A tuple key replaces the original
    // "province_ad" string concat + re-split round trip, which was fragile
    // and allocated throwaway strings for every record.
    val proAd2CountRdd: RDD[((Int, Int), Int)] =
      adClickRdd.map(click => ((click.province, click.ad), 1)).reduceByKey(_ + _)

    // Regroup by province: (province, Iterable[(ad, clickCount)]).
    val pro2AdsRdd = proAd2CountRdd
      .map { case ((province, ad), count) => (province, (ad, count)) }
      .groupByKey()
    pro2AdsRdd.foreach(println)

    // For each province keep the three ads with the highest counts.
    // flatMap requires a sequence-like return value, hence the List.
    val res: RDD[String] = pro2AdsRdd.flatMap { case (province, items) =>
      items.toList
        .sortBy(-_._2)
        .take(3)
        .map { case (ad, count) => s"$province $ad $count" }
    }

    res.foreach(println)
    sc.stop()
  }
}
SparkSQL
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Program: fastspark
 * Description: Created by felahong on 2020/4/15 16:49
 * SparkSQL version: top-3 most-clicked ads per province, computed with a
 * row_number() window partitioned by province.
 *
 * NOTE: uses an explicit main method instead of `extends App` — the App
 * trait's delayed initialization is a known pitfall for non-trivial drivers.
 */
object ProvinceAdTopThreeSQL {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("province-ad-count-sql")
    val ss = SparkSession.builder().config(conf).getOrCreate()
    val sc = ss.sparkContext
    sc.setLogLevel("WARN")
    import ss.implicits._

    val inpath = "hdfs://felahong:9000/test/pro_ad_tmp.txt"
    // Parse the tab-separated log into a two-column DataFrame and cache it,
    // since the query below scans it in more than one stage.
    val logDf = sc.textFile(inpath).map { line =>
      val arr = line.split("\t")
      (arr(0), arr(1))
    }.toDF("proid", "adid")
      .cache()
    logDf.createOrReplaceTempView("ad_log")

    // Innermost query: click count per (province, ad).
    // Middle query: rank ads within each province by count, descending.
    // Outer query: keep only the top three ranks per province.
    val hql =
      """select proid, adid, clickCount from(
        | select proid, adid, clickCount, row_number() over(partition by proid order by clickCount desc) as rnk from (
        | select proid, adid, count(*) clickCount from ad_log group by proid, adid
        | ) t1
        |)t
        |where rnk<=3
      """.stripMargin

    // No .cache() here: the result is consumed exactly once by show().
    ss.sql(hql).show()
    // Stopping the session also stops the underlying SparkContext,
    // so a separate sc.stop() is redundant.
    ss.stop()
  }
}