fastspark | Implementing per-province ad Top N statistics with SparkCore and SparkSQL

Contents

This post walks through finding the three most-clicked ad ids in each province, once with SparkCore and once with SparkSQL. The test data is as follows:

province id	ad id
1	100
1	100
1	100
1	112
1	101
1	112
1	102
1	102
1	103
1	112
1	112
1	101
1	112
2	100
2	121
2	101
2	121
2	104
2	121
2	111
2	104
2	103
2	111
2	121
2	104
3	121
3	112
3	112
3	121
3	100
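
For reference, tallying the sample by hand gives the expected top 3 per province (tied ads may appear in either order, since rows are compared only by click count):

province 1: ad 112 (5 clicks), ad 100 (3), then ad 101 or ad 102 (2 each, tied)
province 2: ad 121 (4 clicks), ad 104 (3), ad 111 (2)
province 3: ad 121 and ad 112 (2 clicks each, tied), then ad 100 (1)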

SparkCore


import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

/**
  * Program: fastspark
  * Package:  
  * Description: Created by felahong on 2020/4/15 12:03
  * TODO Compute the top 3 most-clicked ads for each province
  */

case class AdClick(province: Int, ad: Int)

object ProvinceAdTopThree {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("province-ad-count").setMaster("local[*]")
    val sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("WARN")

    // Read the input file
    val logRdd = sc.textFile("hdfs://felahong:9000/test/pro_ad_tmp.txt", 2)
//    logRdd.foreach(println)

    // Wrap each line into an RDD of AdClick records
    val adClickRdd: RDD[AdClick] = logRdd.map(line => {
      val arr: Array[String] = line.split("\t").filter(_.length > 0)
//      println(arr.length)
      AdClick(arr(0).toInt, arr(1).toInt)
    })

    val proAd2CountRdd: RDD[(String, Int)] = adClickRdd.map(adClick => (adClick.province+"_"+adClick.ad, 1)).reduceByKey(_+_) // (pro_ad, sum)

    val pro2AdsRdd = proAd2CountRdd.map(line => {
      val arr = line._1.split("_")
      (arr(0), (arr(1), line._2))
    }).groupByKey() // (proid, ((adid, sum), (adid, sum)))
    pro2AdsRdd.foreach(println)

    // flatMap must return a sequence-like (TraversableOnce) type
    val res: RDD[String] = pro2AdsRdd.flatMap({case(pro, items) =>
      val topThree = items.toList
        .sortWith(_._2 > _._2)
        .take(3)
      for(topn <- topThree) yield {
        pro + " " + topn._1 + " " + topn._2
      }
    })
//    Alternative: build the same result with a mutable ArrayBuffer
//    val res = pro2AdsRdd.flatMap{ case(pro, items)=>
//      val filterItems = items.toList.sortWith(_._2 > _._2).take(3).toArray
//      val result = new ArrayBuffer[String]()
//      for(item <- filterItems){
//        result += (pro + " " + item._1 + " " + item._2)
//      }
//      result
//    }

    println
    res.foreach(println)

    sc.stop()
  }

}
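
As a side note, groupByKey ships every (adid, count) pair of a province to a single task before the sort. A minimal alternative sketch (not part of the original post; it is meant to sit in the main method above and reuses its proAd2CountRdd) keeps at most three entries per province on both sides of the shuffle with aggregateByKey:

    // Alternative sketch: maintain a running top-3 list per province while aggregating,
    // instead of collecting all (adid, count) pairs with groupByKey
    val topThreeRdd: RDD[(String, List[(String, Int)])] = proAd2CountRdd
      .map { case (proAd, sum) =>
        val arr = proAd.split("_")      // split "province_ad" back into its parts
        (arr(0), (arr(1), sum))
      }
      .aggregateByKey(List.empty[(String, Int)])(
        (acc, item) => (item :: acc).sortWith(_._2 > _._2).take(3), // fold one value into a partition-local top-3
        (a, b) => (a ++ b).sortWith(_._2 > _._2).take(3)            // merge two partition-local top-3 lists
      )

    topThreeRdd.foreach(println)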

SparkSQL

import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Program: fastspark
  * Package:  
  * Description: Created by felahong on 2020/4/15 16:49
  * TODO Compute the top 3 most-clicked ads for each province with SparkSQL
  */

object ProvinceAdTopThreeSQL extends App {

  private val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("province-ad-count-sql")
  val ss = SparkSession.builder().config(conf).getOrCreate()
  val sc = ss.sparkContext

  sc.setLogLevel("WARN")

  import ss.implicits._

  val inpath = "hdfs://felahong:9000/test/pro_ad_tmp.txt"
  // Parse the raw lines into a DataFrame of (province id, ad id) and register it as a view
  val logDf = sc.textFile(inpath).map { line =>
    val arr = line.split("\t")
    (arr(0), arr(1))
  }.toDF("proid", "adid")
    .cache()

  logDf.createOrReplaceTempView("ad_log")

  val hql =
    """select proid, adid, clickCount from (
      |  select proid, adid, clickCount, row_number() over(partition by proid order by clickCount desc) as rnk from (
      |    select proid, adid, count(*) as clickCount from ad_log group by proid, adid
      |  ) t1
      |) t
      |where rnk <= 3
    """.stripMargin

  val res = ss.sql(hql).cache()
  res.show()

  ss.stop()    // stopping the session also stops the underlying SparkContext

}
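
For comparison, here is a minimal sketch of the same top-3 query written with the DataFrame API and a window function instead of a SQL string. It would sit inside ProvinceAdTopThreeSQL before ss.stop(), reusing the logDf DataFrame built above; Window and the column functions come from Spark SQL.

  import org.apache.spark.sql.expressions.Window
  import org.apache.spark.sql.functions.{col, count, row_number}

  // Rank ads inside each province by click count, then keep ranks 1..3
  val w = Window.partitionBy("proid").orderBy(col("clickCount").desc)

  val topThreeDf = logDf
    .groupBy("proid", "adid")
    .agg(count("*").as("clickCount"))
    .withColumn("rnk", row_number().over(w))
    .where(col("rnk") <= 3)
    .select("proid", "adid", "clickCount")

  topThreeDf.show()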
