1.sampleByKey
import org.apache.spark.{SparkConf, SparkContext}

/** Demonstrates stratified sampling of a pair RDD with `sampleByKey`. */
object testVector {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("testVector")
    val sc = new SparkContext(conf)
    try {
      // Key each line by its own text; value is 1 when the line is exactly
      // 3 characters long, otherwise 2.
      val data = sc.textFile("kimi.txt")
        .map(row => if (row.length == 3) (row, 1) else (row, 2))

      // Per-key sampling fraction. With withReplacement = false the fraction is a
      // Bernoulli keep-probability, so it should lie in [0, 1]; the original value
      // of 2 behaves no differently from 1.0 and is misleading, so use 1.0.
      // NOTE(review): sampleByKey expects a fraction for every key present in the
      // RDD — this assumes kimi.txt contains only the line "aa"; confirm.
      val fractions: Map[String, Double] = Map("aa" -> 1.0)

      // Approximate per-key sampling; a fixed seed (0) makes the run reproducible.
      val approxSample = data.sampleByKey(withReplacement = false, fractions, 0)
      approxSample.foreach(println) // Program output: (aa,2)
    } finally {
      // Always release the SparkContext, even if the job fails.
      sc.stop()
    }
  }
}
withReplacement:每次抽樣是否放回
fractions:定義分類條件和採樣機率。
seed:隨機數種子