RDD編程模型
RDD運行規劃圖
注意:當以Cluster模式(--deploy-mode cluster)提交Spark程序時,Driver運行在集羣中,而本地的spark-submit客戶端(Client)進程默認會一直等待應用結束;需要把這個本地Client進程殺掉,否則會佔用本地內存,可能導致本地宕機
(可在spark-submit中加上 --conf spark.yarn.submit.waitAppCompletion=false,讓客戶端在提交後立即退出)
operator
aggregateByKey(代碼如下,key只作爲標識,提供兩個變量供設計計算算法)
public class OptTest {
public static void main(String[] args) {
SparkConf sc = new SparkConf().setAppName("").setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(sc);
jsc.setLogLevel("WARN");
JavaRDD<Tuple2<Integer,Integer>> tupleRdd = jsc.parallelize(Arrays.asList(
/**
* 1;9;3
* 2;3;1
* 3;14;2
*/
new Tuple2<Integer,Integer>(1,3),
new Tuple2<Integer,Integer>(1,2),
new Tuple2<Integer,Integer>(1,4),
new Tuple2<Integer,Integer>(2,3),
new Tuple2<Integer,Integer>(3,6),
new Tuple2<Integer,Integer>(3,8)
));
JavaPairRDD<Integer,Integer> pairRdd = tupleRdd.mapToPair(tuple->tuple);
JavaPairRDD<Integer,Tuple2<Integer,Integer>> result =
pairRdd.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple, Integer val2) throws Exception {
return new Tuple2<>(tuple._1() + val2, tuple._2() + 1);
}
}, new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
return new Tuple2<>(tuple1._1()+tuple2._1(),tuple1._2()+tuple2._2());
}
}
);
result.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Integer, Integer>> res) throws Exception {
System.out.println("res1:"+res._1+";res21:"+res._2._1+";res22:"+res._2._2);
}
});
//--------------------
JavaRDD<Tuple2<String,Integer>> msgRDD = jsc.parallelize(Arrays.asList(
new Tuple2<String,Integer>("msg1",3),
new Tuple2<String,Integer>("msg1",2),
new Tuple2<String,Integer>("msg1",4),
new Tuple2<String,Integer>("msg2",3),
new Tuple2<String,Integer>("msg3",6),
new Tuple2<String,Integer>("msg3",9),
new Tuple2<String,Integer>("msg3",8)
...
));
JavaPairRDD<String,Integer> msgPair = msgRDD.mapToPair(tuple->tuple);
/**msgPair.aggregateByKey()打印結果
*
* return new Tuple2<>(tuple1._1+tuple2._1,tuple1._2+tuple2._2)的結果
* msg1;9;3
* msg2;3;1
* msg3;168;24
*
* 分兩個分區後 分別return tuple1 和 tuple2 的結果
* msg1;9;3
* msg2;3;1
* msg3;98;14
*
* msg1;9;3
* msg2;3;1
* msg3;70;10
*
* 由結果可知,第一個Function2爲分區內計算 第二個爲分區間的計算(第一個Function2的結果作爲第二個Function2的參數宏觀調用)
*/
//sum and count
JavaPairRDD<String,Tuple2<Integer,Integer>> msgRes =
msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),new Integer(4),
new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> val1, Integer val2) throws Exception {
//分區內 param1 sum(val) param2 設計count遞增
return new Tuple2<>(val1._1 + val2, val1._2+1);//val1._1 + val2爲sum, val2, val1._2+1 每條記錄出現+1
}
},
new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
//return new Tuple2<>(tuple1._1+tuple2._1,tuple1._2+tuple2._2);
return tuple2;//
}
}
);
msgRes.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<String, Tuple2<Integer, Integer>> msgTuple2) throws Exception {
System.out.println(msgTuple2._1+";"+msgTuple2._2._1+";"+msgTuple2._2._2);
}
});
//sum and max
JavaPairRDD<String,Tuple2<Integer,Integer>> msgMaxRes =
msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> val1, Integer val2) throws Exception {
return new Tuple2<Integer, Integer>(val1._1>val2?val1._1:val2,val1._2+1);
}
},
new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tuple1, Tuple2<Integer, Integer> tuple2) throws Exception {
return new Tuple2<Integer, Integer>(tuple1._1>tuple2._1?tuple1._1:tuple2._1,tuple1._2+tuple2._2);
}
}
);
msgMaxRes.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<String, Tuple2<Integer, Integer>> msgTuple2) throws Exception {
System.out.println(msgTuple2._1+";"+msgTuple2._2._1+";"+msgTuple2._2._2);
}
});
}
}
// Lambda version of the per-key (sum, count) aggregation.
// The zero value is an Integer pair (0, 0). The first lambda folds one value into an
// accumulator; the second merges two accumulators component-wise.
msgPair.aggregateByKey(new Tuple2<Integer, Integer>(0, 0),
        (x, y) -> new Tuple2<Integer, Integer>(x._1() + y, x._2() + 1),
        (x, y) -> new Tuple2<Integer, Integer>(x._1() + y._1(), x._2() + y._2()));
combineByKey(aggregateByKey是combineByKey的簡化版)
/* ---------- combineByKey: per-key (sum, count) ---------- */
// Turns a single value into an initial (sum, count) accumulator.
Function<Integer, Tuple2<Integer, Integer>> createCombiner =
        first -> new Tuple2<Integer, Integer>(first, 1);
// Folds one more value into an existing accumulator.
Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>> mergeValue =
        (acc, next) -> new Tuple2<Integer, Integer>(acc._1() + next, acc._2() + 1);
// Adds two accumulators component-wise.
Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> mergeCombiners =
        (left, right) -> new Tuple2<Integer, Integer>(left._1() + right._1(), left._2() + right._2());

JavaPairRDD<String, Tuple2<Integer, Integer>> pairRDD =
        msgPair.combineByKey(createCombiner, mergeValue, mergeCombiners);

// Print each key with its sum and count.
pairRDD.foreach(entry ->
        System.out.println("combineByKey===" + entry._1() + ";" + entry._2()._1() + ";" + entry._2()._2()));
foldByKey (上面兩種的精簡版)
// Per-key sum: 0 is the neutral starting value and Integer::sum adds two values.
// The literal 0 is auto-boxed, replacing the deprecated Integer constructor.
JavaPairRDD<String, Integer> foldRDD =
        msgPair.foldByKey(0, Integer::sum);