一:重點方法
zero
addInPlace
addAccumulator
作用:作爲Spark自己封裝的計數器(累加器,Accumulator),可以免去我們自行處理分佈式環境中出現的數據一致性等問題,比較方便。
二:代碼
重寫類:
package com.TomYang.sparkProject.spark.session;
import org.apache.spark.AccumulatorParam;
import com.TomYang.sparkProject.constant.Constants;
import com.TomYang.sparkProject.util.StringUtils;
/**
 * Accumulator parameter that keeps every session statistic in one
 * concatenated string of the form {@code key1=count1|key2=count2|...}.
 *
 * <p>Adding a key (one of the {@code Constants} names listed in
 * {@link #zero(String)}) increments the counter stored under that key.
 * Merging two accumulated strings sums their counters field by field.
 */
public class SessionAggrStatAccumulator implements AccumulatorParam<String> {

    private static final long serialVersionUID = 6311074555136039130L;

    /**
     * Regular expression matching the {@code |} separator between fields.
     * The same delimiter must be used for reading and for writing a field.
     */
    private static final String FIELD_DELIMITER = "\\|";

    /**
     * Returns the "zero" (identity) value of the accumulator: every known
     * counter present and set to 0.
     *
     * @param v the initial value supplied when the accumulator was created; ignored
     * @return the concatenated string with all counters at 0
     */
    @Override
    public String zero(String v) {
        return Constants.SESSION_COUNT + "=0|"
                + Constants.TIME_PERIOD_1s_3s + "=0|"
                + Constants.TIME_PERIOD_4s_6s + "=0|"
                + Constants.TIME_PERIOD_7s_9s + "=0|"
                + Constants.TIME_PERIOD_10s_30s + "=0|"
                + Constants.TIME_PERIOD_30s_60s + "=0|"
                + Constants.TIME_PERIOD_1m_3m + "=0|"
                + Constants.TIME_PERIOD_3m_10m + "=0|"
                + Constants.TIME_PERIOD_10m_30m + "=0|"
                + Constants.TIME_PERIOD_30m + "=0|"
                + Constants.STEP_PERIOD_1_3 + "=0|"
                + Constants.STEP_PERIOD_4_6 + "=0|"
                + Constants.STEP_PERIOD_7_9 + "=0|"
                + Constants.STEP_PERIOD_10_30 + "=0|"
                + Constants.STEP_PERIOD_30_60 + "=0|"
                + Constants.STEP_PERIOD_60 + "=0";
    }

    /**
     * Merges two accumulator values.
     *
     * <p>When {@code v2} is itself an accumulated string (it contains
     * {@code =}), each of its counters is added to the counter of the same
     * name in {@code v1}; previously such a value matched no field and was
     * silently discarded. When {@code v2} is a bare key it is handled exactly
     * like {@link #addAccumulator(String, String)}.
     *
     * @param v1 the current accumulated string
     * @param v2 another accumulated string, or a single counter key
     * @return the merged accumulated string
     * @throws NumberFormatException if a counter in either string is not an integer
     */
    @Override
    public String addInPlace(String v1, String v2) {
        if (StringUtils.isEmpty(v1)) {
            return v2;
        }
        if (v2 == null || v2.indexOf('=') < 0) {
            // A bare key (or nothing): fall back to the single-increment logic.
            return add(v1, v2);
        }
        return mergeCounts(v1, v2);
    }

    /**
     * Adds one occurrence of the counter named {@code v2} to {@code v1}.
     *
     * @param v1 the current accumulated string
     * @param v2 the key of the counter to increment
     * @return the updated accumulated string
     */
    @Override
    public String addAccumulator(String v1, String v2) {
        return add(v1, v2);
    }

    /**
     * Core counting logic: finds the value stored under {@code v2} inside
     * {@code v1}, increments it by 1 and writes it back into the string.
     *
     * @param v1 the accumulated string; if empty, {@code v2} is returned as is
     * @param v2 the key of the counter to increment
     * @return the updated string, or {@code v1} unchanged when it has no
     *         field named {@code v2}
     */
    private String add(String v1, String v2) {
        if (StringUtils.isEmpty(v1)) {
            return v2;
        }
        String oldValue = StringUtils.getFieldFromConcatString(v1, FIELD_DELIMITER, v2);
        if (oldValue == null) {
            return v1;
        }
        int newValue = Integer.parseInt(oldValue) + 1;
        // Write back with the same delimiter that was used for the lookup;
        // the original passed "\\!" here, which does not match the "|"
        // separators produced by zero().
        return StringUtils.setFieldInConcatString(v1, FIELD_DELIMITER, v2, String.valueOf(newValue));
    }

    /**
     * Adds every {@code name=count} field of {@code increments} to the field
     * of the same name in {@code base}. Fields that {@code base} does not
     * contain, and fragments without a name, are skipped.
     */
    private String mergeCounts(String base, String increments) {
        String merged = base;
        for (String field : increments.split(FIELD_DELIMITER)) {
            int separator = field.indexOf('=');
            if (separator <= 0) {
                continue;
            }
            String name = field.substring(0, separator);
            String current = StringUtils.getFieldFromConcatString(merged, FIELD_DELIMITER, name);
            if (current == null) {
                continue;
            }
            int sum = Integer.parseInt(current) + Integer.parseInt(field.substring(separator + 1));
            merged = StringUtils.setFieldInConcatString(merged, FIELD_DELIMITER, name, String.valueOf(sum));
        }
        return merged;
    }
}
使用類:
package com.TomYang.sparkProject.spark.session;
import groovy.ui.SystemOutputInterceptor;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import com.TomYang.sparkProject.conf.ConfigurationManager;
import com.TomYang.sparkProject.constant.Constants;
import com.TomYang.sparkProject.dao.ITaskDAO;
import com.TomYang.sparkProject.dao.factory.DAOFactory;
import com.TomYang.sparkProject.domain.Task;
import com.TomYang.sparkProject.test.MockData;
import com.TomYang.sparkProject.util.DateUtils;
import com.TomYang.sparkProject.util.ParamUtils;
import com.TomYang.sparkProject.util.StringUtils;
import com.TomYang.sparkProject.util.ValidUtils;
import com.alibaba.fastjson.JSONObject;
/**
 * Driver program showing how {@code SessionAggrStatAccumulator} is used:
 * it creates the accumulator, passes it to {@link #filterSession}, and
 * triggers the job by counting the filtered RDD.
 */
@SuppressWarnings("unused")
public class UserVisitSessionAnalyzeSpark {

    /**
     * Entry point: builds a local Spark context, registers the session
     * statistics accumulator, runs the session filter and prints the number
     * of sessions that passed it.
     *
     * @param args command-line arguments (currently overwritten, see below)
     */
    public static void main(String[] args) {
        // NOTE(review): the incoming arguments are unconditionally replaced by
        // a hard-coded value; this looks like a local-debugging leftover.
        args = new String[]{"2"};

        SparkConf conf = new SparkConf()
                .setAppName(Constants.SPARK_APP_NAME_SESSION)
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = getSQLContext(sc.sc());

            // Accumulator collecting the visit-length / step-length statistics.
            Accumulator<String> sessionAggrStatAccumulator =
                    sc.accumulator("", new SessionAggrStatAccumulator());

            // Filter the sessions; the accumulator is updated inside the filter.
            // NOTE(review): sessionId2AggrInfoRDD and taskParam are not declared
            // anywhere in this excerpt - the code that builds them appears to
            // have been omitted and must be restored for this to compile.
            JavaPairRDD<String, String> filteredSessionid2AggrInfoRDD =
                    filterSession(sessionId2AggrInfoRDD, taskParam, sessionAggrStatAccumulator);

            // count() is the action that actually executes the filter.
            System.out.println("filteredSessionid2AggrInfoRDD:" + filteredSessionid2AggrInfoRDD.count());
        } finally {
            // Release the context even when the job fails.
            sc.close();
        }
    }

    /**
     * Creates the SQL context for the job.
     *
     * @param sc the underlying Spark context
     * @return a plain {@link SQLContext} when the {@code Constants.SPARK_LOCAL}
     *         configuration flag is true, otherwise a {@link HiveContext}
     */
    private static SQLContext getSQLContext(SparkContext sc) {
        boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
        if (local) {
            return new SQLContext(sc);
        } else {
            return new HiveContext(sc);
        }
    }

    /**
     * Runs every session through a filter function that, for each session,
     * increments the total session counter and the counters of the
     * visit-length and step-length ranges the session falls into.
     * The filter itself currently keeps every session.
     *
     * @param sessionId2AggreInfoRDD     pairs whose value is a concatenated
     *                                   string read with the {@code |} delimiter
     * @param taskParam                  task parameters from which the filter
     *                                   conditions are read
     * @param sessionAggrStatAccumulator accumulator receiving the statistics
     * @return the filtered RDD
     */
    private static JavaPairRDD<String, String> filterSession(
            JavaPairRDD<String, String> sessionId2AggreInfoRDD,
            final JSONObject taskParam,
            final Accumulator<String> sessionAggrStatAccumulator) {
        String startAge = ParamUtils.getParam(taskParam, Constants.PARAM_START_AGE);
        String endAge = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE);
        // Professions of the user.
        String professionals = ParamUtils.getParam(taskParam, Constants.PARAM_PROFESSIONALS);
        String cities = ParamUtils.getParam(taskParam, Constants.PARAM_CITIES);
        String sex = ParamUtils.getParam(taskParam, Constants.PARAM_SEX);
        String keywords = ParamUtils.getParam(taskParam, Constants.PARAM_KEYWORDS);
        String categoryIds = ParamUtils.getParam(taskParam, Constants.PARAM_CATEGORY_IDS);

        // Concatenate the conditions that are present as "name=value|name=value".
        String _parameter = (startAge != null ? Constants.PARAM_START_AGE + "=" + startAge + "|" : "")
                + (endAge != null ? Constants.PARAM_END_AGE + "=" + endAge + "|" : "")
                + (professionals != null ? Constants.PARAM_PROFESSIONALS + "=" + professionals + "|" : "")
                + (cities != null ? Constants.PARAM_CITIES + "=" + cities + "|" : "")
                + (sex != null ? Constants.PARAM_SEX + "=" + sex + "|" : "")
                + (keywords != null ? Constants.PARAM_KEYWORDS + "=" + keywords + "|" : "")
                + (categoryIds != null ? Constants.PARAM_CATEGORY_IDS + "=" + categoryIds : "");

        // Drop a dangling separator. endsWith() compares literally (it is not
        // a regex), so the suffix must be "|" rather than the escaped "\\|".
        if (_parameter.endsWith("|")) {
            _parameter = _parameter.substring(0, _parameter.length() - 1);
        }

        // Final copy of the condition string; presumably intended to be
        // captured by the anonymous filter below, which requires a final
        // local. In this excerpt it is only printed.
        final String parameter = _parameter;
        System.out.println("拼接條件:" + parameter);

        JavaPairRDD<String, String> filteredSessionId2AggrInfoRDD = sessionId2AggreInfoRDD.filter(
                new Function<Tuple2<String, String>, Boolean>() {

                    private static final long serialVersionUID = 1L;

                    /**
                     * Records the statistics of one session and keeps it.
                     *
                     * @param tuple pair whose second element is the
                     *              aggregated-info string of the session
                     * @return always {@code true}
                     * @throws NumberFormatException if the visit-length or
                     *         step-length field is missing or not a number
                     */
                    @Override
                    public Boolean call(Tuple2<String, String> tuple) throws Exception {
                        String aggrInfo = tuple._2;

                        // Every session seen contributes to the total count.
                        sessionAggrStatAccumulator.add(Constants.SESSION_COUNT);

                        long visitLength = Long.parseLong(StringUtils.getFieldFromConcatString(
                                aggrInfo, "\\|", Constants.FIELD_VISIT_LENGTH));
                        long stepLength = Long.parseLong(StringUtils.getFieldFromConcatString(
                                aggrInfo, "\\|", Constants.FIELD_STEP_LENGTH));
                        calculateVisitLength(visitLength);
                        calculateStepLength(stepLength);
                        return true;
                    }

                    /**
                     * Increments the counter of the visit-length range that
                     * contains {@code visitLength}; values below 1 match no
                     * range and are not counted.
                     */
                    private void calculateVisitLength(long visitLength) {
                        if (visitLength >= 1 && visitLength <= 3) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1s_3s);
                        } else if (visitLength >= 4 && visitLength <= 6) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_4s_6s);
                        } else if (visitLength >= 7 && visitLength <= 9) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_7s_9s);
                        } else if (visitLength >= 10 && visitLength <= 30) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10s_30s);
                        } else if (visitLength > 30 && visitLength <= 60) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30s_60s);
                        } else if (visitLength > 60 && visitLength <= 180) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1m_3m);
                        } else if (visitLength > 180 && visitLength <= 600) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_3m_10m);
                        } else if (visitLength > 600 && visitLength <= 1800) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10m_30m);
                        } else if (visitLength > 1800) {
                            sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30m);
                        }
                    }

                    /**
                     * Increments the counter of the step-length range that
                     * contains {@code stepLength}; values below 1 match no
                     * range and are not counted.
                     */
                    private void calculateStepLength(long stepLength) {
                        if (stepLength >= 1 && stepLength <= 3) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_1_3);
                        } else if (stepLength >= 4 && stepLength <= 6) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_4_6);
                        } else if (stepLength >= 7 && stepLength <= 9) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_7_9);
                        } else if (stepLength >= 10 && stepLength <= 30) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_10_30);
                        } else if (stepLength > 30 && stepLength <= 60) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_30_60);
                        } else if (stepLength > 60) {
                            sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_60);
                        }
                    }
                });
        return filteredSessionId2AggrInfoRDD;
    }
}