重寫 AccumulatorParam 累加器(計數器)

一:重點方法


zero
addInPlace
addAccumulator

作用:作爲Spark自己封裝的累加器(計數器),可以讓我們不必自行處理分佈式環境下的數據一致性等問題,使用起來比較方便。

二:代碼

 

重寫類:

package com.TomYang.sparkProject.spark.session;

import org.apache.spark.AccumulatorParam;
import com.TomYang.sparkProject.constant.Constants;
import com.TomYang.sparkProject.util.StringUtils;

/**
 * Accumulator parameter that keeps every session statistic in a single
 * concatenated string of the form {@code field=count|field=count|...}.
 *
 * <p>The field names are the {@code Constants.SESSION_COUNT},
 * {@code Constants.TIME_PERIOD_*} and {@code Constants.STEP_PERIOD_*} values
 * listed in {@link #zero(String)}.
 */
public class SessionAggrStatAccumulator implements AccumulatorParam<String> {

	private static final long serialVersionUID = 6311074555136039130L;

	/** Regex used to split the concatenated string into {@code field=count} pairs. */
	private static final String FIELD_DELIMITER = "\\|";

	/**
	 * Returns the "zero" (identity) value of the accumulator: every known
	 * field with a count of 0.
	 *
	 * @param v the initial value supplied when the accumulator was created (ignored)
	 * @return the concatenated string with all counters set to 0
	 */
	@Override
	public String zero(String v) {
		return Constants.SESSION_COUNT + "=0|"
				+ Constants.TIME_PERIOD_1s_3s + "=0|"
				+ Constants.TIME_PERIOD_4s_6s + "=0|"
				+ Constants.TIME_PERIOD_7s_9s + "=0|"
				+ Constants.TIME_PERIOD_10s_30s + "=0|"
				+ Constants.TIME_PERIOD_30s_60s + "=0|"
				+ Constants.TIME_PERIOD_1m_3m + "=0|"
				+ Constants.TIME_PERIOD_3m_10m + "=0|"
				+ Constants.TIME_PERIOD_10m_30m + "=0|"
				+ Constants.TIME_PERIOD_30m + "=0|"
				+ Constants.STEP_PERIOD_1_3 + "=0|"
				+ Constants.STEP_PERIOD_4_6 + "=0|"
				+ Constants.STEP_PERIOD_7_9 + "=0|"
				+ Constants.STEP_PERIOD_10_30 + "=0|"
				+ Constants.STEP_PERIOD_30_60 + "=0|"
				+ Constants.STEP_PERIOD_60 + "=0";
	}

	/**
	 * Merges two accumulator values.
	 *
	 * <p>If {@code v2} is a bare field name (contains no {@code '='}) this
	 * behaves exactly like {@link #addAccumulator(String, String)}. If
	 * {@code v2} is itself a {@code field=count|...} string, each of its
	 * counts is added to the matching field of {@code v1}, so a partial
	 * result is not discarded.
	 *
	 * @param v1 the current concatenated value (may be empty)
	 * @param v2 a field name or another concatenated value
	 * @return the merged concatenated value
	 */
	@Override
	public String addInPlace(String v1, String v2) {
		return merge(v1, v2);
	}

	/**
	 * Adds one occurrence of the field named by {@code v2} to {@code v1}.
	 *
	 * @param v1 the current concatenated value (may be empty)
	 * @param v2 the name of the field to increment, e.g. {@code Constants.TIME_PERIOD_1s_3s}
	 * @return the updated concatenated value
	 */
	@Override
	public String addAccumulator(String v1, String v2) {
		return add(v1, v2);
	}

	/**
	 * Increments by one the counter of field {@code v2} inside {@code v1}.
	 *
	 * @param v1 the concatenated string; when empty, {@code v2} is returned as-is
	 * @param v2 the field (range) name whose counter is incremented
	 * @return the updated concatenated string, or {@code v1} unchanged when the
	 *         field lookup yields {@code null}
	 */
	private String add(String v1, String v2) {
		if (StringUtils.isEmpty(v1)) {
			return v2;
		}
		return addCount(v1, v2, 1);
	}

	/**
	 * Combines two values, handling both a bare field name and a full
	 * {@code field=count|...} string on the right-hand side.
	 */
	private String merge(String v1, String v2) {
		if (StringUtils.isEmpty(v1)) {
			return v2;
		}
		if (StringUtils.isEmpty(v2)) {
			return v1;
		}
		if (v2.indexOf('=') < 0) {
			// A bare field name: same behaviour as a single increment.
			return addCount(v1, v2, 1);
		}

		String merged = v1;
		for (String pair : v2.split(FIELD_DELIMITER)) {
			int eq = pair.indexOf('=');
			if (eq <= 0) {
				continue; // not a field=count pair
			}
			int delta = Integer.parseInt(pair.substring(eq + 1));
			if (delta != 0) {
				merged = addCount(merged, pair.substring(0, eq), delta);
			}
		}
		return merged;
	}

	/**
	 * Adds {@code delta} to the counter of {@code field} inside {@code concat}.
	 * A {@code null} lookup result is treated as "field not present" and
	 * leaves {@code concat} unchanged.
	 */
	private String addCount(String concat, String field, int delta) {
		String oldValue = StringUtils.getFieldFromConcatString(concat, FIELD_DELIMITER, field);
		if (oldValue == null) {
			return concat;
		}
		int newValue = Integer.parseInt(oldValue) + delta;
		// The write-back must use the same delimiter as the lookup above.
		return StringUtils.setFieldInConcatString(concat, FIELD_DELIMITER, field, String.valueOf(newValue));
	}

}

 

使用類:

package com.TomYang.sparkProject.spark.session;


import groovy.ui.SystemOutputInterceptor;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.storage.StorageLevel;

import scala.Tuple2;

import com.TomYang.sparkProject.conf.ConfigurationManager;
import com.TomYang.sparkProject.constant.Constants;
import com.TomYang.sparkProject.dao.ITaskDAO;
import com.TomYang.sparkProject.dao.factory.DAOFactory;
import com.TomYang.sparkProject.domain.Task;
import com.TomYang.sparkProject.test.MockData;
import com.TomYang.sparkProject.util.DateUtils;
import com.TomYang.sparkProject.util.ParamUtils;
import com.TomYang.sparkProject.util.StringUtils;
import com.TomYang.sparkProject.util.ValidUtils;
import com.alibaba.fastjson.JSONObject;

/**
 * Driver that shows how {@link SessionAggrStatAccumulator} is used: every
 * session passing through {@code filterSession} is counted and classified
 * by visit length and step length.
 */
@SuppressWarnings("unused")
public class UserVisitSessionAnalyzeSpark {

	/**
	 * Creates a local Spark context, registers the session accumulator,
	 * runs the filter and prints how many sessions came out of it.
	 *
	 * @param args command-line arguments (currently replaced by a fixed value)
	 */
	public static void main(String[] args){
		// NOTE(review): the real command-line arguments are overwritten with a hard-coded value.
		args = new String[]{"2"};
		SparkConf conf = new SparkConf()
				.setAppName(Constants.SPARK_APP_NAME_SESSION)
				.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);
		try {
			SQLContext sqlContext = getSQLContext(sc.sc());
			// Accumulator holding the session count plus the visit-length and step-length buckets.
			Accumulator<String> sessionAggrStatAccumulator = sc.accumulator("",new SessionAggrStatAccumulator());
			// Filter the sessions (and update the accumulator while doing so).
			// NOTE(review): sessionId2AggrInfoRDD and taskParam are not declared in this excerpt;
			// they must be produced by code that is not shown here.
			JavaPairRDD<String,String> filteredSessionid2AggrInfoRDD = filterSession(sessionId2AggrInfoRDD, taskParam,sessionAggrStatAccumulator);
			// count() is the action that actually evaluates the filter.
			System.out.println("filteredSessionid2AggrInfoRDD:"+filteredSessionid2AggrInfoRDD.count());
		} finally {
			// Always release the Spark context, even when the job fails.
			sc.close();
		}
	}

	/**
	 * Picks the SQL context implementation based on configuration.
	 *
	 * @param sc the underlying Spark context
	 * @return a plain {@code SQLContext} when {@code Constants.SPARK_LOCAL} is true,
	 *         otherwise a {@code HiveContext}
	 */
	private static SQLContext getSQLContext(SparkContext sc){
		boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
		if(local){
			return new SQLContext(sc);
		}else{
			return new HiveContext(sc);
		}
	}

	/**
	 * Passes every session through a filter that records statistics in the
	 * accumulator. In this excerpt the filter keeps every record; the
	 * assembled condition string is only printed.
	 *
	 * @param sessionId2AggreInfoRDD pairs of session id and aggregated-info string
	 *        (fields separated by {@code |}, read via {@code StringUtils.getFieldFromConcatString})
	 * @param taskParam task parameters from which the filter conditions are read
	 * @param sessionAggrStatAccumulator accumulator receiving the session count and range buckets
	 * @return the filtered RDD
	 */
	private static JavaPairRDD<String,String> filterSession(JavaPairRDD<String, String> sessionId2AggreInfoRDD, final JSONObject taskParam,
			final Accumulator<String> sessionAggrStatAccumulator){
		String startAge = ParamUtils.getParam(taskParam, Constants.PARAM_START_AGE);
		String endAge = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE);
		// The users' professions.
		String professionals = ParamUtils.getParam(taskParam, Constants.PARAM_PROFESSIONALS);
		String cities = ParamUtils.getParam(taskParam, Constants.PARAM_CITIES);
		String sex = ParamUtils.getParam(taskParam, Constants.PARAM_SEX);
		String keywords = ParamUtils.getParam(taskParam, Constants.PARAM_KEYWORDS);
		String categoryIds = ParamUtils.getParam(taskParam, Constants.PARAM_CATEGORY_IDS);

		// Assemble the conditions as "name=value|name=value|...", skipping absent ones.
		String _parameter = (startAge != null ? Constants.PARAM_START_AGE + "=" + startAge + "|" : "")
				+ (endAge != null ? Constants.PARAM_END_AGE + "=" + endAge + "|" : "")
				+ (professionals != null ? Constants.PARAM_PROFESSIONALS + "=" + professionals + "|" : "")
				+ (cities != null ? Constants.PARAM_CITIES + "=" + cities + "|" : "")
				+ (sex != null ? Constants.PARAM_SEX + "=" + sex + "|" : "")
				+ (keywords != null ? Constants.PARAM_KEYWORDS + "=" + keywords + "|" : "")
				+ (categoryIds != null ? Constants.PARAM_CATEGORY_IDS + "=" + categoryIds: "");

		// Drop a trailing separator. endsWith takes a literal string, not a regex,
		// so the pipe must not be escaped here.
		if(_parameter.endsWith("|")){
			_parameter = _parameter.substring(0,_parameter.length() - 1);
		}

		// Final copy, so the value could be captured by the anonymous class below.
		final String parameter = _parameter;

		System.out.println("拼接條件:"+ parameter);

		JavaPairRDD<String, String> filteredSessionId2AggrInfoRDD = sessionId2AggreInfoRDD.filter(new Function<Tuple2<String,String>,Boolean>(){

			private static final long serialVersionUID = 1L;

			/**
			 * Counts the session, classifies its visit length and step length,
			 * and keeps the record.
			 *
			 * @throws NumberFormatException if a length field cannot be parsed as a long
			 */
			@Override
			public Boolean call(Tuple2<String, String> tuple) throws Exception {
				String aggrInfo = tuple._2;
				sessionAggrStatAccumulator.add(Constants.SESSION_COUNT);
				long visitLength = Long.valueOf(StringUtils.getFieldFromConcatString(
						aggrInfo, "\\|", Constants.FIELD_VISIT_LENGTH));
				long stepLength = Long.valueOf(StringUtils.getFieldFromConcatString(
						aggrInfo, "\\|", Constants.FIELD_STEP_LENGTH));
				calculateVisitLength(visitLength);
				calculateStepLength(stepLength);
				return true;
			}

			/**
			 * Adds the session to the visit-length bucket matching {@code visitLength}.
			 * Values below 1 match no bucket and are not counted.
			 */
			private void calculateVisitLength(long visitLength){
				if(visitLength >=1 && visitLength <= 3) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1s_3s);
				} else if(visitLength >=4 && visitLength <= 6) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_4s_6s);
				} else if(visitLength >=7 && visitLength <= 9) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_7s_9s);
				} else if(visitLength >=10 && visitLength <= 30) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10s_30s);
				} else if(visitLength > 30 && visitLength <= 60) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30s_60s);
				} else if(visitLength > 60 && visitLength <= 180) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_1m_3m);
				} else if(visitLength > 180 && visitLength <= 600) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_3m_10m);
				} else if(visitLength > 600 && visitLength <= 1800) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_10m_30m);
				} else if(visitLength > 1800) {
					sessionAggrStatAccumulator.add(Constants.TIME_PERIOD_30m);
				}
			}

			/**
			 * Adds the session to the step-length bucket matching {@code stepLength}.
			 * Values below 1 match no bucket and are not counted.
			 */
			private void calculateStepLength(long stepLength) {
				if(stepLength >= 1 && stepLength <= 3) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_1_3);
				} else if(stepLength >= 4 && stepLength <= 6) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_4_6);
				} else if(stepLength >= 7 && stepLength <= 9) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_7_9);
				} else if(stepLength >= 10 && stepLength <= 30) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_10_30);
				} else if(stepLength > 30 && stepLength <= 60) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_30_60);
				} else if(stepLength > 60) {
					sessionAggrStatAccumulator.add(Constants.STEP_PERIOD_60);
				}
			}
		});
		return filteredSessionId2AggrInfoRDD;
	}

}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章