I. Problem Description
Building on the Hadoop serialization example (http://blog.csdn.net/gaijianwei/article/details/46004025), partition the output data by the carrier (service provider) that each phone number belongs to.
II. Implementation
DataCount code (only slightly modified from the DataCount code in the Hadoop serialization example):
package edu.jianwei.hadoop.mr;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DataCount {

    static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean> {
        private Text k = new Text();
        private DataBean v = new DataBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is tab-separated; field 1 is the phone number,
            // fields 8 and 9 are the upload and download traffic.
            String line = value.toString();
            String[] words = line.split("\t");
            String telNum = words[1];
            double upLoad = Double.parseDouble(words[8]);
            double downLoad = Double.parseDouble(words[9]);
            k.set(telNum);
            v.Set(telNum, upLoad, downLoad);
            context.write(k, v);
        }
    }

    static class DCReduce extends Reducer<Text, DataBean, Text, DataBean> {
        private DataBean v = new DataBean();

        @Override
        protected void reduce(Text key, Iterable<DataBean> v2s, Context context)
                throws IOException, InterruptedException {
            // Sum the upload and download traffic per phone number.
            double upTotal = 0;
            double downTotal = 0;
            for (DataBean d : v2s) {
                upTotal += d.getUpLoad();
                downTotal += d.getDownload();
            }
            v.Set("", upTotal, downTotal);
            context.write(key, v);
        }
    }

    public static class DCPartitioner extends Partitioner<Text, DataBean> {
        // Map the first three digits of a phone number to a carrier-specific
        // partition number; unknown prefixes fall into partition 0.
        static Map<String, Integer> provider = new HashMap<String, Integer>();
        static {
            provider.put("139", 1);
            provider.put("138", 1);
            provider.put("152", 2);
            provider.put("153", 2);
            provider.put("182", 3);
            provider.put("183", 3);
        }

        @Override
        public int getPartition(Text k, DataBean value, int numPartitions) {
            String telSub = k.toString().substring(0, 3);
            Integer counter = provider.get(telSub);
            if (counter == null) {
                counter = 0;
            }
            return counter;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(DataCount.class);

        job.setMapperClass(DCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataBean.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setReducerClass(DCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DataBean.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setPartitionerClass(DCPartitioner.class);
        // The number of reduce tasks determines how many output partitions exist.
        job.setNumReduceTasks(Integer.parseInt(args[2]));

        job.waitForCompletion(true);
    }
}
The DataBean class is the same as the DataBean in the Hadoop serialization example.
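For reference, here is a minimal sketch of what that DataBean might look like, reconstructed from how it is used above (the Set/getUpLoad/getDownload methods and the three-column output values); the field names and the derived total are assumptions, not the original post's code:

package edu.jianwei.hadoop.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DataBean implements Writable {
    private String telNum;
    private double upLoad;
    private double downLoad;
    private double totalLoad; // assumed: total = upLoad + downLoad

    // Called by the mapper and reducer; keeps the method name used above.
    public void Set(String telNum, double upLoad, double downLoad) {
        this.telNum = telNum;
        this.upLoad = upLoad;
        this.downLoad = downLoad;
        this.totalLoad = upLoad + downLoad;
    }

    // Serialization order must match deserialization order exactly.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(telNum);
        out.writeDouble(upLoad);
        out.writeDouble(downLoad);
        out.writeDouble(totalLoad);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        telNum = in.readUTF();
        upLoad = in.readDouble();
        downLoad = in.readDouble();
        totalLoad = in.readDouble();
    }

    public double getUpLoad() {
        return upLoad;
    }

    public double getDownload() {
        return downLoad;
    }

    // Produces the three-column value seen in the output files.
    @Override
    public String toString() {
        return upLoad + "\t" + downLoad + "\t" + totalLoad;
    }
}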
III. Testing
1. Running the code (with 4 reduce tasks)
hadoop jar /root/dc.jar edu.jianwei.hadoop.mr.DataCount /dc /dc/res 4
2. Results
The full output is not listed here file by file; as an example, the data in part-r-00001:
13826544101 264.0 0.0 264.0
13922314466 3008.0 3720.0 6728.0
13925057413 11058.0 48243.0 59301.0
13926251106 240.0 0.0 240.0
13926435656 132.0 1512.0 1644.0
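Every number in this file begins with 138 or 139, which DCPartitioner maps to partition 1, so they all land in part-r-00001. As a quick local check of the mapping (a standalone snippet, not part of the original post; the 152- and 134-prefixed numbers are made up for illustration):

package edu.jianwei.hadoop.mr;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        DataCount.DCPartitioner p = new DataCount.DCPartitioner();
        // 139/138 -> 1, 152/153 -> 2, 182/183 -> 3, anything else -> 0
        System.out.println(p.getPartition(new Text("13926435656"), null, 4)); // 1
        System.out.println(p.getPartition(new Text("15201234567"), null, 4)); // 2
        System.out.println(p.getPartition(new Text("13412345678"), null, 4)); // 0
    }
}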
Note:
1'. Running the code (with 3 reduce tasks)
hadoop jar /root/dc.jar edu.jianwei.hadoop.mr.DataCount /dc/HTTP_20130313143750.dat /dc/res_3 3
2'. Results
The job fails: with only 3 reduce tasks there are only partitions 0-2, but getPartition returns 3 for numbers starting with 182 or 183, so the map tasks die with an "Illegal partition" IOException.
1''. Running the code (with 5 reduce tasks)
hadoop jar /root/dc.jar edu.jianwei.hadoop.mr.DataCount /dc/HTTP_20130313143750.dat /dc/res_5 5
2''. Results
The job completes, but because the partitioner only ever returns 0-3, the fifth output file (part-r-00004) is empty.
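If you want the job to survive a reduce-task count smaller than the number of carrier partitions, one option is to clamp the index with the modulo operator. This is a sketch of a defensive variant, not the original post's code; note it changes which reducer some carriers map to when numPartitions is small:

@Override
public int getPartition(Text k, DataBean value, int numPartitions) {
    Integer counter = provider.get(k.toString().substring(0, 3));
    // Clamp so the index is always valid for the configured reduce-task count.
    return (counter == null ? 0 : counter) % numPartitions;
}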