MapReduce實踐攻略
超詳細入門級-WordCount
問題描述:
統計一個文件中,各種單詞出現的次數
思路分析:
1. 在map階段,對每行數據調用一次map方法,對讀取到的每行數據按空格進行切割,將分割得到的每個單詞作爲key,value的值給定爲1傳遞給reduce
2. 在reduce階段,從map接收到傳遞過來的key和value,key值相同的爲同一組,對每一組只調用一次reduce方法,將每一組的value值累加即可得到該單詞出現的次數,最後將該組的key作爲key,累加後的結果作爲value輸出
/**
 * Word-count job: the mapper emits {@code (word, 1)} for every space-separated token of
 * each input line and the reducer sums those ones per word.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class WordCountMR2 extends Configured implements Tool {

    /**
     * Mapper type parameters:
     * <ul>
     *   <li>KEYIN: by default the starting offset of the line read by the framework. Hadoop
     *       has its own compact serialization, so {@code LongWritable} is used instead of
     *       {@code Long}.</li>
     *   <li>VALUEIN: by default the content of the line; {@code Text} instead of
     *       {@code String}.</li>
     *   <li>KEYOUT: the key produced by the user logic, here the word ({@code Text}).</li>
     *   <li>VALUEOUT: the value produced by the user logic, here the word's count
     *       ({@code IntWritable}).</li>
     * </ul>
     */
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count emitted for every occurrence of a word. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key, reused across calls to avoid one allocation per token. */
        private final Text word = new Text();

        /**
         * Called once per input line; splits the line on spaces and emits {@code (word, 1)}
         * for every non-empty token.
         *
         * @param key     offset of the line (unused)
         * @param value   the line content
         * @param context used to emit the output pairs
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString(), " ");
            while (tokens.hasMoreTokens()) {
                String token = tokens.nextToken().trim();
                // Only skip tokens that are empty after trimming; one-character words
                // such as "a" are real words and must be counted.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /**
     * Reducer type parameters: KEYIN/VALUEIN match the mapper's KEYOUT/VALUEOUT; KEYOUT is
     * the word and VALUEOUT is its total count.
     */
    public static class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Called once per distinct word; sums all counts received for that word.
         *
         * <p>The reducer input is the mapper output grouped by key, e.g.
         * {@code <hello,1><hello,1><hello,1>}, {@code <tom,1><tom,1>}, {@code <good,1>}.
         *
         * @param key     the word
         * @param values  the counts emitted for this word
         * @param context used to emit {@code (word, total)}
         * @throws IOException          if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum value by value. Collecting the values into a Set first would merge
            // equal counts (every value is 1) and always yield a total of 1.
            int total = 0;
            for (IntWritable value : values) {
                total += value.get();
            }
            context.write(key, new IntWritable(total));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        // Create the job instance.
        Job job = Job.getInstance(conf, "test_fun_wordcount2");
        // Locate the jar to ship by the class it contains.
        job.setJarByClass(this.getClass());
        // Mapper and reducer implementations used by this job.
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);
        // Key/value types of the mapper output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Key/value types of the final output. Note these are setOutput*, not
        // setReduceOutput*: a map-only job writes the mapper output directly.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Directory holding the raw input files.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        // Directory receiving the results.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));
        // Number of reduce tasks to start.
        job.setNumReduceTasks(1);
        // Submit the configured job together with its jar and wait for it to finish.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Entry point; the process exit status reflects the job result so that calling
     * scripts can detect a failed job.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new WordCountMR2(), args));
    }
}
去重-DuplicateRemoveMR
問題描述:
去掉列表中所有重複的值,不考慮順序
思路分析:
將每一行的值按分隔符切開重新排序,然後再拼接起來作爲key,value置爲NullWritable類型,傳遞給reduce,reduce對相同的key只會輸出一次,以此達到去重複的效果。
/**
 * De-duplication job: each input line is treated as a comma-separated list whose element
 * order does not matter. The mapper normalizes the list (trim, sort, re-join) and uses it
 * as the key; the reducer writes each distinct key once.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class DuplicateRemoveMR extends Configured implements Tool {

    /** Emits the normalized form of every line as the key, with a null value. */
    public static class DRMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

        /**
         * Splits the line on commas, trims and sorts the elements and emits the re-joined
         * list as the key.
         *
         * @param key     offset of the line (unused)
         * @param value   the line content
         * @param context used to emit the normalized line
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String normalized = Collections.list(new StringTokenizer(value.toString(), ","))
                    .stream()
                    .map(token -> ((String) token).trim())
                    // Drop only empty elements; one-character values are valid data.
                    .filter(token -> !token.isEmpty())
                    .sorted()
                    .collect(Collectors.joining(","));
            // A line without any element would otherwise add a blank record to the output.
            if (normalized.isEmpty()) {
                return;
            }
            context.write(new Text(normalized), NullWritable.get());
        }
    }

    /** Writes every distinct key exactly once. */
    public static class DRReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        /**
         * Invoked once per distinct key; the values are ignored and the key is written
         * once, which removes the duplicates.
         */
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "dup_remove_xj");
        job.setJarByClass(DuplicateRemoveMR.class);

        job.setMapperClass(DRMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(DRReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new DuplicateRemoveMR(), args));
    }
}
倒置索引-InvertIndexMR
問題描述:
統計不同文件中單詞出現的次數,還要輸出該單詞存在於哪些文件中
思路分析:
輸入的每一行按分隔符切割成一個個單詞,作爲key,當前文件路徑作爲value傳遞給reduce,在reduce階段統計相同key的個數即爲單詞個數,然後映射輸出形式和拼接value的值,最後將單詞作爲key,單詞個數和拼接起來的文件路徑作爲value輸出。
/**
 * Inverted-index job: for every word, reports the files it occurs in together with the
 * number of occurrences per file.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class InvertIndexMR extends Configured implements Tool {

    /** Emits {@code (word, fileName)} for every space-separated word of the split. */
    public static class IIMapper extends Mapper<LongWritable, Text, Text, Text> {

        /** Name of the file backing the current input split; filled in by {@link #setup}. */
        private final Text file = new Text();

        /** Output key, reused across calls. */
        private final Text word = new Text();

        /**
         * Splits the line on spaces and emits {@code (word, fileName)} for every
         * non-empty token.
         *
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString(), " ");
            while (tokens.hasMoreTokens()) {
                String token = tokens.nextToken().trim();
                // Only skip empty tokens; one-character words belong in the index too.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, file);
            }
        }

        /**
         * Runs before the first {@code map} call and records the name of the file the
         * current split belongs to. The cast assumes a file-based input format.
         */
        @Override
        protected void setup(Context context) {
            String name = ((FileSplit) context.getInputSplit()).getPath().getName();
            file.set(name);
        }
    }

    /** Aggregates the file names of one word into {@code file:count} entries. */
    public static class IIReducer extends Reducer<Text, Text, Text, Text> {

        /**
         * Counts how often each file name appears among the values and writes the word
         * with a space-separated list of {@code fileName:count} entries.
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // StreamSupport.stream(...) turns the Iterable into a Stream. Each value is
            // converted to a String right away, so no Text reference is retained.
            String postings = StreamSupport.stream(values.spliterator(), false)
                    .collect(Collectors.groupingBy(Text::toString, Collectors.counting()))
                    .entrySet().stream()
                    .map(entry -> entry.getKey() + ":" + entry.getValue())
                    .collect(Collectors.joining(" "));
            context.write(key, new Text(postings));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "invert_index_xj");
        job.setJarByClass(InvertIndexMR.class);

        job.setMapperClass(IIMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(IIReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new InvertIndexMR(), args));
    }
}
共現矩陣-ConcurrenceMR
問題描述:
求出兩兩共同好友出現的次數。例如,甲好友列表有1和2,乙好友列表也有1和2,那麼1和2共現的次數爲2,共現次數越大,說明兩者關聯的可能性越大。
思路分析:
第一步,先輸出每個人的所有好友。第二步,map階段循環每個人的好友兩兩組合的結果並排序,將所有的兩兩組合分別作爲key,value置爲1輸出,reduce階段直接統計相同key的個數即爲兩兩共同好友數。
第一步:FlatFriendsMR
/**
 * First step of the co-occurrence computation: turns {@code person,friend} lines into one
 * record per person holding the comma-separated list of all of that person's friends.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class FlatFriendsMR extends Configured implements Tool {

    /** Emits {@code (person, friend)} for every well-formed {@code person,friend} line. */
    static class FFMapper extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * Parses one line; lines that do not split into exactly two comma-separated
         * fields, or whose fields are empty after trimming, are skipped.
         *
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fields.length != 2) {
                return;
            }
            String person = fields[0].trim();
            String friend = fields[1].trim();
            // An empty side would create a nameless person or friend downstream.
            if (person.isEmpty() || friend.isEmpty()) {
                return;
            }
            context.write(new Text(person), new Text(friend));
        }
    }

    /** Joins all friends of one person into a single comma-separated value. */
    static class FFReducer extends Reducer<Text, Text, Text, Text> {

        /** Writes {@code (person, "friend1,friend2,...")}. */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Each value is converted to a String immediately, so no Text reference is kept.
            String friends = StreamSupport.stream(values.spliterator(), false)
                    .map(Text::toString)
                    .collect(Collectors.joining(","));
            context.write(key, new Text(friends));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "flat_friends_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(FFMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FFReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new FlatFriendsMR(), args));
    }
}
第二步:ConcurrenceMR
/**
 * Second step of the co-occurrence computation: reads {@code person<TAB>friend1,friend2,...}
 * records and counts, for every unordered pair of friends, in how many friend lists the
 * pair appears together.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class ConcurrenceMR extends Configured implements Tool {

    /** Emits {@code (pair, 1)} for every two-element combination of a friend list. */
    static class CCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count emitted for every pair occurrence. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Output key, reused across calls. */
        private final Text pairKey = new Text();

        /**
         * Splits the record on the tab, then emits every combination of two friends from
         * the comma-separated list in the second field. Records without a second field
         * are skipped.
         *
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            // A line without a tab has no friend list; indexing fields[1] would throw.
            if (fields.length < 2) {
                return;
            }
            String[] names = fields[1].split(",");
            // Emit all two-element combinations of the friend list.
            for (int i = 0; i < names.length - 1; i++) {
                for (int j = i + 1; j < names.length; j++) {
                    pairKey.set(getPair(names[i], names[j]));
                    context.write(pairKey, ONE);
                }
            }
        }

        /**
         * Builds the key for a pair with its two names in ascending order, so that
         * {@code (a, b)} and {@code (b, a)} produce the same key.
         *
         * @param first  one name of the pair
         * @param second the other name of the pair
         * @return the two names joined by a comma, smaller name first
         */
        public String getPair(String first, String second) {
            return first.compareTo(second) > 0
                    ? second + "," + first
                    : first + "," + second;
        }
    }

    /**
     * Sums the counts of one pair. The input value type is {@code IntWritable} to match
     * the mapper output configured in {@link #run}.
     */
    static class CCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Adds up all counts received for the pair and writes {@code (pair, total)}.
         *
         * @throws IOException          if writing the result fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the values rather than count the records, so the result stays correct
            // if the values are ever pre-aggregated.
            int total = 0;
            for (IntWritable value : values) {
                total += value.get();
            }
            context.write(key, new IntWritable(total));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "concurrence_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(CCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(CCReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ConcurrenceMR(), args));
    }
}
MapReduce排序
局部排序-PartitionSortMR
問題描述:
將所有數據根據氣溫排序,每個分區之間不存在排序關係,僅在各個區內部進行排序
思路分析:
默認排序方式,只需要將key設置爲溫度即可
/**
 * Partial (per-partition) sort: records are keyed by temperature so that each reducer
 * output is sorted by temperature, with no ordering between different partitions.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class PartitionSortMR extends Configured implements Tool {

    /** Uses the temperature column as key and the whole line as value. */
    public static class PSMapper extends Mapper<LongWritable, Text, DoubleWritable, Text> {

        /** Output key, reused across calls. */
        private final DoubleWritable temperature = new DoubleWritable();

        /**
         * Parses the third tab-separated column as the temperature and emits
         * {@code (temperature, line)}. A line with fewer than three columns or a
         * non-numeric third column makes this method throw and fails the task.
         *
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] columns = value.toString().split("\t");
            temperature.set(Double.parseDouble(columns[2]));
            context.write(temperature, value);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "part_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(PSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        // The base Reducer class is used as-is; no custom reduce logic is needed.
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        // Same setting as -D mapreduce.job.reduces
        job.setNumReduceTasks(5);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new PartitionSortMR(), args));
    }
}
全局排序-TotalSortMR
問題描述:
將所有數據根據氣溫排序,每個分區之間也存在排序關係
思路分析:
設置成根據樣本分區排序。採樣器讀取的是輸入數據的key,因此必須保證輸入key的類型與map輸出key的類型前後一致,故無法使用默認的輸入格式(其key爲行偏移量)。可以修改InputFormat或者使用sequencefile,因爲sequencefile可以保存數據類型。案例中使用後一種方法:先將數據轉化爲sequencefile,然後直接從sequencefile讀取數據進行分區排序。
OutSequenceMR
/**
 * Converts the tab-separated text input into a SequenceFile keyed by temperature, so that
 * a later job can read typed {@code (DoubleWritable, Text)} records directly.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class OutSequenceMR extends Configured implements Tool {

    /** Uses the temperature column as key and the whole line as value. */
    public static class OSMapper extends Mapper<LongWritable, Text, DoubleWritable, Text> {

        /** Output key, reused across calls. */
        private final DoubleWritable temperature = new DoubleWritable();

        /**
         * Parses the third tab-separated column as the temperature and emits
         * {@code (temperature, line)}. A line with fewer than three columns or a
         * non-numeric third column makes this method throw and fails the task.
         *
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] columns = value.toString().split("\t");
            temperature.set(Double.parseDouble(columns[2]));
            context.write(temperature, value);
        }
    }

    /**
     * Configures and runs the job. The number of reduce tasks is not set here; it comes
     * from the configuration (for example {@code -D mapreduce.job.reduces}).
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "out_sequence_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(OSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        // The base Reducer class is used as-is; no custom reduce logic is needed.
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        SequenceFileOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new OutSequenceMR(), args));
    }
}
TotalSortMR
/**
 * Total (global) sort by temperature: reads the {@code (DoubleWritable, Text)}
 * SequenceFile and partitions by sampled key ranges via {@code TotalOrderPartitioner},
 * so that the reducer outputs are ordered relative to each other as well as internally.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class TotalSortMR extends Configured implements Tool {

    /** Identity mapper: passes every {@code (temperature, line)} record through unchanged. */
    public static class TSMapper extends Mapper<DoubleWritable, Text, DoubleWritable, Text> {

        /** Forwards the record as-is. */
        @Override
        protected void map(DoubleWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    /**
     * Configures the job, samples the input to build the partition file and runs the job.
     * The number of reduce tasks is not set here; it comes from the configuration.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured, sampled or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "total_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(TSMapper.class);
        job.setMapOutputKeyClass(DoubleWritable.class);
        job.setMapOutputValueClass(Text.class);

        // The base Reducer class is used as-is; no custom reduce logic is needed.
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(DoubleWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        // Partition by key ranges derived from a sample of the input.
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Random sampler arguments (per the Hadoop InputSampler API):
        //   0.8  - probability with which a key is chosen
        //   1000 - total number of samples to collect
        //   10   - maximum number of input splits to examine (not a partition limit)
        InputSampler.RandomSampler<DoubleWritable, Text> sampler =
                new InputSampler.RandomSampler<>(0.8, 1000, 10);

        // Write the sampled split points to the partition file and ship it to the tasks.
        // NOTE(review): the split points are presumably derived from the reducer count
        // known at this point - confirm it is configured before sampling.
        InputSampler.writePartitionFile(job, sampler);
        String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
        job.addCacheFile(URI.create(partitionFile));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new TotalSortMR(), args));
    }
}
二次排序-SecondarySortMR
問題描述:
將所有數據先根據年份升序排列,再根據氣溫降序排列
思路分析:
要進行二次排序,必須要創建一個複合類型作爲key來進行排序比較,這個複合類型實現WritableComparable接口,包含年份和氣溫兩個屬性,重寫compareTo()方法,按年份升序,按氣溫降序。除此之外,要實現二次排序必須保證相同年份的被分到同一個分區,這樣纔可以比較氣溫。因此,還需要定義一個類來繼承Partitioner抽象類,重寫getPartition()方法,使分區根據年份來劃分。另外,還需手動設置根據年份進行分組,故還需要創建一個類繼承WritableComparator類,重寫compare()方法,將相同年份的分爲同一組。最後,在主類中將複合類型作爲map的key的輸出類型,完成排序,在job上設置自定義的分區規則和分組規則。
YearTmp(複合類型)
/**
 * Composite key for the secondary sort: a year plus a temperature. Keys order by year
 * ascending and, within the same year, by temperature descending.
 */
public class YearTmp implements WritableComparable<YearTmp> {

    /** Year. */
    private IntWritable year = new IntWritable();

    /** Average temperature. */
    private DoubleWritable tmp = new DoubleWritable();

    /** Creates an empty key; the fields are filled in by {@link #readFields}. */
    public YearTmp() {
    }

    /** Creates a key holding copies of the given values. */
    public YearTmp(IntWritable year, DoubleWritable tmp) {
        this.year = new IntWritable(year.get());
        this.tmp = new DoubleWritable(tmp.get());
    }

    /** Creates a key from primitive values. */
    public YearTmp(int year, double tmp) {
        this.year = new IntWritable(year);
        this.tmp = new DoubleWritable(tmp);
    }

    /** @return the year */
    public IntWritable getYear() {
        return year;
    }

    /** Stores a copy of the given year. */
    public void setYear(IntWritable year) {
        this.year = new IntWritable(year.get());
    }

    /** @return the temperature */
    public DoubleWritable getTmp() {
        return tmp;
    }

    /** Stores a copy of the given temperature. */
    public void setTmp(DoubleWritable tmp) {
        this.tmp = new DoubleWritable(tmp.get());
    }

    /**
     * Sort order: year ascending, then temperature descending.
     *
     * @param o the key to compare with
     * @return negative, zero or positive as this key sorts before, equal to or after
     *         {@code o}
     */
    @Override
    public int compareTo(YearTmp o) {
        int byYear = this.year.compareTo(o.year);
        if (byYear != 0) {
            return byYear;
        }
        // Operands swapped on purpose: a higher temperature sorts first.
        return o.tmp.compareTo(this.tmp);
    }

    /** Two keys are equal when both the year and the temperature are equal. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof YearTmp)) {
            return false;
        }
        YearTmp other = (YearTmp) obj;
        return year.equals(other.year) && tmp.equals(other.tmp);
    }

    /** Hash code consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return 31 * year.hashCode() + tmp.hashCode();
    }

    /** @return the year and the temperature separated by a tab */
    @Override
    public String toString() {
        return year + "\t" + tmp;
    }

    /** Serializes the year followed by the temperature. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        year.write(dataOutput);
        tmp.write(dataOutput);
    }

    /** Reads the fields in the order written by {@link #write}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        year.readFields(dataInput);
        tmp.readFields(dataInput);
    }
}
YearPartitioner(自定義分區規則)
/**
 * Partitions records by year only, so that all records of one year reach the same
 * reducer regardless of their temperature.
 */
public class YearPartitioner extends Partitioner<YearTmp, Text> {

    public YearPartitioner() {
    }

    /**
     * Maps the key's year onto a partition index.
     *
     * @param o  the composite key; only its year is used
     * @param o2 the record value (unused)
     * @param i  the number of partitions
     * @return a partition index in {@code [0, i)}
     */
    @Override
    public int getPartition(YearTmp o, Text o2, int i) {
        // floorMod keeps the result non-negative even for a negative year, where the
        // plain % operator would yield a negative partition index.
        return Math.floorMod(o.getYear().get(), i);
    }
}
YearGroupComparator(自定義分組規則)
/**
 * Grouping comparator for the secondary sort: two {@code YearTmp} keys compare as equal
 * whenever their years are equal, so the temperature plays no part in grouping.
 */
public class YearGroupComparator extends WritableComparator {

    /** Registers {@code YearTmp} as the key class, passing {@code true} to the base class. */
    public YearGroupComparator() {
        super(YearTmp.class, true);
    }

    /**
     * Compares two keys by year only.
     *
     * @param a the first key; must be a {@code YearTmp}
     * @param b the second key; must be a {@code YearTmp}
     * @return the result of comparing the two years
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((YearTmp) a).getYear().compareTo(((YearTmp) b).getYear());
    }
}
SecondarySortMR(MR主程序)
/**
 * Secondary sort driver: orders records by year ascending and, within a year, by
 * temperature descending, using the composite {@code YearTmp} key together with
 * {@code YearPartitioner} and {@code YearGroupComparator}.
 *
 * <p>The input and output locations are read from the {@code inpath} and {@code outpath}
 * configuration properties.
 */
public class SecondarySortMR extends Configured implements Tool {

    /** Builds the composite key from the first and third columns of each line. */
    public static class SSMapper extends Mapper<LongWritable, Text, YearTmp, Text> {

        /**
         * Splits the line on tabs and emits {@code (YearTmp(column 0, column 2), column 1)}.
         * A line with fewer than three columns or non-numeric year/temperature columns
         * makes this method throw and fails the task.
         *
         * @throws IOException          if writing the output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] infos = value.toString().split("\t");
            int year = Integer.parseInt(infos[0]);
            double temperature = Double.parseDouble(infos[2]);
            context.write(new YearTmp(year, temperature), new Text(infos[1]));
        }
    }

    /** Writes one {@code year<TAB>temperature} line per value of a year group. */
    public static class SSReducer extends Reducer<YearTmp, Text, Text, Text> {

        /**
         * Emits {@code ("year\ttemperature", value)} for every value of the group.
         *
         * @throws IOException          if writing an output pair fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(YearTmp key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                // Read the key inside the loop: a group spans every key of one year and
                // the framework updates the key object as the values are iterated, so
                // each value is paired with its own temperature.
                String yearAndTmp = key.getYear() + "\t" + key.getTmp();
                context.write(new Text(yearAndTmp), value);
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param strings remaining command-line arguments (unused)
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "secondary_sort_xj");
        job.setJarByClass(this.getClass());

        job.setMapperClass(SSMapper.class);
        job.setMapOutputKeyClass(YearTmp.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(SSReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(conf.get("inpath")));
        TextOutputFormat.setOutputPath(job, new Path(conf.get("outpath")));

        // Partitioning rule: by year.
        job.setPartitionerClass(YearPartitioner.class);
        // Grouping rule: keys with the same year form one reduce group.
        job.setGroupingComparatorClass(YearGroupComparator.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point; the process exit status reflects the job result. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new SecondarySortMR(), args));
    }
}