mapreduce的二次排序 SecondarySort

原創

2020-02-23 11:31

在看Hadoop The definitive guide 時，關於二次排序，在設置好setGroupingComparatorClass 後一直不明白爲什麼reduce的入參就是要查詢的年最高溫度，代碼裏沒有看到是怎麼實現的：

代碼：

// cc MaxTemperatureUsingSecondarySort Application to find the maximum temperature by sorting temperatures in the key
import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// vv MaxTemperatureUsingSecondarySort
public class MaxTemperatureUsingSecondarySort
  extends Configured implements Tool {
  
  static class MaxTemperatureMapper
    extends Mapper<LongWritable, Text, IntPair, NullWritable> {
  
    private NcdcRecordParser parser = new NcdcRecordParser();
    
    @Override
    protected void map(LongWritable key, Text value,
        Context context) throws IOException, InterruptedException {
      
      parser.parse(value);
      if (parser.isValidTemperature()) {
        /*[*/context.write(new IntPair(parser.getYearInt(),
            parser.getAirTemperature()), NullWritable.get());/*]*/
      }
    }
  }
  
  static class MaxTemperatureReducer
    extends Reducer<IntPair, NullWritable, IntPair, NullWritable> {
  
   <span style="color:#ff0000;"> @Override
    protected void reduce(IntPair key, Iterable<NullWritable> values,
        Context context) throws IOException, InterruptedException {
      
      /*[*/context.write(key, NullWritable.get());/*]*/
    }</span>
  }
  
  public static class FirstPartitioner
    extends Partitioner<IntPair, NullWritable> {

    @Override
    public int getPartition(IntPair key, NullWritable value, int numPartitions) {
      // multiply by 127 to perform some mixing
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }
  
  public static class KeyComparator extends WritableComparator {
    protected KeyComparator() {
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      int cmp = IntPair.compare(ip1.getFirst(), ip2.getFirst());
      if (cmp != 0) {
        return cmp;
      }
      return -IntPair.compare(ip1.getSecond(), ip2.getSecond()); //reverse
    }
  }

<span style="color:#ff0000;">//此處只是將相同的年份歸併爲一個組</span>
  public static class GroupComparator extends WritableComparator {
    protected GroupComparator() {
      super(IntPair.class, true);
    }
    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
      IntPair ip1 = (IntPair) w1;
      IntPair ip2 = (IntPair) w2;
      return IntPair.compare(ip1.getFirst(), ip2.getFirst());
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
      return -1;
    }
    
    job.setMapperClass(MaxTemperatureMapper.class);
    /*[*/job.setPartitionerClass(FirstPartitioner.class);/*]*/
    /*[*/job.setSortComparatorClass(KeyComparator.class);/*]*/
    /*[*/job.setGroupingComparatorClass(GroupComparator.class);/*]*/
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setOutputKeyClass(IntPair.class);
    job.setOutputValueClass(NullWritable.class);
    
    return job.waitForCompletion(true) ? 0 : 1;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new MaxTemperatureUsingSecondarySort(), args);
    System.exit(exitCode);
  }
}
// ^^ MaxTemperatureUsingSecondarySort

查詢後發現二次排序的園裏是：

1：在map階段的最後，會先調用job.setPartitionerClass對這個List進行分區，每個分區映射到一個reducer。每個分區內又調用job.setSortComparatorClass設置的key比較函數類排序。可以看到，這本身就是一個二次排序。如果沒有通過job.setSortComparatorClass設置key比較函數類，則使用key的實現的compareTo方法。

2：在reduce階段，reducer接收到所有映射到這個reducer的map輸出後，也是會調用job.setSortComparatorClass設置的key比較函數類對所有數據對排序。然後開始構造一個key對應的value迭代器。這時就要用到分組，使用jobjob.setGroupingComparatorClass設置的分組函數類。只要這個比較器比較的兩個key相同，他們就屬於同一個組，它們的value放在一個value迭代器，而這個迭代器的key使用屬於同一個組的所有key的第一個key。最後就是進入Reducer的reduce方法，reduce方法的輸入是所有的（key和它的value迭代器）。

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

mapreduce的二次排序 SecondarySort

.Net 8.0 下的新RPC，IceRPC之試試的新玩法"打洞"

完美替代postman的軟件

Vue mockjs mock.js

關於遊戲付費的一點想法

我通過CKA和CKS啦！

《最新出爐》系列入門篇-Python+Playwright自動化測試-42-強大的可視化追蹤利器Trace Viewer

大數據怎麼學？對大數據開發領域及崗位的詳細解讀，完整理解大數據開發領域技術體系

spring mvc HandlerMethodReturnValueHandler

Node.js學習筆記之一：入門

Node.js學習筆記之三：事件

Callable 和Future 接口使用

Node.js學習筆記之三：事件_EventEmitter

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結