Hadoop Example: Sort

Sort


A code-reading walkthrough of the Hadoop Sort example.

Note: this is a personal study note. Some material is drawn from the web or other sources; references are listed at the end. If a citation is missing, please point it out so it can be added.

Purpose

The example uses the MapReduce framework to sort its input.

Main class

/**
 * This is the trivial map/reduce program that does absolutely nothing
 * other than use the framework to fragment and sort the input values.
 *
 * To run: bin/hadoop jar build/hadoop-examples.jar sort
 *            [-r <i>reduces</i>]
 *            [-inFormat <i>input format class</i>]
 *            [-outFormat <i>output format class</i>]
 *            [-outKey <i>output key class</i>]
 *            [-outValue <i>output value class</i>]
 *            [-totalOrder <i>pcnt</i> <i>num samples</i> <i>max splits</i>]
 *            <i>in-dir</i> <i>out-dir</i>
 */
public class Sort<K,V> extends Configured implements Tool { //1

  public static final String REDUCES_PER_HOST =
    "mapreduce.sort.reducesperhost";
  private Job job = null;

  static int printUsage() {
    System.out.println("sort [-r <reduces>] " +
                       "[-inFormat <input format class>] " +
                       "[-outFormat <output format class>] " +
                       "[-outKey <output key class>] " +
                       "[-outValue <output value class>] " +
                       "[-totalOrder <pcnt> <num samples> <max splits>] " +
                       "<input> <output>");
    ToolRunner.printGenericCommandUsage(System.out); //2
    return 2;
  }

  /**
   * The main driver for sort program.
   * Invoke this method to submit the map/reduce job.
   * @throws IOException When there is communication problems with the
   *                     job tracker.
   */
  public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf); //3
    ClusterStatus cluster = client.getClusterStatus(); //4
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST); //5
    if (sort_reduces != null) {
       num_reduces = cluster.getTaskTrackers() *
                       Integer.parseInt(sort_reduces); //6
    }
    Class<? extends InputFormat> inputFormatClass =
      SequenceFileInputFormat.class; //7
    Class<? extends OutputFormat> outputFormatClass =
      SequenceFileOutputFormat.class; //8
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class; //9
    Class<? extends Writable> outputValueClass = BytesWritable.class; //10
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K,V> sampler = null; //11
    for(int i=0; i < args.length; ++i) {
      try {
        if ("-r".equals(args[i])) {
          num_reduces = Integer.parseInt(args[++i]);
        } else if ("-inFormat".equals(args[i])) {
          inputFormatClass =
            Class.forName(args[++i]).asSubclass(InputFormat.class);
        } else if ("-outFormat".equals(args[i])) {
          outputFormatClass =
            Class.forName(args[++i]).asSubclass(OutputFormat.class);
        } else if ("-outKey".equals(args[i])) {
          outputKeyClass =
            Class.forName(args[++i]).asSubclass(WritableComparable.class);
        } else if ("-outValue".equals(args[i])) {
          outputValueClass =
            Class.forName(args[++i]).asSubclass(Writable.class);
        } else if ("-totalOrder".equals(args[i])) {
          double pcnt = Double.parseDouble(args[++i]);
          int numSamples = Integer.parseInt(args[++i]);
          int maxSplits = Integer.parseInt(args[++i]);
          if (0 >= maxSplits) maxSplits = Integer.MAX_VALUE;
          sampler =
            new InputSampler.RandomSampler<K,V>(pcnt, numSamples, maxSplits); //12
        } else {
          otherArgs.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " +
            args[i-1]);
        return printUsage(); // exits
      }
    }
    // Set user-supplied (possibly default) job configs
    job = Job.getInstance(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces); //13

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
          otherArgs.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
      System.out.println("Sampling input to effect total-order sort...");
      job.setPartitionerClass(TotalOrderPartitioner.class); //14
      Path inputDir = FileInputFormat.getInputPaths(job)[0];
      inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf)); //15
      Path partitionFile = new Path(inputDir, "_sortPartitioning");
      TotalOrderPartitioner.setPartitionFile(conf, partitionFile); //16
      InputSampler.<K,V>writePartitionFile(job, sampler); //17
      URI partitionUri = new URI(partitionFile.toString() +
                                 "#" + "_sortPartitioning");
      DistributedCache.addCacheFile(partitionUri, conf); //18
    }

    System.out.println("Running on " +
        cluster.getTaskTrackers() +
        " nodes to sort from " +
        FileInputFormat.getInputPaths(job)[0] + " into " +
        FileOutputFormat.getOutputPath(job) +
        " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " +
        (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new Sort(), args);
    System.exit(res);
  }

  /**
   * Get the last job that was run using this instance.
   * @return the results of the last job that was run
   */
  public Job getResult() {
    return job;
  }
}

1.      extends Configured implements Tool: the class extends org.apache.hadoop.conf.Configured, which stores a Configuration object as a field, and implements the org.apache.hadoop.util.Tool interface, which declares a single run method. A class that implements Tool can be executed through the ToolRunner helper class.
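A minimal, generic sketch of this Configured/Tool/ToolRunner pattern (the class name MyTool and the empty job body are placeholders, not taken from the example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyTool extends Configured implements Tool {
  @Override
  public int run(String[] args) throws Exception {
    // getConf() returns the Configuration that ToolRunner has already
    // populated with generic options such as -D, -conf, -files, -libjars.
    Configuration conf = getConf();
    // ... build and submit a Job here, return 0 on success ...
    return 0;
  }

  public static void main(String[] args) throws Exception {
    // ToolRunner parses the generic options into the Configuration and then
    // calls run() with the remaining application-specific arguments.
    System.exit(ToolRunner.run(new Configuration(), new MyTool(), args));
  }
}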

2.      ToolRunner.printGenericCommandUsage(System.out): ToolRunner is the helper class for running jobs; when a jar is executed it parses the generic Hadoop command-line options. Here it prints the usage text for those generic options:

 

  public static void printGenericCommandUsage(PrintStream out) {

    out.println("Generic options supported are");
    out.println("-conf <configuration file>     specify an application configuration file");
    out.println("-D <property=value>            use value for given property");
    out.println("-fs <local|namenode:port>      specify a namenode");
    out.println("-jt <local|resourcemanager:port>    specify a ResourceManager");
    out.println("-files <comma separated list of files>    " +
      "specify comma separated files to be copied to the map reduce cluster");
    out.println("-libjars <comma separated list of jars>    " +
      "specify comma separated jar files to include in the classpath.");
    out.println("-archives <comma separated list of archives>    " +
                "specify comma separated archives to be unarchived" +
                " on the compute machines.\n");
    out.println("The general command line syntax is");
    out.println("bin/hadoop command [genericOptions] [commandOptions]\n");
  }

 

3.      JobClient: org.apache.hadoop.mapred.JobClient is the primary interface through which user programs interact with the cluster. It provides methods to submit jobs, track their progress, fetch task reports or logs, and query the cluster's MapReduce status. Here a JobClient instance is created from the Configuration object.

4.      org.apache.hadoop.mapred.ClusterStatus represents the current state of the cluster; cluster.getMaxReduceTasks() returns the maximum number of reduce tasks the cluster supports.
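A small standalone sketch (illustrative only; it simply repeats the query that run() performs above) showing how JobClient and ClusterStatus expose this information:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;

public class ClusterInfo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    // The Sort example keeps ~10% headroom below the cluster-wide maximum.
    int defaultReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    System.out.println("Task trackers:    " + cluster.getTaskTrackers());
    System.out.println("Max reduce tasks: " + cluster.getMaxReduceTasks());
    System.out.println("Default reduces:  " + defaultReduces);
  }
}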

5.      Because the program is launched through ToolRunner, the generic command-line options have already been parsed and merged into the Configuration instance, so user-defined properties can simply be read back with conf.get().

6.      cluster.getTaskTrackers() returns the number of task trackers in the cluster.

7.      org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat<K,V>: the input format for sequence files.

8.      org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat<K,V>: the output format for sequence files.

9.      org.apache.hadoop.io.BytesWritable: Hadoop's Writable wrapper around a byte array, used here as the default output key class.

10.  org.apache.hadoop.io.BytesWritable again, used as the default output value class.
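Since the default input format is SequenceFileInputFormat with BytesWritable keys and values, the input directory must contain sequence files of that shape. Below is a hedged sketch, not part of the original example, that writes such a file; the path /tmp/sort-input/part-0, the record sizes, and the record count are arbitrary choices:

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;

public class WriteSortInput {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path("/tmp/sort-input/part-0");   // arbitrary example path
    Random random = new Random();

    // Key and value classes must match what the sort job expects by default.
    SequenceFile.Writer writer = SequenceFile.createWriter(
        fs, conf, file, BytesWritable.class, BytesWritable.class);
    try {
      byte[] key = new byte[10];
      byte[] value = new byte[100];
      for (int i = 0; i < 1000; i++) {
        random.nextBytes(key);
        random.nextBytes(value);
        writer.append(new BytesWritable(key), new BytesWritable(value));
      }
    } finally {
      writer.close();
    }
  }
}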

11.  InputSampler.Sampler<K,V> sampler: org.apache.hadoop.mapreduce.lib.partition.InputSampler provides samplers that help choose partition boundaries. Partitioning here means splitting the map output among the reducers according to some rule; when the data set is too large to pick a good rule by hand, a sampler can analyze a sample of the input and derive the boundaries.

12.  org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler: a sampler that picks records at random.

13.  job.setNumReduceTasks() sets the number of reduce tasks.

14.  job.setPartitionerClass(TotalOrderPartitioner.class) sets the partitioner class.

org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner routes each map output record to a reducer according to a partition file of boundary keys. The file contains (number of reduces - 1) keys in sorted order; for example, the 4 keys (2, 4, 6, 8) define 5 partitions (keys up to 2, then 2–4, 4–6, 6–8, and above 8), which are dispatched to 5 reducers. Because each reducer's output is itself sorted, this partitioner makes the concatenation of the reducer outputs a total ordering of the entire input, which is exactly the goal of this example.
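A conceptual sketch of that boundary idea (this is not Hadoop's actual TotalOrderPartitioner implementation, which searches the keys read from the partition file, but the routing rule is the same in spirit):

import java.util.Arrays;

public class TotalOrderIdea {
  // With R reducers there are R-1 sorted boundary keys; a record key goes to
  // the partition of the first boundary it is less than or equal to, and keys
  // greater than every boundary go to the last partition.
  static int partitionFor(int key, int[] boundaries) {
    int idx = Arrays.binarySearch(boundaries, key);
    // binarySearch returns (-(insertion point) - 1) when the key is absent
    return idx >= 0 ? idx : -idx - 1;
  }

  public static void main(String[] args) {
    int[] boundaries = {2, 4, 6, 8};              // the 4 keys from note 14
    for (int key : new int[]{1, 2, 3, 5, 7, 9}) {
      System.out.println("key " + key + " -> reduce " + partitionFor(key, boundaries));
    }
    // Prints: key 1 -> reduce 0, key 2 -> reduce 0, key 3 -> reduce 1,
    //         key 5 -> reduce 2, key 7 -> reduce 3, key 9 -> reduce 4
  }
}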

15.  makeQualified returns a fully qualified path; it is used here to build the path that will hold the TotalOrderPartitioner's boundary-key file.

16.  TotalOrderPartitioner.setPartitionFile(conf, partitionFile) registers the partition file.

17.  InputSampler.<K,V>writePartitionFile(job, sampler) samples the input with the sampler and writes the resulting boundary keys to the partition file.

18.  DistributedCache.addCacheFile(partitionUri, conf) adds the file to the distributed cache; Hadoop will ship it to every node that runs a map task for this job. The DistributedCache class is deprecated, and Job.addCacheFile(URI uri) is now the recommended replacement.
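As a follow-up to notes 11–18, here is a hedged sketch of the same total-order setup written against the non-deprecated API, using Job.addCacheFile in place of DistributedCache; the helper's name and the sampler parameters (0.1, 10000, 10) are illustrative assumptions, not values from the example:

import java.net.URI;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
  static void configureTotalOrder(Job job, Path inputDir) throws Exception {
    // RandomSampler(freq, numSamples, maxSplitsSampled): look at each record
    // with probability 0.1, keep at most 10,000 samples, read at most 10 splits.
    InputSampler.Sampler<BytesWritable, BytesWritable> sampler =
        new InputSampler.RandomSampler<BytesWritable, BytesWritable>(0.1, 10000, 10);

    // Store the R-1 boundary keys next to the input, as the Sort example does.
    Path partitionFile = new Path(inputDir, "_sortPartitioning");
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

    // Sample the input and write the boundary keys to the partition file.
    InputSampler.<BytesWritable, BytesWritable>writePartitionFile(job, sampler);

    // Ship the partition file to every map task; Job.addCacheFile replaces the
    // deprecated DistributedCache.addCacheFile call used above.
    job.addCacheFile(new URI(partitionFile.toString() + "#_sortPartitioning"));
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}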

Summary

This example mainly introduces partitioning (with an input sampler) and sorting within the Hadoop processing pipeline: by plugging TotalOrderPartitioner into the map-reduce flow, the MapReduce framework itself is used to produce a total ordering of the input.

MapReduce flow diagram:

Figure 1: MapReduce processing flow

 

References:

1.      The MapReduce processing flow diagram is taken from:

http://blog.oddfoo.net/2011/04/17/mapreduce-partition%E5%88%86%E6%9E%90-2/
