import java.io.*;
import org.apache.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
/**
 * Command-line driver that configures and submits the "Max temperature" job.
 *
 * <p>Expects exactly two arguments: the input path and the output path. The job is
 * wired to use {@link MaxTemperatureMapper} and {@link MaxTemperatureReducer}, and
 * emits {@link Text} keys with {@link IntWritable} values.
 */
public class MaxTemperature {

    /**
     * Builds the job configuration from the command-line arguments and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException if submitting or running the job fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Usage:MaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        // JobConf holds the job specification and controls how the whole job is run.
        JobConf conf = new JobConf(MaxTemperature.class);
        conf.setJobName("Max temperature");

        // The input path may be a single file or a directory; addInputPath can be
        // called several times to read from multiple paths.
        FileInputFormat.addInputPath(conf, new Path(args[0]));

        // The output path must not exist before the job starts; otherwise Hadoop
        // refuses to run the job.
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setMapperClass(MaxTemperatureMapper.class);
        conf.setReducerClass(MaxTemperatureReducer.class);

        // Key and value types of the job output.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        JobClient.runJob(conf);
    }
}
import java.io.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
/**
 * Mapper that turns one fixed-width text record into a (year, air temperature) pair.
 *
 * <p>Input key: {@link LongWritable} (unused by this mapper); input value: {@link Text}
 * holding one line of text. Output key: {@link Text} year; output value:
 * {@link IntWritable} temperature.
 */
public class MaxTemperatureMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    /** Temperature value that marks a missing reading; such records are not emitted. */
    private static final int MISSING = 9999;

    /** Shortest line that still contains every field read below (last index used is 92). */
    private static final int MIN_RECORD_LENGTH = 93;

    /** Quality codes whose readings are accepted. */
    private static final String ACCEPTED_QUALITY_CODES = "01459";

    /**
     * Parses the year, temperature and quality code out of {@code value} and emits
     * (year, temperature) when the reading is present and its quality code is accepted.
     *
     * <p>Lines too short to contain all fields are skipped instead of failing the task
     * with a {@link StringIndexOutOfBoundsException}; each skipped line is counted in
     * the "MaxTemperature" / "Malformed records" counter when a reporter is available.
     *
     * @param key      input key; not used
     * @param value    one line of input text
     * @param output   collector receiving the (year, temperature) pair
     * @param reporter used only to count skipped lines
     * @throws IOException if the collector fails to accept the pair
     */
    public void map(LongWritable key, Text value,
            OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        String line = value.toString(); // convert Text to String

        if (line.length() < MIN_RECORD_LENGTH) {
            if (reporter != null) {
                reporter.incrCounter("MaxTemperature", "Malformed records", 1);
            }
            return;
        }

        String year = line.substring(15, 19); // extract the year

        // Skip an explicit leading '+' so that only the digits are parsed; any other
        // character at that position (e.g. '-') is parsed as part of the number.
        int airTemperature;
        if (line.charAt(87) == '+') {
            airTemperature = Integer.parseInt(line.substring(88, 92));
        } else {
            airTemperature = Integer.parseInt(line.substring(87, 92));
        }

        // Single-character membership test; equivalent to matching "[01459]" on a
        // one-character string without recompiling a regex for every record.
        char quality = line.charAt(92);
        if (airTemperature != MISSING && ACCEPTED_QUALITY_CODES.indexOf(quality) >= 0) {
            output.collect(new Text(year), new IntWritable(airTemperature));
        }
    }
}
Mapper接口是一個泛型類型,需要指定4個類型參數,分別爲Map函數的輸入鍵、輸入值、輸出鍵和輸出值的類型。此例中輸入鍵爲LongWritable(長整型偏移量),輸入值爲Text(一行文本),輸出鍵爲Text(年份),輸出值爲IntWritable(氣溫)。
Hadoop自身提供一套可優化網絡序列化傳輸的基本類型,而不直接使用Java的基本類型,這些類型在org.apache.hadoop.io包中可以找到。
import java.io.*;
import java.util.Iterator;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
/**
 * Reducer that keeps the largest temperature seen for each key.
 *
 * <p>Input and output keys are {@link Text}; input and output values are
 * {@link IntWritable}.
 */
public class MaxTemperatureReducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Scans every value supplied for {@code key} and emits the key paired with the
     * largest one. If the iterator yields nothing, {@link Integer#MIN_VALUE} is emitted.
     *
     * @param key      key shared by all values in this call
     * @param values   values to scan
     * @param output   collector receiving the (key, maximum) pair
     * @param reporter not used
     * @throws IOException if the collector fails to accept the pair
     */
    public void reduce(Text key, Iterator<IntWritable> values,
            OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        int highest = Integer.MIN_VALUE;
        for (Iterator<IntWritable> remaining = values; remaining.hasNext(); ) {
            int candidate = remaining.next().get();
            if (candidate > highest) {
                highest = candidate;
            }
        }
        IntWritable result = new IntWritable(highest);
        output.collect(key, result);
    }
}
Reducer接口同樣也是泛型類型,需要四個類型參數,分別指定Reduce函數的輸入鍵、輸入值、輸出鍵以及輸出值的類型。Reduce函數的輸入鍵、值類型必須與Map函數的輸出鍵、值類型匹配。