十、MapReduce綜合實戰
綜合實戰:環境大數據
案例目的
1.學會分析環境數據文件;
2.學會編寫解析環境數據文件並進行統計的代碼;
3.學會進行遞歸(鏈式)MapReduce,即把前一個MapReduce作業的輸出作爲後一個作業的輸入。
案例要求
要求實驗結束時,每位學生均已在master服務器上運行程序,從北京2016年1月到6月這半年間的歷史天氣和空氣質量數據文件中分析出環境統計結果,包含月平均氣溫、空氣質量分佈情況等。
實現原理
近年來,由於霧霾問題的持續發酵,越來越多的人開始關注城市相關的環境數據,包括空氣質量數據、天氣數據等等。
如果每小時記錄一次城市的天氣實況和空氣質量實況信息,則每個城市每天都會產生24條環境數據,全國所有2500多個城市如果均如此進行記錄,那每天產生的數據量將達到6萬多條,每年則會產生2190萬條記錄,已經可以稱得上環境大數據。
對於這些原始監測數據,我們可以根據時間的維度來進行統計,從而得出與該城市相關的日度及月度平均氣溫、空氣質量優良及污染天數等等,從而爲研究空氣污染物擴散條件提供有力的數據支持。
本實驗中選取了北京2016年1月到6月這半年間的每小時天氣和空氣質量數據(未取到數據的字段填充“N/A”),利用MapReduce來統計月度平均氣溫和半年內空氣質量爲優、良、輕度污染、中度污染、重度污染和嚴重污染的天數。
實驗數據如下
第一題:編寫月平均氣溫統計程序
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class TmpStat
{
public static class StatMapper extends Mapper<Object, Text, Text, IntWritable>
{
private IntWritable intValue = new IntWritable();
private Text dateKey = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException
{
String[] items = value.toString().split(",");
String date = items[0];
String tmp = items[5];
if(!"DATE".equals(date) && !"N/A".equals(tmp))
{//排除第一行說明以及未取到數據的行
dateKey.set(date.substring(0, 6));
intValue.set(Integer.parseInt(tmp));
context.write(dateKey, intValue);
}
}
}
public static class StatReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException
{
int tmp_sum = 0;
int count = 0;
for(IntWritable val : values)
{
tmp_sum += val.get();
count++;
}
int tmp_avg = tmp_sum/count;
result.set(tmp_avg);
context.write(key, result);
}
}
public static void main(String args[])
throws IOException, ClassNotFoundException, InterruptedException
{
Configuration conf = new Configuration();
Job job = new Job(conf, "MonthlyAvgTmpStat");
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, args[0]);
job.setJarByClass(TmpStat.class);
job.setMapperClass(StatMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setPartitionerClass(HashPartitioner.class);
job.setReducerClass(StatReducer.class);
job.setNumReduceTasks(Integer.parseInt(args[2]));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
運行結果:
第二題:編寫每日空氣質量統計程序
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class AqiStatDaily
{
public static class StatMapper extends Mapper<Object, Text, Text, IntWritable>
{
private IntWritable intValue = new IntWritable();
private Text dateKey = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException
{
String[] items = value.toString().split(",");
String date = items[0];
String aqi = items[6];
if(!"DATE".equals(date) && !"N/A".equals(aqi))
{
dateKey.set(date);
intValue.set(Integer.parseInt(aqi));
context.write(dateKey, intValue);
}
}
}
public static class StatReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException
{
int aqi_sum = 0;
int count = 0;
for(IntWritable val : values)
{
aqi_sum += val.get();
count++;
}
int aqi_avg = aqi_sum/count;
result.set(aqi_avg);
context.write(key, result);
}
}
public static void main(String args[])
throws IOException, ClassNotFoundException, InterruptedException
{
Configuration conf = new Configuration();
Job job = new Job(conf, "AqiStatDaily");
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, args[0]);
job.setJarByClass(AqiStatDaily.class);
job.setMapperClass(StatMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setPartitionerClass(HashPartitioner.class);
job.setReducerClass(StatReducer.class);
job.setNumReduceTasks(Integer.parseInt(args[2]));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
運行結果:
第三題: 編寫各空氣質量天數統計程序
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
public class AqiStat
{
public static final String GOOD = "優";
public static final String MODERATE = "良";
public static final String LIGHTLY_POLLUTED = "輕度污染";
public static final String MODERATELY_POLLUTED = "中度污染";
public static final String HEAVILY_POLLUTED = "重度污染";
public static final String SEVERELY_POLLUTED = "嚴重污染";
public static class StatMapper extends Mapper<Object, Text, Text, IntWritable>
{
private final static IntWritable one = new IntWritable(1);
private Text cond = new Text();
// map方法,根據AQI值,將對應空氣質量的天數加1
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException
{
String[] items = value.toString().split("\t");
int aqi = Integer.parseInt(items[1]);
if(aqi <= 50)
{
// 優
cond.set(GOOD);
}
else if(aqi <= 100)
{
// 良
cond.set(MODERATE);
}
else if(aqi <= 150)
{
// 輕度污染
cond.set(LIGHTLY_POLLUTED);
}
else if(aqi <= 200)
{
// 中度污染
cond.set(MODERATELY_POLLUTED);
}
else if(aqi <= 300)
{
// 重度污染
cond.set(HEAVILY_POLLUTED);
}
else
{
// 嚴重污染
cond.set(SEVERELY_POLLUTED);
}
context.write(cond, one);
}
}
// 定義reduce類,對相同的空氣質量狀況,把它們<K,VList>中VList值全部相加
public static class StatReducer extends Reducer<Text, IntWritable, Text, IntWritable>
{
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException
{
int sum = 0;
for (IntWritable val : values)
{
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String args[])
throws IOException, ClassNotFoundException, InterruptedException
{
Configuration conf = new Configuration();
Job job = new Job(conf, "AqiStat");
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, args[0]);
job.setJarByClass(AqiStat.class);
job.setMapperClass(StatMapper.class);
job.setCombinerClass(StatReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setPartitionerClass(HashPartitioner.class);
job.setReducerClass(StatReducer.class);
job.setNumReduceTasks(Integer.parseInt(args[2]));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
運行結果:
有什麼不懂的,多多問問博主呦!!!