MapReduce Case Study: ReduceJoin
Requirements
There are two input files, as shown below.

- order.txt holds the users' order records:
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004

- user.txt holds the basic user information:
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo

- The expected output is each order joined with its user:
order001,u001,senge,18,angelababy
order002,u001,senge,18,angelababy
order003,u005,nana,14,huangbo
……
Analysis
- Determine the map input types: <LongWritable, Text>
  Based on the input files order.txt and user.txt, the map input <key, value> pair is <offset of the line, content of the line>, for example:
  <0,"order001,u001">
- Determine the reduce output types: <Text, Text>
  Based on the expected output, the reduce output <key, value> pair is:
  <"order001,u001,senge,18,angelababy","">
- Determine the map output types: <Text, Text>
  - The records must be joined on the userId field, so userId is the join key. An MR job joins and groups records by the map output key, so the map output key is userId.
  - There are two input splits, and the reducer needs to know which file each value came from. We can prefix the map output value with "1:" for order.txt records and with "2:" for user.txt records (see the worked example after this list):
    <"u001","1:order001">
    <"u001","2:senge,18,angelababy">
- Determine the map logic
  - Read each line, work out which file it came from via the input split, and emit <userId, tagged value> as described above.
- Determine the reduce logic
  - Use orderList and userList to collect, for a given userId, the order records and the user record respectively.
  - Iterate over orderList and userList in nested loops and emit the final joined records.
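As a worked illustration (using the sample data above; within a group, the order in which values reach the reducer is not guaranteed), the reduce call for key u001 receives the tagged values from both files and emits the joined records:

reduce input:  <"u001", ["1:order001", "1:order002", "2:senge,18,angelababy"]>
orderList = [order001, order002], userList = [senge,18,angelababy]
reduce output: <"order001,u001,senge,18,angelababy","">
               <"order002,u001,senge,18,angelababy","">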
Implementation
Import the dependencies
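A minimal sketch of the Maven dependency the code below relies on, assuming a Maven project and Hadoop 2.9.2 (matching the hadoop.home.dir used for local runs); adjust the artifact and version to your environment:

<!-- pom.xml: Hadoop client libraries (assumed version 2.9.2) -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.9.2</version>
</dependency>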
Code
package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReduceJoin2 {

    // For local runs, point hadoop.home.dir at the local Hadoop installation
    static {
        System.setProperty("hadoop.home.dir", "D:\\software\\hadoop-2.9.2");
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // If the first line were a header row (orderid,userid column names), it would need to be skipped:
            //if (key.get() == 0) return;
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            String line = value.toString();
            String[] lineArr = line.split(",");
            if ("order.txt".equals(fileName)) {
                // order.txt (order table), format: orderid,userid
                context.write(new Text(lineArr[1]), new Text("1:" + lineArr[0]));
            } else {
                // user.txt (user table), format: userid,nickname,age,username
                context.write(new Text(lineArr[0]), new Text("2:" + lineArr[1] + "," + lineArr[2] + "," + lineArr[3]));
            }
        }
    }

    public static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> orderList = new ArrayList<>();
            List<String> userList = new ArrayList<>();
            // 1. For this userId, split the grouped values into order records and user records
            for (Text value : values) {
                String valueStr = value.toString();
                String[] valueArr = valueStr.split(":");
                if ("1".equals(valueArr[0])) {
                    orderList.add(valueArr[1]);
                } else {
                    userList.add(valueStr.substring(2));
                }
            }
            // 2. Nested loop over the order list and the user list to emit the joined records
            for (String order : orderList) {
                for (String user : userList) {
                    context.write(new Text(order + "," + key.toString() + "," + user), new Text(""));
                }
            }
        }
    }

    /**
     * Driver method
     * @param args args[0]: comma-separated input paths, args[1]: output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 0. Initialize a job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "reduce_join2");
        job.setJarByClass(ReduceJoin2.class);
        // 1. Input files (comma-separated paths)
        FileInputFormat.addInputPaths(job, args[0]);
        // 2. Map phase
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // 3. Shuffle (handled internally by the framework)
        // 4. Reduce phase
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // 5. Output directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 6. Submit the job and wait for completion; exit with 0 on success, 1 on failure
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Run
Local run
Specify the program arguments, for example: D://data/order.txt,D://data/user.txt D://output
Cluster run
- Build the jar
- Copy the jar to the Linux machine
- Put the input files order.txt and user.txt into the /test directory on HDFS
- Run the jar:
bin/yarn jar hadoop_demo-1.0-SNAPSHOT.jar mapreduce.ReduceJoin2 /test/order.txt,/test/user.txt /test/output
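For reference, with the sample data above the job's output file (e.g. part-r-00000) should contain the following joined lines; the reducer processes keys in sorted userId order, though the relative order of one user's multiple orders is not guaranteed:

order001,u001,senge,18,angelababy
order002,u001,senge,18,angelababy
order004,u002,laozhao,48,ruhua
order005,u003,xiaoxu,16,chunge
order006,u004,laoyang,28,zengge
order003,u005,nana,14,huangbo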