一、Reduce 端
1. 需求
商品表
id | pname | category_id | price |
---|---|---|---|
P0001 | 小米5 | 1000 | 2000 |
P0002 | 錘子T1 | 1000 | 3000 |
訂單數據表
id | date | pid | amount |
---|---|---|---|
1001 | 20150710 | P0001 | 2 |
1002 | 20150710 | P0002 | 3 |
2. 實現步驟
通過將關聯條件(商品id)作爲map輸出的key,使兩表中滿足join條件的數據攜帶其來源文件的信息,發往同一個reduce task,在reduce中完成數據的串聯
2.1 定義 Mapper
public class ReduceJoinMapper extends Mapper<LongWritable,Text,Text,Text> {
    /**
     * Tags every input record with its join key (the product id) so that the
     * matching product row and order rows meet in the same reduce call.
     * Product rows ("product.txt") carry the id in column 0; order rows
     * reference it via the pid field in column 2.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Which file did this record come from? The input split knows its path.
        FileSplit split = (FileSplit) context.getInputSplit();
        boolean isProductRecord = "product.txt".equals(split.getPath().getName());

        String[] fields = value.toString().split(",");
        // Join key position differs per table: id for products, pid for orders.
        String joinKey = isProductRecord ? fields[0] : fields[2];
        context.write(new Text(joinKey), value);
    }
}
2.2 定義 Reducer
public class ReduceJoinReducer extends Reducer<Text,Text,Text,Text> {
    /**
     * Joins the single product record with all of its order records for one
     * product id. Output value is: productLine TAB orderLine1 TAB orderLine2 ...
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String first = "";
        ArrayList<String> orders = new ArrayList<String>();
        for (Text value : values) {
            String line = value.toString();
            // BUG FIX: the sample product ids are upper-case ("P0001"), but the
            // original check was startsWith("p") and never matched them, so the
            // product row was appended as if it were an order. Accept either case.
            if (line.startsWith("P") || line.startsWith("p")) {
                first = line;
            } else {
                orders.add(line);
            }
        }
        // StringBuilder instead of repeated String concatenation in a loop.
        StringBuilder second = new StringBuilder();
        for (String s : orders) {
            second.append(s).append("\t");
        }
        context.write(key, new Text(first + "\t" + second));
    }
}
2.3 定義主類
public class JobMain extends Configured implements Tool {
    /**
     * Configures and submits the reduce-side join job.
     * Input/output paths keep the original local defaults but may now be
     * overridden on the command line: args[0] = input dir, args[1] = output dir.
     *
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "reduce_join");
        // FIX: required on a cluster so the framework can locate the job jar;
        // without it tasks fail with ClassNotFoundException.
        job.setJarByClass(JobMain.class);

        // Backward-compatible generalization: optional CLI overrides for paths.
        String inputPath = strings.length > 0 ? strings[0] : "d:\\mapreduce\\reduce_join_in";
        String outputPath = strings.length > 1 ? strings[1] : "d:\\mapreduce\\reduce_join_out";

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(inputPath));

        job.setMapperClass(ReduceJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(outputPath));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
二、Map 端
1. 概述
適用於關聯表中有小表的情形.
使用分佈式緩存,可以將小表分發到所有的map節點,這樣,map節點就可以在本地對自己所讀到的大表數據進行join並輸出最終結果,可以大大提高join操作的併發度,加快處理速度
2. 實現步驟
先在mapper類中預先定義好小表,進行join
引入實際場景中的解決方案:一次性從數據庫加載小表數據,或者使用分佈式緩存(DistributedCache)將小表文件分發到各個map節點
2.1 定義 Mapper
public class MapJoinMapper extends Mapper<LongWritable,Text,Text,Text> {
    // In-memory copy of the small (product) table: product id -> full product line.
    private HashMap<String,String> map = new HashMap<String, String>();

    /**
     * Runs once per task: reads the small table from the distributed cache
     * into the in-memory map so each map() call can join locally.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Locate the cached small-table file and open it on its file system.
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], new Configuration());

        // FIX: try-with-resources closes the stream even if reading throws;
        // the explicit UTF-8 charset avoids platform-default decoding surprises.
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(fileSystem.open(new Path(cacheFiles[0])), "UTF-8"))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // Key on column 0 (product id); keep the whole line as the value.
                map.put(line.split(",")[0], line);
            }
        }
        // NOTE: fileSystem.close() was removed — FileSystem.get() returns a cached,
        // shared instance, and closing it can break other users of the same instance.
    }

    /**
     * Joins each big-table (order) record with its product record from the map.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String productId = value.toString().split(",")[2]; // K2: join key (pid column)
        String productLine = map.get(productId);
        // BUG FIX: skip orders whose product id is absent from the small table
        // instead of emitting the literal string "null" in the joined output.
        if (productLine == null) {
            return;
        }
        context.write(new Text(productId), new Text(productLine + "\t" + value.toString())); // V2
    }
}
2.2 定義主類
public class JobMain extends Configured implements Tool {
    /**
     * Configures and submits the map-side join job. The small table is shipped
     * to every map task via the distributed cache; no reduce phase is needed.
     *
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "map_join");
        // FIX: required on a cluster so the framework can locate the job jar.
        job.setJarByClass(JobMain.class);

        // Distribute the small table to every map task via the distributed cache.
        job.addCacheFile(new URI("hdfs://node01:8020/cache_file/product.txt"));

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("d:\\mapreduce\\map_join_in"));

        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // FIX: this is a map-only join — without this line a default (identity)
        // reduce phase still runs, wasting a shuffle/sort for no benefit.
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("d:\\mapreduce\\map_join_out"));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}