Required output format: employee number, employee name, department name, department number
1. Raw data
Employee data:
empno ename job mgr hiredate sal comm deptno loc
7499 allen salesman 7698 1981-02-20 1600 300 30
7782 clark managers 7639 1981-06-09 2450 10
7654 martin salesman 7698 1981-03-20 1250 1400 30 boston
7900 james clerk 7698 1981-01-09 950 30
7788 scott analyst 7566 1981-09-01 3000 100 20
Department data:
deptno dname loc
30 sales chicago
20 research dallas
10 accounting newyork
2. How the join is handled:
Use the join key as the map output key, which is also the reduce input key. After the shuffle, every record that shares the same join key ends up in the key/value list of the same reduce call.
Design one common bean for the two tables being joined and give it a flag field, so the flag tells which table a record came from.
In the reduce phase the flag makes it easy to decide whether a record is employee data or department data; the join itself is performed in the reducer, as illustrated below.
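For example, with the sample data above the reduce call for join key 30 receives something like (the order of the values is not guaranteed):
key = 30
values = [ (empNo=7499, empName=allen, flag=0), (empNo=7654, empName=martin, flag=0), (empNo=7900, empName=james, flag=0), (deptNo=30, deptName=sales, flag=1) ]
The reducer keeps the single flag=1 record as the department, copies its deptName onto the three employee records, and emits the joined rows.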
3. The intermediate bean
The bean that holds one record. The data has to be serialized because it travels across the network, and Hadoop groups and sorts it during processing, so the class implements the WritableComparable interface (strictly, Writable would be enough here because the bean is only used as a map output value; WritableComparable additionally allows it to be used as a key):
package cn.edu.bjut.joinone;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class Emplyee implements WritableComparable<Emplyee> {

    private String empNo = "";
    private String empName = "";
    private String deptNo = "";
    private String deptName = "";
    private int flag = 0; // 1 for a department record, 0 for an employee record

    public Emplyee() {}

    public Emplyee(String empNo, String empName, String deptNo,
            String deptName, int flag) {
        super();
        this.empNo = empNo;
        this.empName = empName;
        this.deptNo = deptNo;
        this.deptName = deptName;
        this.flag = flag;
    }

    // Copy constructor: the reducer needs real copies because Hadoop reuses
    // the value object it hands to reduce().
    public Emplyee(Emplyee e) {
        this.empNo = e.getEmpNo();
        this.empName = e.getEmpName();
        this.deptNo = e.getDeptNo();
        this.deptName = e.getDeptName();
        this.flag = e.getFlag();
    }

    // Serialize the fields in a fixed order.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(getEmpNo());
        out.writeUTF(getEmpName());
        out.writeUTF(getDeptNo());
        out.writeUTF(getDeptName());
        out.writeInt(getFlag());
    }

    // Deserialize in exactly the same order as write().
    @Override
    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readUTF();
        this.empName = in.readUTF();
        this.deptNo = in.readUTF();
        this.deptName = in.readUTF();
        this.flag = in.readInt();
    }

    // The bean is only used as a map output value in this job, so no real
    // ordering is needed.
    @Override
    public int compareTo(Emplyee o) {
        return 0;
    }

    @Override
    public String toString() {
        return "empNo=" + empNo + ", empName=" + empName + ", deptNo="
                + deptNo + ", deptName=" + deptName;
    }

    public String getEmpNo() {
        return empNo;
    }
    public void setEmpNo(String empNo) {
        this.empNo = empNo;
    }
    public String getEmpName() {
        return empName;
    }
    public void setEmpName(String empName) {
        this.empName = empName;
    }
    public String getDeptNo() {
        return deptNo;
    }
    public void setDeptNo(String deptNo) {
        this.deptNo = deptNo;
    }
    public String getDeptName() {
        return deptName;
    }
    public void setDeptName(String deptName) {
        this.deptName = deptName;
    }
    public int getFlag() {
        return flag;
    }
    public void setFlag(int flag) {
        this.flag = flag;
    }
}
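Because Hadoop calls write() on the map side and readFields() on the reduce side, the two methods must stay perfectly symmetric. A minimal sketch for checking that outside of Hadoop (the EmplyeeRoundTrip class is made up for this check and is not part of the original job) serializes one bean into a byte array and reads it back:
package cn.edu.bjut.joinone;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class EmplyeeRoundTrip {
    public static void main(String[] args) throws IOException {
        Emplyee before = new Emplyee("7499", "allen", "30", "", 0);
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bos));   // what Hadoop does when the map output is serialized
        Emplyee after = new Emplyee();
        after.readFields(new DataInputStream(
                new ByteArrayInputStream(bos.toByteArray()))); // what Hadoop does on the reduce side
        System.out.println(after); // prints: empNo=7499, empName=allen, deptNo=30, deptName=
    }
}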
4. The Mapper:
package cn.edu.bjut.joinone;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class JoinMapper extends Mapper<LongWritable, Text, LongWritable, Emplyee> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t"); // fields are tab separated
        if (arr.length <= 3) {
            // A department record has only three columns: deptno, dname, loc.
            Emplyee e = new Emplyee();
            e.setDeptNo(arr[0]);
            e.setDeptName(arr[1]);
            e.setFlag(1);
            // The join key (deptno) becomes the map output key.
            context.write(new LongWritable(Long.parseLong(e.getDeptNo())), e);
        } else {
            // An employee record: empno, ename, job, mgr, hiredate, sal, comm, deptno, loc.
            // deptno is the 8th column, so an empty comm must still leave an (empty) field.
            Emplyee e = new Emplyee();
            e.setEmpNo(arr[0]);
            e.setEmpName(arr[1]);
            e.setDeptNo(arr[7]);
            e.setFlag(0);
            context.write(new LongWritable(Long.parseLong(e.getDeptNo())), e);
        }
    }
}
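The mapper above tells the two tables apart by the number of columns in a line. An alternative, shown here only as a sketch under the assumption that the employee data sits in a file whose name starts with "emp" and the department data in one starting with "dept" (the original post does not say how the files are named), is to look at which input file the split came from:
package cn.edu.bjut.joinone;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class JoinMapperByFile extends Mapper<LongWritable, Text, LongWritable, Emplyee> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Decide the record type from the source file name instead of the column count.
        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        String[] arr = value.toString().split("\t");
        Emplyee e = new Emplyee();
        if (fileName.startsWith("dept")) {
            e.setDeptNo(arr[0]);
            e.setDeptName(arr[1]);
            e.setFlag(1);
        } else {
            e.setEmpNo(arr[0]);
            e.setEmpName(arr[1]);
            e.setDeptNo(arr[7]);
            e.setFlag(0);
        }
        context.write(new LongWritable(Long.parseLong(e.getDeptNo())), e);
    }
}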
5. The Reducer:
package cn.edu.bjut.joinone;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class JoinReducer extends Reducer<LongWritable, Emplyee, Text, NullWritable> {

    @Override
    protected void reduce(LongWritable key, Iterable<Emplyee> values, Context context)
            throws IOException, InterruptedException {
        Emplyee e = null;                              // the department record for this deptno
        List<Emplyee> list = new ArrayList<Emplyee>(); // all employee records for this deptno
        for (Emplyee emplyee : values) {
            if (0 == emplyee.getFlag()) {
                // Copy the record: Hadoop reuses the object behind the values iterator.
                list.add(new Emplyee(emplyee));
            } else {
                e = new Emplyee(emplyee);
            }
        }
        // Emit employees only when their department was found (an inner join).
        if (null != e) {
            for (Emplyee emplyee : list) {
                emplyee.setDeptName(e.getDeptName());
                context.write(new Text(emplyee.toString()), NullWritable.get());
            }
        }
    }
}
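Note that this reducer buffers every employee record of one department in memory before anything is written out. That is fine for a sample of this size, but for departments with very many employees a map-side join or a secondary-sort design would be the usual way to avoid the buffering.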
6. The driver:
package cn.edu.bjut.joinone;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "join"); // on Hadoop 2.x, Job.getInstance(conf, "join") is preferred
        job.setJarByClass(MainJob.class);

        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Emplyee.class);

        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Delete the output directory if it already exists, otherwise the job fails to start.
        Path outPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1); // propagate the job status as the exit code
    }
}
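Packaged into a jar, the job is run in the usual way; the jar name and the HDFS paths below are only placeholders:
hadoop jar join.jar cn.edu.bjut.joinone.MainJob /input/join /output/join
On the sample data above the output should contain the following five joined rows (the order of employees inside the same department is not guaranteed):
empNo=7782, empName=clark, deptNo=10, deptName=accounting
empNo=7788, empName=scott, deptNo=20, deptName=research
empNo=7499, empName=allen, deptNo=30, deptName=sales
empNo=7654, empName=martin, deptNo=30, deptName=sales
empNo=7900, empName=james, deptNo=30, deptName=sales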