Hadoop Example 8, Join Part 1: Getting Each Employee's Department Information

Required output format: employee number, employee name, department name, department number
1. Raw data
Employee data

empno   ename   job         mgr hiredate    sal comm    deptno  loc
7499    allen   salesman    7698    1981-02-20  1600    300 30  
7782    clark   managers    7639    1981-06-09  2450        10  
7654    martin  salesman    7698    1981-03-20  1250    1400    30  boston
7900    james   clerk   7698    1981-01-09  950     30  
7788    scott   analyst 7566    1981-09-01  3000    100 20  

Department data

deptno  dname   loc
30  sales   chicago
20  research    dallas
10  accounting  newyork

2. The idea behind the join:
        Use the join key as the map output key, which is also the reduce input key. Records that share the same join key then end up in the same reduce key/value list after the shuffle.
        Design one common bean for the two tables being joined, and add a flag field to it so the reducer can tell which table each record came from.
        In the reduce phase the flag makes it easy to tell employee records from department records; the actual join is performed in reduce (see the example below).
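
For instance, with the sample data above, the reduce call for key 30 receives, after the shuffle, something along these lines (the order of values within a key is not guaranteed):

    key    = 30
    values = (deptNo=30, deptName=sales,  flag=1),
             (empNo=7499, empName=allen,  flag=0),
             (empNo=7654, empName=martin, flag=0),
             (empNo=7900, empName=james,  flag=0)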

3. The intermediate bean
        The bean that holds a record from either table (the data must be serializable because it travels across the network, and Hadoop needs to group and sort it during processing, so it implements the WritableComparable interface):

package cn.edu.bjut.joinone;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class Emplyee implements WritableComparable<Emplyee> {

    private String empNo = "";
    private String empName = "";
    private String deptNo = "";
    private String deptName = "";
    private int flag = 0; // 1 = department record, 0 = employee record

    public Emplyee() {}

    public Emplyee(String empNo, String empName, String deptNo,
            String deptName, int flag) {
        super();
        this.empNo = empNo;
        this.empName = empName;
        this.deptNo = deptNo;
        this.deptName = deptName;
        this.flag = flag;
    }

    public Emplyee(Emplyee e) {
        this.empNo = e.getEmpNo();
        this.empName = e.getEmpName();
        this.deptNo = e.getDeptNo();
        this.deptName = e.getDeptName();
        this.flag = e.getFlag();
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(getEmpNo());
        out.writeUTF(getEmpName());
        out.writeUTF(getDeptNo());
        out.writeUTF(getDeptName());
        out.writeInt(getFlag());

    }

    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readUTF();
        this.empName = in.readUTF();
        this.deptNo = in.readUTF();
        this.deptName = in.readUTF();
        this.flag = in.readInt();
    }

    // Emplyee is only used as a map output value here (the key is the deptno
    // as a LongWritable), so no meaningful ordering is needed.
    public int compareTo(Emplyee o) {
        return 0;
    }

    @Override
    public String toString() {
        return "empNo=" + empNo + ", empName=" + empName + ", deptNo="
                + deptNo + ", deptName=" + deptName;
    }

    public String getEmpNo() {
        return empNo;
    }

    public void setEmpNo(String empNo) {
        this.empNo = empNo;
    }

    public String getEmpName() {
        return empName;
    }

    public void setEmpName(String empName) {
        this.empName = empName;
    }

    public String getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(String deptNo) {
        this.deptNo = deptNo;
    }

    public String getDeptName() {
        return deptName;
    }

    public void setDeptName(String deptName) {
        this.deptName = deptName;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

}
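
A small round-trip check of my own (not part of the original post) can confirm that write() and readFields() mirror each other; the class name EmplyeeRoundTripTest and the sample values are hypothetical:

package cn.edu.bjut.joinone;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class EmplyeeRoundTripTest {

    public static void main(String[] args) throws Exception {
        // a sample employee record (flag 0 = employee data)
        Emplyee original = new Emplyee("7499", "allen", "30", "", 0);

        // write() -> bytes, as Hadoop does when shuffling the value
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // bytes -> readFields() into a fresh instance
        Emplyee copy = new Emplyee();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);            // empNo=7499, empName=allen, deptNo=30, deptName=
        System.out.println(copy.getFlag());  // 0
    }
}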

4. The Mapper:

package cn.edu.bjut.joinone;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class JoinMapper extends Mapper<LongWritable, Text, LongWritable, Emplyee> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");

        // a department line has only 3 tab-separated fields; an employee line has more
        if(arr.length <= 3) {
            Emplyee e = new Emplyee();
            e.setDeptNo(arr[0]);
            e.setDeptName(arr[1]);
            e.setFlag(1);

            context.write(new LongWritable(Long.parseLong(e.getDeptNo())), e);
        } else {
            Emplyee e = new Emplyee();
            e.setEmpNo(arr[0]);
            e.setEmpName(arr[1]);
            e.setDeptNo(arr[7]);
            e.setFlag(0);

            context.write(new LongWritable(Long.parseLong(e.getDeptNo())), e);
        }
    }

}
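
Note that the mapper relies purely on the column count: a department line has 3 tab-separated fields, while an employee line has more. If the input files still contain the header rows shown in section 1, Long.parseLong would throw a NumberFormatException. A defensive variant (my own suggestion, not in the original code) could skip such lines at the top of map():

        String line = value.toString();
        String[] arr = line.split("\t");
        // skip header rows or malformed lines whose first field is not numeric
        if (arr.length < 2 || !arr[0].matches("\\d+")) {
            return;
        }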

5. The Reducer:

package cn.edu.bjut.joinone;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinReducer extends Reducer<LongWritable, Emplyee, Text, NullWritable> {

    @Override
    protected void reduce(LongWritable key, Iterable<Emplyee> values, Context context)
            throws IOException, InterruptedException {

        Emplyee e = null;                               // the department record for this deptno
        List<Emplyee> list = new ArrayList<Emplyee>();  // buffered employee records

        for(Emplyee emplyee : values) {
            if(0 == emplyee.getFlag()) {
                // Hadoop reuses the value object while iterating, so store a copy
                list.add(new Emplyee(emplyee));
            } else {
                e = new Emplyee(emplyee);
            }
        }

        if(null != e) {
            for(Emplyee emplyee : list) {
                emplyee.setDeptName(e.getDeptName());
                context.write(new Text(emplyee.toString()), NullWritable.get());
            }
        }
    }

}

6. The driver (main) program:

package cn.edu.bjut.joinone;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "join"); // on Hadoop 2.x+, Job.getInstance(conf, "join") is preferred
        job.setJarByClass(MainJob.class);

        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Emplyee.class);

        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        job.waitForCompletion(true);
    }

}
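
Assuming the two data files are uploaded to HDFS without the header rows shown in section 1, the job should produce output roughly like the following (keys are processed in ascending deptno order; the order of employees within the same department is not guaranteed):

empNo=7782, empName=clark, deptNo=10, deptName=accounting
empNo=7788, empName=scott, deptNo=20, deptName=research
empNo=7499, empName=allen, deptNo=30, deptName=sales
empNo=7654, empName=martin, deptNo=30, deptName=sales
empNo=7900, empName=james, deptNo=30, deptName=sales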