When a Hadoop job has to read input files of different formats (or files that need different parsing logic), the MultipleInputs class lets you bind a separate input path, InputFormat, and Mapper to each source.
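The core call binds one path to one mapper; a minimal sketch, where the path and mapper names are placeholders (the real ones appear in the driver below):

MultipleInputs.addInputPath(job, new Path("/some/input"), TextInputFormat.class, SomeMapper.class);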
1. Requirement:
Suppose we have two datasets:
phone:
123,good number
124,common number
125,bad number
user:
zhangsan,123
lisi,124
wangwu,125
We want to join user and phone on the phone number, producing:
zhangsan,123,good number
lisi,124,common number
wangwu,125,bad number
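For intuition, this is the same join done in plain Java with an in-memory map (a minimal standalone sketch; it only works when one side fits in memory, which is the limitation the MapReduce version avoids):

import java.util.HashMap;
import java.util.Map;

public class InMemoryJoin {
    public static void main(String[] args) {
        // phone number -> description
        Map<String, String> phone = new HashMap<>();
        phone.put("123", "good number");
        phone.put("124", "common number");
        phone.put("125", "bad number");

        String[][] users = {{"zhangsan", "123"}, {"lisi", "124"}, {"wangwu", "125"}};
        for (String[] u : users) {
            // Join on the phone number: name, number, description.
            System.out.println(u[0] + "," + u[1] + "," + phone.get(u[1]));
        }
    }
}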
2. Custom value type:
Both mappers emit a FlagDataType value that wraps the payload string together with a flag recording which file it came from (0 = phone, 1 = user), so the reducer can tell the two sides of the join apart.
package cn.edu.bjut.multiinput;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Value type emitted by both mappers: the payload string plus a flag
 * marking its source file (0 = phone record, 1 = user record).
 */
public class FlagDataType implements WritableComparable<FlagDataType> {

    private String info;
    private int flag;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(info);
        out.writeInt(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.info = in.readUTF();
        this.flag = in.readInt();
    }

    @Override
    public int compareTo(FlagDataType o) {
        // Integer.compare avoids the overflow risk of subtracting the flags.
        return Integer.compare(this.flag, o.getFlag());
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj instanceof FlagDataType) {  // instanceof is false for null
            FlagDataType o = (FlagDataType) obj;
            return this.info.equals(o.getInfo()) && this.flag == o.getFlag();
        }
        return false;
    }

    @Override
    public int hashCode() {
        // equals() is overridden, so hashCode() must be consistent with it.
        return 31 * info.hashCode() + flag;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }
}
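Since FlagDataType travels through Hadoop's serialization between map and reduce, a quick local round-trip check is worthwhile (a standalone sketch in the same package, not part of the job itself):

package cn.edu.bjut.multiinput;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlagDataTypeCheck {
    public static void main(String[] args) throws IOException {
        FlagDataType original = new FlagDataType();
        original.setInfo("good number");
        original.setFlag(0);

        // Serialize with write(), then rebuild a fresh instance with readFields().
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        FlagDataType copy = new FlagDataType();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(original.equals(copy));  // expected: true
    }
}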
3. Mapper 1 (phone file):
package cn.edu.bjut.multiinput;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Reads the phone file. For a line such as "123,good number" it emits
 * key = "123" (the phone number) and value = {info = "good number", flag = 0}.
 */
public class MultiMapper1 extends Mapper<LongWritable, Text, Text, FlagDataType> {

    private String delimiter;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // The field delimiter is configurable; it defaults to a comma.
        delimiter = context.getConfiguration().get("delimiter", ",");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        String[] arr = line.split(delimiter);
        if (2 == arr.length) {
            FlagDataType flagDataType = new FlagDataType();
            flagDataType.setInfo(arr[1].trim());  // the description
            flagDataType.setFlag(0);              // 0 marks a phone record
            context.write(new Text(arr[0]), flagDataType);
        }
    }
}
Mapper 2 (user file):
package cn.edu.bjut.multiinput;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Reads the user file. For a line such as "zhangsan,123" it emits
 * key = "123" (the phone number) and value = {info = "zhangsan", flag = 1}.
 */
public class MultiMapper2 extends Mapper<LongWritable, Text, Text, FlagDataType> {

    private String delimiter;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        delimiter = context.getConfiguration().get("delimiter", ",");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        String[] arr = line.split(delimiter);
        if (2 == arr.length) {
            FlagDataType flagDataType = new FlagDataType();
            flagDataType.setInfo(arr[0].trim());  // the user name
            flagDataType.setFlag(1);              // 1 marks a user record
            context.write(new Text(arr[1]), flagDataType);
        }
    }
}
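With the sample data, the two mappers together emit the intermediate pairs below; the shuffle then groups them by phone number before they reach the reducer:

123 -> {info="good number", flag=0}, {info="zhangsan", flag=1}
124 -> {info="common number", flag=0}, {info="lisi", flag=1}
125 -> {info="bad number", flag=0}, {info="wangwu", flag=1}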
4. Reducer:
package cn.edu.bjut.multiinput;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * All values sharing a phone number arrive together. The flag selects the
 * slot each value fills: arr[0] = description (flag 0), arr[1] = user name
 * (flag 1); arr[2] holds the phone number itself.
 */
public class MultiReducer extends Reducer<Text, FlagDataType, NullWritable, Text> {

    private String delimiter;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        delimiter = context.getConfiguration().get("delimiter", ",");
    }

    @Override
    protected void reduce(Text key, Iterable<FlagDataType> values, Context context)
            throws IOException, InterruptedException {
        String[] arr = new String[3];
        arr[2] = key.toString();
        for (FlagDataType flagDataType : values) {
            arr[flagDataType.getFlag()] = flagDataType.getInfo();
        }
        // Skip phone numbers that appear in only one file; emitting them
        // would print the string "null" for the missing side.
        if (null == arr[0] || null == arr[1]) {
            return;
        }
        // Required order: name, phone number, description,
        // e.g. "zhangsan,123,good number".
        context.write(NullWritable.get(), new Text(arr[1] + delimiter + arr[2] + delimiter + arr[0]));
    }
}
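To see the slot-filling logic in isolation, here is the reduce step for key "123" simulated without the Hadoop runtime (a standalone sketch in the same package):

package cn.edu.bjut.multiinput;

public class ReduceStepDemo {
    public static void main(String[] args) {
        // The two values grouped under key "123" by the shuffle.
        FlagDataType desc = new FlagDataType();
        desc.setInfo("good number");
        desc.setFlag(0);
        FlagDataType name = new FlagDataType();
        name.setInfo("zhangsan");
        name.setFlag(1);

        String[] arr = new String[3];
        arr[2] = "123";  // the key (phone number)
        for (FlagDataType v : new FlagDataType[]{desc, name}) {
            arr[v.getFlag()] = v.getInfo();  // the flag picks the slot
        }
        System.out.println(arr[1] + "," + arr[2] + "," + arr[0]);  // zhangsan,123,good number
    }
}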
5. Driver:
package cn.edu.bjut.multiinput;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("delimiter", ",");

        // Job.getInstance supersedes the deprecated new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "multi");
        job.setJarByClass(MainJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlagDataType.class);

        job.setReducerClass(MultiReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Each input path gets its own InputFormat and its own Mapper:
        // args[0] is the phone file, args[1] is the user file.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, MultiMapper1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, MultiMapper2.class);

        // Delete a stale output directory so reruns do not fail.
        Path outPath = new Path(args[2]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
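Assuming the job is packaged as multiinput.jar and the two datasets sit in HDFS under /input/phone and /input/user (all three paths are illustrative), the job runs as:

hadoop jar multiinput.jar cn.edu.bjut.multiinput.MainJob /input/phone /input/user /output

and /output/part-r-00000 should then contain:

zhangsan,123,good number
lisi,124,common number
wangwu,125,bad number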