pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>MongMapReduce</groupId>
<artifactId>MongMapReduce</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.4</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>3.2.2</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>casbah-core_2.11</artifactId>
<version>3.1.1</version>
</dependency>
</dependencies>
</project>
自定義的InputFormat
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.*;
import org.bson.Document;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class MongoDBInputFormat extends InputFormat <LongWritable, Document>{
public MongoDBInputFormat(){
}
public static class MongoDBInputSplit extends InputSplit implements Writable{
private long start;
private long end;
public MongoDBInputSplit(){
}
public MongoDBInputSplit(long start, long end) {
this.start = start;
this.end = end;
}
public long getStart() {
return start;
}
public void setStart(int start) {
this.start = start;
}
public long getEnd() {
return end;
}
public void setEnd(int end) {
this.end = end;
}
public long getLength() throws IOException, InterruptedException {
return this.end-this.start;
}
public String[] getLocations() throws IOException, InterruptedException {
return new String[0];
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(start);
dataOutput.writeLong(end);
}
public void readFields(DataInput dataInput) throws IOException {
this.start=dataInput.readLong();
this.end=dataInput.readLong();
}
}
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
String uri = context.getConfiguration().get("input");
String[] datas = uri.split("://");
String dbsName = datas[1].split("\\.")[0];
String tableName = datas[1].split("\\.")[1];
List<InputSplit> list = new ArrayList<InputSplit>();
MongoClient client = new MongoClient(datas[0],27017);
MongoDatabase database = client.getDatabase(dbsName);
MongoCollection<Document> collection = database.getCollection(tableName);
long count = collection.count();
long chunk = 2;
long chunksize = (count/2);
//判斷是否數據足夠。
if(chunksize==0){
if(count!=0){
MongoDBInputSplit mi = new MongoDBInputSplit(0,count);
list.add(mi);
}else{
new Exception("沒有數據");
}
}else{
//將數據進行切片,也就是一個map裏面有一個切片,一個切片有上面定義的chunk = 2 條數據。
for(int i = 0;i<chunksize;i++){
MongoDBInputSplit mi = null;
if(i+1==chunksize){
mi = new MongoDBInputSplit(i*chunk,count);
list.add(mi);
}else{
mi = new MongoDBInputSplit(i*chunk,i*chunk+chunk);
list.add(mi);
}
}
}
//切片集合。
return list;
}
public static class MongoDBRecordReader extends RecordReader<LongWritable, Document>{
private MongoDBInputSplit split;
//從MongDb中查出來的結果集
private MongoCursor<Document> dbcursor;
//定義索引,每次都會被初始化成0,也就是隻能讀取自己切片中的 k,v
private int index;
private LongWritable k; //偏移量,再下面會自動封裝成切片數據的開始,就會知道讀多少行 ,對應map泛型的第一個值。
private Document v; //每次讀到的結果,會通過返回出去,對應 map泛型的第二個。
//數據庫的一些信息。
String ip;
String dbsName;
String tableName;
public MongoDBRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
super();
initialize(split,context);
String uri = context.getConfiguration().get("input");
String[] datas = uri.split("://");
this.ip = datas[0];
this.dbsName = datas[1].split("\\.")[0];
this.tableName = datas[1].split("\\.")[1];
}
public MongoDBRecordReader(){
}
//初始化,將一些對象new出來,並把得到的切片(1個)強轉。
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.split = (MongoDBInputSplit)split;
this.k = new LongWritable();
v = new Document();
}
//讀取數據,並把數據封裝到當前MongoDBRecordReader的k v中。
public boolean nextKeyValue() throws IOException, InterruptedException {
//判斷dbcursor是否爲null
if(this.dbcursor == null){
//獲取dbcursor的值
// 獲取集合
MongoClient client = new MongoClient(ip,27017);
MongoDatabase database = client.getDatabase(dbsName);
MongoCollection<Document> collection = database.getCollection(tableName);
//獲取遊標
dbcursor = collection.find().skip((int) this.split.start).limit((int) this.split.getLength()).iterator();
}
//操作遊標
boolean hasNext = this.dbcursor.hasNext();
if(hasNext){
//獲取遊標的下一個值
Document next = this.dbcursor.next();
//下一個的key
this.k.set(this.split.start+index);
index ++;
//下一個value
this.v = next;
}
return hasNext;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return this.k;
}
public Document getCurrentValue() throws IOException, InterruptedException {
return this.v;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
}
}
public RecordReader<LongWritable, Document> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
return new MongoDBRecordReader(split,context);
}
}
自定義的OutputFormat
import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.bson.Document;
import java.io.IOException;
public class MongoDBOutputFormat <K,V> extends OutputFormat<K,V> {
public static class MongoDBRecordWriter<K,V> extends RecordWriter<K, V>{
public MongoCollection<Document> collection = null;
public MongoDBRecordWriter(){
}
public MongoDBRecordWriter(TaskAttemptContext context){
//獲取mongodb的連接
String uri = context.getConfiguration().get("output");
String[] datas = uri.split("://");
String ip = datas[0];
String dbsName = datas[1].split("\\.")[0];
String tableName = datas[1].split("\\.")[1];
MongoClient client = new MongoClient(ip,27017);
collection = client.getDatabase(dbsName).getCollection(tableName);
}
public void write(K key, V value) throws IOException, InterruptedException {
collection.insertOne(new Document(key.toString(),value.toString()));
}
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
}
}
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
return new MongoDBRecordWriter<K, V>(context);
}
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
}
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
return new FileOutputCommitter(null, context);
}
}
自定義的Driver類:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.bson.Document;
import java.io.IOException;
/**
 * Word-count driver that reads documents from MongoDB (via MongoDBInputFormat),
 * counts occurrences of each document's {@code "str"} field value, and writes
 * the totals back to MongoDB (via MongoDBOutputFormat).
 */
public class WordCount {

    /** Emits ("str" field value, 1) for every input document. */
    public static class MyMapper extends Mapper<LongWritable, Document, Text, IntWritable> {
        IntWritable iw = new IntWritable(1);
        Text text = new Text();

        @Override
        protected void map(LongWritable key, Document value, Context context)
                throws IOException, InterruptedException {
            System.out.println(value.toString());
            String str = (String) value.get("str");
            // BUGFIX: the original NPE'd on documents without a "str" field;
            // skip such records instead of crashing the task.
            if (str == null) {
                return;
            }
            text.set(str);
            context.write(text, iw);
        }
    }

    /** Sums the 1s emitted for each distinct word. */
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable iw = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int num = 0;
            for (IntWritable value : values) {
                num += value.get();
            }
            iw.set(num);
            context.write(key, iw);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // host://dbName.collectionName, consumed by the custom input/output formats
        conf.set("input", "localhost://db1.in1");
        conf.set("output", "localhost://db1.out10");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        // mapper / reducer classes for this job
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // wire in the custom MongoDB formats
        job.setInputFormatClass(MongoDBInputFormat.class);
        job.setOutputFormatClass(MongoDBOutputFormat.class);
        // submit and block until completion
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}