MapReduce with MongoDB
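
This post walks through running a Hadoop MapReduce word count directly against MongoDB: a custom InputFormat slices a collection into splits, a RecordReader streams documents to the mapper, a custom OutputFormat writes the reduce results back with insertOne, and a driver class wires it all together.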

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>MongMapReduce</groupId>
    <artifactId>MongMapReduce</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.4</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>3.2.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>casbah-core_2.11</artifactId>
            <version>3.1.1</version>
        </dependency>
    </dependencies>
</project>
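
Both formats below locate their collection through a small pseudo-URI stored in the job configuration. As a worked example of the convention (the URI value is the one the driver sets at the end of this post; the class name is only illustrative):

public class UriConventionDemo {
    public static void main(String[] args) {
        String uri = "localhost://db1.in1";           // host://database.collection
        String[] datas = uri.split("://");            // ["localhost", "db1.in1"]
        String dbsName = datas[1].split("\\.")[0];    // "db1"
        String tableName = datas[1].split("\\.")[1];  // "in1"
        System.out.println(datas[0] + " " + dbsName + " " + tableName);
    }
}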

Custom InputFormat

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.*;
import org.bson.Document;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class MongoDBInputFormat extends InputFormat<LongWritable, Document> {

    public MongoDBInputFormat() {
    }

    public static class MongoDBInputSplit extends InputSplit implements Writable {
        private long start;
        private long end;

        // Hadoop deserializes splits via reflection, so the no-arg constructor is required.
        public MongoDBInputSplit() {
        }

        public MongoDBInputSplit(long start, long end) {
            this.start = start;
            this.end = end;
        }

        public long getStart() {
            return start;
        }

        public void setStart(long start) {
            this.start = start;
        }

        public long getEnd() {
            return end;
        }

        public void setEnd(long end) {
            this.end = end;
        }

        public long getLength() throws IOException, InterruptedException {
            return this.end - this.start;
        }

        public String[] getLocations() throws IOException, InterruptedException {
            return new String[0];
        }

        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeLong(start);
            dataOutput.writeLong(end);
        }

        public void readFields(DataInput dataInput) throws IOException {
            this.start = dataInput.readLong();
            this.end = dataInput.readLong();
        }
    }
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // The "input" value follows the convention host://database.collection.
        String uri = context.getConfiguration().get("input");
        String[] datas = uri.split("://");
        String dbsName = datas[1].split("\\.")[0];
        String tableName = datas[1].split("\\.")[1];
        List<InputSplit> list = new ArrayList<InputSplit>();
        MongoClient client = new MongoClient(datas[0], 27017);
        MongoDatabase database = client.getDatabase(dbsName);
        MongoCollection<Document> collection = database.getCollection(tableName);
        long count = collection.count();
        client.close();
        long chunk = 2;
        long chunksize = count / chunk;
        // Check whether there is enough data for more than one split.
        if (chunksize == 0) {
            if (count != 0) {
                MongoDBInputSplit mi = new MongoDBInputSplit(0, count);
                list.add(mi);
            } else {
                throw new IOException("No data in collection " + tableName);
            }
        } else {
            // Slice the data: each map task processes one split, and each split covers
            // chunk (= 2) documents as defined above; the last split absorbs any remainder.
            for (int i = 0; i < chunksize; i++) {
                MongoDBInputSplit mi = null;
                if (i + 1 == chunksize) {
                    mi = new MongoDBInputSplit(i * chunk, count);
                    list.add(mi);
                } else {
                    mi = new MongoDBInputSplit(i * chunk, i * chunk + chunk);
                    list.add(mi);
                }
            }
        }
        // Return the list of splits.
        return list;
    }

    public static class MongoDBRecordReader extends RecordReader<LongWritable, Document> {
        private MongoDBInputSplit split;
        // Client and result cursor for the documents fetched from MongoDB.
        private MongoClient client;
        private MongoCursor<Document> dbcursor;
        // Index within the split; it starts at 0 for every reader,
        // so each reader only walks the key/value pairs of its own split.
        private int index;
        private LongWritable k; // The offset key: the split's start plus the index, so the mapper
                                // knows which record it is reading; first map generic type.
        private Document v;     // The document read on each step, handed to the mapper;
                                // second map generic type.

        // Connection details for the database.
        String ip;
        String dbsName;
        String tableName;

        public MongoDBRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            super();
            initialize(split, context);
            String uri = context.getConfiguration().get("input");
            String[] datas = uri.split("://");
            this.ip = datas[0];
            this.dbsName = datas[1].split("\\.")[0];
            this.tableName = datas[1].split("\\.")[1];
        }

        public MongoDBRecordReader() {
        }

        // Initialization: create the key/value objects and cast the single split we were given.
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            this.split = (MongoDBInputSplit) split;
            this.k = new LongWritable();
            this.v = new Document();
        }

        // Read the next record and store it in this reader's k and v.
        public boolean nextKeyValue() throws IOException, InterruptedException {
            // Lazily open the cursor on first use.
            if (this.dbcursor == null) {
                // Connect and look up the collection.
                client = new MongoClient(ip, 27017);
                MongoDatabase database = client.getDatabase(dbsName);
                MongoCollection<Document> collection = database.getCollection(tableName);

                // Open a cursor restricted to this split's range.
                dbcursor = collection.find()
                        .skip((int) this.split.getStart())
                        .limit((int) this.split.getLength())
                        .iterator();
            }
            // Advance the cursor.
            boolean hasNext = this.dbcursor.hasNext();
            if (hasNext) {
                // Fetch the next document.
                Document next = this.dbcursor.next();
                // The key is the document's offset within the collection.
                this.k.set(this.split.getStart() + index);
                index++;
                // The value is the document itself.
                this.v = next;
            }
            return hasNext;
        }

        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return this.k;
        }

        public Document getCurrentValue() throws IOException, InterruptedException {
            return this.v;
        }

        public float getProgress() throws IOException, InterruptedException {
            return 0;
        }

        public void close() throws IOException {
            // Release the cursor and the client connection.
            if (dbcursor != null) {
                dbcursor.close();
            }
            if (client != null) {
                client.close();
            }
        }
    }

    public RecordReader<LongWritable, Document> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        return new MongoDBRecordReader(split,context);
    }
}
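
To make the split arithmetic in getSplits concrete, here is a hypothetical walk-through (the class name is only illustrative) for a 5-document collection with chunk = 2:

public class SplitMathDemo {
    public static void main(String[] args) {
        long count = 5, chunk = 2;
        long chunksize = count / chunk; // 2 splits
        for (int i = 0; i < chunksize; i++) {
            long start = i * chunk;
            long end = (i + 1 == chunksize) ? count : start + chunk;
            System.out.println("split [" + start + ", " + end + ")");
        }
        // Prints split [0, 2) and split [2, 5): the last split absorbs the odd remainder.
    }
}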

Custom OutputFormat

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.bson.Document;
import java.io.IOException;

public class MongoDBOutputFormat<K, V> extends OutputFormat<K, V> {

    public static class MongoDBRecordWriter<K, V> extends RecordWriter<K, V> {
        public MongoClient client = null;
        public MongoCollection<Document> collection = null;

        public MongoDBRecordWriter() {
        }

        public MongoDBRecordWriter(TaskAttemptContext context) {
            // Open the MongoDB connection; "output" also follows host://database.collection.
            String uri = context.getConfiguration().get("output");
            String[] datas = uri.split("://");
            String ip = datas[0];
            String dbsName = datas[1].split("\\.")[0];
            String tableName = datas[1].split("\\.")[1];

            client = new MongoClient(ip, 27017);
            collection = client.getDatabase(dbsName).getCollection(tableName);
        }

        // Each reduce output pair becomes one document: the key is the field name,
        // the value its (stringified) content.
        public void write(K key, V value) throws IOException, InterruptedException {
            collection.insertOne(new Document(key.toString(), value.toString()));
        }

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            if (client != null) {
                client.close();
            }
        }
    }

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
        return new MongoDBRecordWriter<K, V>(context);
    }

    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        // Nothing to verify: MongoDB creates the output collection on first insert.
    }

    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
        // Reuse FileOutputCommitter with a null path; there is no filesystem output to commit.
        return new FileOutputCommitter(null, context);
    }
}
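
Note that each word/count pair is stored as its own document, with the count serialized as a string. A minimal sketch for inspecting what the writer produced in db1.out10 (assuming the driver configuration below and a local mongod; the class name is only illustrative):

import com.mongodb.MongoClient;
import org.bson.Document;

public class InspectOutput {
    public static void main(String[] args) {
        MongoClient client = new MongoClient("localhost", 27017);
        for (Document d : client.getDatabase("db1").getCollection("out10").find()) {
            // Each document looks like { "_id": ..., "hello": "3" } -- counts are strings.
            System.out.println(d.toJson());
        }
        client.close();
    }
}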

Custom Driver class:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.bson.Document;

import java.io.IOException;

public class WordCount {
    public static class MyMapper extends Mapper<LongWritable, Document,Text,IntWritable>{
        IntWritable iw = new IntWritable(1);
        Text text = new Text();
        @Override
        protected void map(LongWritable key, Document value, Context context) throws IOException, InterruptedException {
            // Debug output: print the raw document.
            System.out.println(value.toString());
            // Each input document is expected to carry its text in the "str" field.
            String str = (String) value.get("str");
            text.set(str);
            context.write(text, iw);
        }
    }
    public static class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        IntWritable iw = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int num = 0;
            for(IntWritable value:values){
                num+=value.get();
            }
            iw.set(num);
            context.write(key,iw);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Pseudo-URIs in the form host://database.collection, parsed by the custom formats above.
        conf.set("input", "localhost://db1.in1");
        conf.set("output", "localhost://db1.out10");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        // Specify the mapper class this job should use.
        job.setMapperClass(MyMapper.class);
        // Specify the reducer class this job should use.
        job.setReducerClass(MyReducer.class);
        // Specify the map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Specify the final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(MongoDBInputFormat.class);
        job.setOutputFormatClass(MongoDBOutputFormat.class);
        // Submit (e.g. to YARN) and wait for completion.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
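
The mapper above expects every input document to carry its text in a "str" field. A minimal sketch for seeding db1.in1 before running the job (assuming a local mongod on the default port; the class name and sample words are only illustrative):

import com.mongodb.MongoClient;
import org.bson.Document;
import java.util.Arrays;

public class SeedInput {
    public static void main(String[] args) {
        MongoClient client = new MongoClient("localhost", 27017);
        // Insert a few documents, each with the "str" field the mapper reads.
        client.getDatabase("db1").getCollection("in1").insertMany(Arrays.asList(
                new Document("str", "hello"),
                new Document("str", "world"),
                new Document("str", "hello")
        ));
        client.close();
    }
}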
