MapReduce Classic Example -- WordCount Code (Tested and Working)


I. The Three Phases of a MapReduce Job

1. The Map Phase

In the map phase, the job's input is split into individual MapTasks; each MapTask computes its own portion of the data, which is what makes the computation distributed.

The map phase has two steps:

1. Set the TextInputFormat class, which splits the input data into <k1,v1> pairs and passes them to the second step. The splitting into k1 and v1 is done for us by the Hadoop framework.

2. Write the custom map logic that converts the result of the first step into <k2,v2> pairs and emits them.

2. The Shuffle Phase

1. Partition the emitted <k2,v2> pairs.

2. Sort the data within each partition by key.

3. Pre-aggregate (combine) the grouped data to reduce the amount of data copied over the network.

4. Group the data, collecting all values with the same key into one collection.

Note: if you do not write any shuffle code, the default partitioning and grouping are used, so in a real project this step is optional and depends on your requirements; a sketch of what custom shuffle code can look like is shown below.
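For reference, here is a minimal, hypothetical sketch of what custom shuffle code could look like; it is not part of the original example. The class name WordLengthPartitioner, the word-length rule, and the driver calls are illustrative assumptions, and the combiner simply reuses WordCountReducer for local pre-aggregation.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative partitioner (assumption): sends short words and long words to different reducers.
public class WordLengthPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        return (key.toString().length() > 5 ? 1 : 0) % numPartitions;
    }
}

// In JobMain.run these calls would plug the custom shuffle steps into the job:
//   job.setPartitionerClass(WordLengthPartitioner.class);
//   job.setCombinerClass(WordCountReducer.class);  // combiner: local pre-aggregation before the network copy
//   job.setNumReduceTasks(2);                      // one reduce task per partition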

3. The Reduce Phase

The Reduce phase merges the results of the MapTasks and the shuffle into the final output.

Reduce has two steps:

1. Sort and merge the results of the multiple map tasks, then write a Reduce function with your own logic that processes the input key-value pairs and converts them into new key-value pairs (K3 and V3) for output.

2. Set TextOutputFormat to process and save the key-value data emitted by Reduce.

 

II. Writing the WordCount Code

1. Preparation

First make sure the Hadoop cluster starts correctly (both the HDFS cluster and the YARN cluster), then prepare a text file and upload it to HDFS; the details are not repeated here.
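For reference, the upload can be done with the HDFS shell. The local file name words.txt and the target directory /worldcount below are assumptions; substitute your own paths:

hdfs dfs -mkdir -p /worldcount
hdfs dfs -put words.txt /worldcount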

2. Dependencies to add to pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>hdfs_api</artifactId>
        <groupId>com.sora</groupId>
        <version>1.0-SNAPSHOT</version>
        <relativePath>../hdfs_api/pom.xml</relativePath>
    </parent>
    <packaging>jar</packaging>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>mapreduce</artifactId>

    <dependencies>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs-client</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.0.0</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>RELEASE</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>

    <!-- https://mvnrepository.com/artifact/org.apache.maven.plugins/maven-shade-plugin -->
    <!-- the shade plugin is a build plugin, not a dependency, so it belongs under build/plugins -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>

Note: be sure to add the packaging element, because the project will be packaged into a jar and executed on the Linux server.

3. Writing WordCountMapper

package com.sora.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * Mapper generic parameters:
 *  KEYIN    - type of K1: the line offset, LongWritable
 *  VALUEIN  - type of V1: the line of text, Text
 *  KEYOUT   - type of K2: a single word, Text
 *  VALUEOUT - type of V2: the fixed value 1, LongWritable
 **/
/**
 * map function: converts k1/v1 into k2/v2
 * k1     v1                     k2        v2
 * 0      hello,world           hello       1
 * 11     hello,hadoop          world       1
 *                              hello       1
 *                              hadoop      1
 **/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /**
     * key - k1   value - v1   context - the MapReduce context object that connects map and reduce
     **/
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // split the line of text into words
        String line = value.toString();
        String[] split = line.split(",");
        // iterate over the words and emit <word, 1> for each one
        for (String word : split) {
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}

4. Writing WordCountReducer

package com.sora.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * KEYIN    K2  Text         each word
 * VALUEIN  V2  LongWritable the element type of the value collection
 * KEYOUT   K3  Text         each word
 * VALUEOUT V3  LongWritable the number of times each word occurs
 **/
/**
 * reduce: converts k2/v2 into k3/v3
 * k2     v2            k3       v3
 * hello  <1,1>         hello    2
 * world  <1,1>         world    2
 * hadoop <1,1,1>       hadoop   3
 **/
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * key     K2
     * values  the collection of values for this key
     * context the MapReduce context object
     **/
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        LongWritable longWritable = new LongWritable();
        // iterate over the values collection
        for (LongWritable value : values) {
            // add up the values in the collection
            count += value.get();
        }
        longWritable.set(count);
        // write k3/v3 to the context
        context.write(key, longWritable);
    }
}

5. Writing JobMain

package com.sora.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

    // wires together the whole MapReduce pipeline: map -> shuffle -> reduce
    public int run(String[] strings) throws Exception {
        // create a job object
        Job job = Job.getInstance(super.getConf(), "mapreduce_wordcount");
        // required configuration when the jar is run on the cluster
        job.setJarByClass(JobMain.class);
        // 1. set the input format class that reads the file and produces k1/v1
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.159.128:8020/worldcount"));
        // 2. set the mapper class
        job.setMapperClass(WordCountMapper.class);
        // set the map output types (k2/v2)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3/4/5/6: use the default partitioning, sorting, combining and grouping
        // 7. set the reducer class
        job.setReducerClass(WordCountReducer.class);
        // set the reduce output types (k3/v3)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 8. set the output format class
        job.setOutputFormatClass(TextOutputFormat.class);
        // set the output path (it must not already exist)
        TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.159.128:8020/wordcount_out"));

        boolean b = job.waitForCompletion(true);

        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();  // configuration object holding the job settings
        // launch the job by calling the run method above
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }

}

6. Package the project into a jar, upload it to the Hadoop master node (the node running the NameNode and ResourceManager), and then run the hadoop command:

hadoop jar wordcount.jar  com.sora.mapreduce.JobMain

Once the job succeeds, the output files will appear in HDFS. Give it a try yourself — it really works!
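To check the result, the output can be read straight from HDFS (part-r-00000 is the default name of the first reducer's output file). Note that the job will fail if the output directory already exists, so delete it before re-running:

hdfs dfs -cat /wordcount_out/part-r-00000
hdfs dfs -rm -r /wordcount_out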

 

 
