A Classic MapReduce Example -- the WordCount Code
I. The Three Phases of a MapReduce Job
1. The Map Phase
The map phase splits the work into individual MapTasks, and each MapTask processes only the slice of input it is responsible for; this is what distributes the computation.
The map phase has two steps:
1. Set the TextInputFormat class, which splits the input into <k1,v1> pairs and feeds them to step 2. The splitting into k1 and v1 is done for us by the Hadoop framework.
2. Write custom Map logic that transforms the <k1,v1> pairs from step 1 into <k2,v2> pairs and emits them.
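For example, given a file whose two lines are hello,world and hello,hadoop, TextInputFormat emits <0, "hello,world"> and <12, "hello,hadoop">, where each key is the byte offset at which its line starts in the file.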
2. The Shuffle Phase
1. Partition the emitted <k2,v2> pairs
2. Within each partition, sort the data by key
3. Apply a preliminary local reduction (combine) to the data, lowering the amount of data copied across the network
4. Group the data, placing all values that share the same key into one collection
Note: if you write no shuffle code, the default partitioning and grouping are used, so in a real project this step is optional depending on the requirements; a sketch of a custom partitioner follows below.
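If the defaults are not enough, you can plug in your own partitioner. The following is a minimal sketch under assumed requirements (the class name WordPartitioner and the two-partition scheme are hypothetical, not part of the original example): it routes words starting with a through m to reducer 0 and everything else to reducer 1.

package com.sora.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Hypothetical custom partitioner, replacing the default HashPartitioner
public class WordPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        String word = key.toString();
        // Words beginning with a-m go to partition 0, all others to partition 1
        if (!word.isEmpty() && Character.toLowerCase(word.charAt(0)) <= 'm') {
            return 0;
        }
        return 1 % numPartitions; // falls back to 0 when there is only one reducer
    }
}

To activate it you would call job.setPartitionerClass(WordPartitioner.class) and job.setNumReduceTasks(2) in the JobMain class shown later.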
3. The Reduce Phase
The reduce phase consolidates the results of the MapTasks and the shuffle above into the final output.
Reduce has two steps:
1. Sort and merge the results of the multiple Map tasks, then write a Reduce function implementing your own logic that processes the incoming Key-Value pairs and converts them into new Key-Value pairs (K3 and V3) for output.
2. Set TextOutputFormat to process and save the Key-Value data that Reduce outputs.
II. Writing the WordCount Code
1. Preparation
First make sure the Hadoop cluster starts normally (both the DFS and the YARN daemons), then prepare a text file and upload it to HDFS; the details are not repeated here, but a reference upload is sketched below.
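For reference, the upload looks roughly like this (the local file name words.txt and the HDFS directory /wordcount are assumptions; adjust them to your environment):

hdfs dfs -mkdir -p /wordcount
hdfs dfs -put words.txt /wordcount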
2. Dependencies to add in pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>hdfs_api</artifactId>
        <groupId>com.sora</groupId>
        <version>1.0-SNAPSHOT</version>
        <relativePath>../hdfs_api/pom.xml</relativePath>
    </parent>
    <packaging>jar</packaging>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>mapreduce</artifactId>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs-client</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>RELEASE</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <!-- maven-shade-plugin is a build plugin, not a dependency: it belongs
         under build/plugins so that mvn package produces a runnable jar -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Note: be sure to include the packaging element, because the project must be packaged and executed on the Linux server; the shade plugin configured above is what bundles the job into a runnable jar.
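Building is then the standard Maven invocation; the jar is produced under target/:

mvn clean package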
3. Writing WordCountMapper
package com.sora.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper generics:
 * KEYIN    the type of K1: the line's byte offset    LongWritable
 * VALUEIN  the type of V1: the line's text           Text
 * KEYOUT   the type of K2: a single word             Text
 * VALUEOUT the type of V2: the fixed value 1         LongWritable
 **/
/**
 * The map function turns k1/v1 into k2/v2:
 * k1   v1              k2       v2
 * 0    hello,world     hello    1
 * 12   hello,hadoop    world    1
 *                      hello    1
 *                      hadoop   1
 **/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Output objects are created once and reused across map() calls;
    // context.write() serializes their contents immediately, so this is safe
    private final Text text = new Text();
    private final LongWritable longWritable = new LongWritable();
    /**
     * key - k1, value - v1, context - the MapReduce context object that connects map and reduce
     **/
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each line of text on commas
        String line = value.toString();
        String[] split = line.split(",");
        // Iterate over the array and emit each word with the count 1
        for (String word : split) {
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}
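A note on the design: the Text and LongWritable output objects are created once as fields and reused on every map() call. Since map() runs once per input line and context.write() serializes the current contents immediately, the reuse is safe and avoids allocating millions of short-lived objects on large inputs.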
4. Writing WordCountReducer
package com.sora.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer generics:
 * KEYIN    K2  Text          a single word
 * VALUEIN  V2  LongWritable  the element type inside the value collection
 * KEYOUT   K3  Text          a single word
 * VALUEOUT V3  LongWritable  the number of times the word occurs
 **/
/**
 * The reduce function turns k2/v2 into k3/v3:
 * k2       v2        k3       v3
 * hello    <1,1>     hello    2
 * world    <1,1>     world    2
 * hadoop   <1,1,1>   hadoop   3
 **/
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    // Reused output object, same pattern as in the mapper
    private final LongWritable longWritable = new LongWritable();
    /**
     * key - K2, values - the collection of counts for this key, context - the MapReduce context object
     **/
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Iterate over the values collection and add the counts up
        for (LongWritable value : values) {
            count += value.get();
        }
        longWritable.set(count);
        // Write k3/v3 to the context
        context.write(key, longWritable);
    }
}
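Since word-count summation is associative and commutative, this same reducer can also be registered as a combiner to do the local pre-aggregation described in shuffle step 3, cutting the amount of data shuffled over the network. In the JobMain class below you would add a single line:

// Optional: run the reducer as a combiner on each mapper's output
job.setCombinerClass(WordCountReducer.class);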
5. Writing JobMain
package com.sora.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
    // Wires together the whole MapReduce pipeline: map, shuffle, reduce
    public int run(String[] strings) throws Exception {
        // Create the job object
        Job job = Job.getInstance(super.getConf(), "mapreduce_wordcount");
        // Required configuration for running from a jar on the cluster
        job.setJarByClass(JobMain.class);
        // 1. Set the class that reads the input file and generates k1/v1
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.159.128:8020/wordcount"));
        // 2. Set the mapper class
        job.setMapperClass(WordCountMapper.class);
        // Set the map-phase output types (k2/v2)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3/4/5/6: use the default partitioning, sorting, combining and grouping
        // 7. Set the reducer class
        job.setReducerClass(WordCountReducer.class);
        // Set the reduce-phase output types (k3/v3)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 8. Set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output path (it must not exist yet; see the note below)
        TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.159.128:8020/wordcount_out"));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration(); // holds the job's configuration
        // Launch the job: ToolRunner invokes the run() method overridden above
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run); // propagate the job's exit code to the shell
    }
}
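One common pitfall: the job fails with an error if the output directory already exists, so delete it before re-running:

hdfs dfs -rm -r /wordcount_out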
6. Package the project into a jar, upload it to the Hadoop master server (the node that runs the NameNode and ResourceManager), and execute the hadoop command:
hadoop jar wordcount.jar com.sora.mapreduce.JobMain
Once the job succeeds, the output files appear in HDFS. Give it a try yourselves; I have tested it and it works!
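To inspect the result you can cat the output file (with the default single reducer, TextOutputFormat names it part-r-00000):

hdfs dfs -cat /wordcount_out/part-r-00000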