文章目錄
需求: 在一堆給定的文本文件中統計輸出每一個單詞出現的總次數
Step 1. 數據格式準備
-
創建一個新的文件
cd /export/servers
vim wordcount.txt
-
向其中放入以下內容並保存
hello,world,hadoop hive,sqoop,flume,hello kitty,tom,jerry,world hadoop
-
上傳到 HDFS
hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
Step 2. Mapper
首先還是要配置pom.xml文件
<!--最後要打包成jar包-->
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.9</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
<!-- <verbose>true</verbose>-->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<minimizeJar>true</minimizeJar>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
創建一個包MapReduceDemo1
然後重新編寫mapper邏輯
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
四個泛型:
KEYIN:K1的類型
VALUEIN:V1的類型
KEYOUT:K2的類型
VALUEOUT:V2的類型
*/
// hadoop中數據需要經常進行序列化,因此在Java中的數據結構較爲臃腫
// 因此一般使用hadoop中的數據類型,如LongWritable, Text都是hadoop自帶的數據類型
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /*
    map converts (K1, V1) to (K2, V2):
    K1          V1
    0           hello,world,hadoop
    12          hdfs,hive,hello
    ---------------------------------
    K2          V2
    hello       1
    world       1
    hadoop      1
    hdfs        1
    hive        1
    hello       1
    */
    // Reused output objects: Hadoop serializes the key/value on every
    // context.write(), so a single instance can be recycled per record
    // instead of allocating new objects for each word (standard
    // MapReduce object-reuse optimization).
    private final Text outputKey = new Text();
    private final LongWritable one = new LongWritable(1);

    /**
     * Splits one input line into comma-separated words and emits a
     * (word, 1) pair for each.
     *
     * @param key     K1: byte offset of the line within the input split
     * @param value   V1: the line of text
     * @param context used to emit the (K2, V2) pairs
     * @throws IOException          if emitting a pair fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws InterruptedException, IOException {
        // Text cannot be split directly; convert to String first.
        String line = value.toString();
        // Words in the sample data are comma-separated.
        String[] split = line.split(",");
        // Emit (word, 1) for every word on the line.
        for (String word : split) {
            outputKey.set(word);
            context.write(outputKey, one);
        }
    }
}
Step 3. Reducer
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordCountReducer extends Reducer<Text, LongWritable,Text,LongWritable> {
    /*
    reduce converts (K2, V2) to (K3, V3):
    K2          V2
    hello       <1,1>
    world       <1>
    hadoop      <1>
    hdfs        <1>
    hive        <1>
    ---------------------------------
    K3          V3
    hello       2
    world       1
    hadoop      1
    hdfs        1
    hive        1
    */
    // Reused output value — avoids allocating a LongWritable per key
    // (Hadoop serializes the value on write(), so reuse is safe).
    private final LongWritable result = new LongWritable();

    /**
     * Sums the counts for one word and emits the total.
     *
     * @param key     K3: the word
     * @param values  all partial counts collected for this word
     * @param context used to emit the (K3, V3) pair
     * @throws IOException          if emitting the pair fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Sum every partial count for this word.
        for (LongWritable value : values) {
            count += value.get();
        }
        // Emit (word, total).
        result.set(count);
        context.write(key, result);
    }
}
Step 4. 定義主類, 描述 Job 並提交 Job
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.net.URI;
public class main extends Configured implements Tool {
    // NameNode address shared by the input path, output path and
    // FileSystem handle — kept in one place so it cannot drift.
    private static final String HDFS_URI = "hdfs://hadoop1:8020";

    /**
     * Configures and submits the word-count job, waiting for completion.
     *
     * @param args command-line arguments (unused by this job)
     * @return 0 on success, 1 on failure (ToolRunner convention)
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Create the job from the configuration ToolRunner injected into
        // this Tool (see main()), so -D / -conf options are honored.
        Job job = Job.getInstance(super.getConf(), "wordCount");
        // Required when running from a jar on the cluster so Hadoop can
        // locate this program's classes inside the jar.
        job.setJarByClass(main.class);

        // Step 1: input format — read each source line as (offset, text).
        job.setInputFormatClass(TextInputFormat.class);
        // Input lives on HDFS.
        TextInputFormat.addInputPath(job, new Path(HDFS_URI + "/wordcount"));
        // Local-file alternative:
        // TextInputFormat.addInputPath(job, new Path("file:///E:\\atest\\hello.txt"));

        // Step 2: mapper and its output (K2/V2) types.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Steps 3-6 (partition, sort, combine, group) use the defaults.

        // Step 7: reducer and its output (K3/V3) types.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Step 8: output format and destination path on HDFS.
        job.setOutputFormatClass(TextOutputFormat.class);
        Path outputPath = new Path(HDFS_URI + "/wordcount_out");
        // Use the job's configuration — not a fresh Configuration — so
        // any settings supplied on the command line still apply here.
        FileSystem fileSystem = FileSystem.get(new URI(HDFS_URI), super.getConf());
        // MapReduce refuses to start if the output directory already
        // exists, so remove a stale one left by a previous run.
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        TextOutputFormat.setOutputPath(job, outputPath);
        // Local-output alternative:
        // TextOutputFormat.setOutputPath(job,new Path("file:///E:\\atest\\output"));

        // Block until the job finishes; true = print progress to stdout.
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Program entry point: delegates to ToolRunner, which parses the
     * generic Hadoop options (-D, -conf, ...) before invoking {@link #run}.
     *
     * @param args command-line arguments forwarded to run()
     * @throws Exception if job submission fails
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Exit code: 0 means success, non-zero means failure.
        int run = ToolRunner.run(configuration, new main(), args);
        System.exit(run);
    }
}
如果有log4j Warning,可以在resources下新建一個log4j.properties
# Configure logging for testing: optionally with log file
#log4j.rootLogger=debug,appender
log4j.rootLogger=info,appender
#log4j.rootLogger=error,appender
# Output to the console
log4j.appender.appender=org.apache.log4j.ConsoleAppender
# Layout: TTCCLayout
log4j.appender.appender.layout=org.apache.log4j.TTCCLayout
在集羣上運行
1. 將程序打包爲jar包
打包後的jar包
MapReduceDemo-1.0-SNAPSHOT.jar是把所有依賴包都打包在一起
original-MapReduceDemo-1.0-SNAPSHOT.jar未打包依賴包
2.在集羣上運行MapReduce程序
- 上傳jar包
cd /export/servers/
mkdir jar_test
rz -E
- 運行程序
hadoop jar jar包名 主方法位置
hadoop jar MapReduceDemo-1.0-SNAPSHOT.jar MapReduceDemo1.main