Notes:
Test file:

echo -e "aa\tbb \tcc\nbb\tcc\tdd" > 3.txt
hadoop fs -put 3.txt /tmp/3.txt

All the examples in this article use this file as the test case: counting the number of occurrences of each word (WordCount).
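For reference, the file holds two tab-separated rows, aa / bb (with a trailing space) / cc and bb / cc / dd, so an ideal word count over it, assuming that trailing space is stripped, is:

aa	1
bb	2
cc	2
dd	1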
1. The native approach: compile the Java source, package it into a jar, and have the hadoop script schedule and run it. Example:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        /*
         * LongWritable, IntWritable and Text are Hadoop classes that wrap
         * Java data types. They implement the WritableComparable interface,
         * so they can all be serialized for data exchange in a distributed
         * environment; think of them as replacements for long, int and
         * String respectively.
         */
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // submit the job to a specific scheduler queue
        conf.set("mapred.job.queue.name", "regular");
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
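The jar used below can be built from this source. A minimal sketch, assuming the Hadoop 0.20.203.0 layout used later in this article (the directory and jar names are assumptions; adjust them to your install):

mkdir wordcount_classes
javac -classpath $HADOOP_HOME/hadoop-core-0.20.203.0.jar -d wordcount_classes WordCount.java
jar -cvf /tmp/wordcount.jar -C wordcount_classes/ .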
Run:

bin/hadoop jar /tmp/wordcount.jar WordCount /tmp/3.txt /tmp/5
Result:

hadoop fs -cat /tmp/5/*
References:
Hadoop Map/Reduce: tracking the changes in the new Hadoop API through the WordCount example
http://blog.csdn.net/derekjiang/article/details/6836209
Running and explaining the Hadoop WordCount example program
http://samuschen.iteye.com/blog/763940
The official WordCount v1.0 example
http://hadoop.apache.org/docs/r1.1.1/mapred_tutorial.html#Example%3A+WordCount+v1.0
2. The MR-based, SQL-like dataflow scripting language: Pig
A1 = load '/data/3.txt';
A = stream A1 through `sed "s/\t/ /g"`;
B = foreach A generate flatten(TOKENIZE((chararray)$0)) as word;
C = filter B by word matches '\\w+';
D = group C by word;
E = foreach D generate COUNT(C), group;
dump E;
Note: watch how different delimiters affect load and the $0 reference that follows it.
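To run the script (assuming Pig is installed and pointed at the cluster; the file name wordcount.pig is hypothetical):

pig wordcount.pig

For quick iteration, pig -x local wordcount.pig runs the same script against the local filesystem instead of HDFS.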
For details, see:
https://gist.github.com/186460
http://www.slideshare.net/erikeldridge/a-brief-handson-introduction-to-hadoop-pig
3. The SQL-like language for building data warehouses: Hive
create table textlines(text string);
load data inpath '/data/3.txt' overwrite into table textlines;
SELECT wordColumn, count(1) FROM textlines
LATERAL VIEW explode(split(text, '\t+')) wordTable AS wordColumn
GROUP BY wordColumn;
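The statements can be typed into the interactive hive shell one by one, or saved into a file and run in one go (the file name wordcount.hql is hypothetical):

hive -f wordcount.hql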
For details, see:
http://my.oschina.net/leejun2005/blog/83045
http://blog.csdn.net/techdo/article/details/7433222
4. The cross-platform scripting language: Python
map: the core of the mapper splits each input line on tabs and emits every field:

for i in line.strip().split("\t"):

reduce: the reducer reads the mapper's output from stdin, tallies counts into a dict (arr), and prints each word with its count:

for words in sys.stdin:
    ...
for k, v in arr.items():
    print str(k) + ": " + str(v)

(Excerpts only; a complete sketch of both scripts follows.)
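A minimal complete sketch of the two scripts, built around the excerpts above and written in Python 2 to match the print statements. The variable names and the "key: value" output format come from the excerpts; the rest (the shebang, the stripping, the dict update) is an assumption:

map.py:

#!/usr/bin/env python
# map.py: emit every tab-separated field of every input line
import sys

for line in sys.stdin:
    for i in line.strip().split("\t"):
        print i

reduce.py:

#!/usr/bin/env python
# reduce.py: tally the words emitted by the mapper, then print the counts
import sys

arr = {}
for words in sys.stdin:
    word = words.strip()
    arr[word] = arr.get(word, 0) + 1
for k, v in arr.items():
    print str(k) + ": " + str(v)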
Finally, run it from the shell:

hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming-0.20.203.0.jar -file map.py -file reduce.py -mapper map.py -reducer reduce.py -input /data/3.txt -output /data/py
Note: the scripts must start with a shebang line that specifies the interpreter, and they must be made executable, as shown below.
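With the shebang lines already in place, as in the sketches above:

chmod +x map.py reduce.py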
For details, see:
http://blog.csdn.net/jiedushi/article/details/7390015
5. The Swiss Army knife of Linux: shell scripting
A console transcript of a streaming run (reusing map.py and reduce.py from section 4; a pure-shell mapper/reducer sketch follows the transcript):

june@deepin:~/hadoop/hadoop-0.20.203.0/tmp>
hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming-0.20.203.0.jar -file map.py -file reduce.py -mapper map.py -reducer reduce.py -input /data/3.txt -output /data/py
packageJobJar: [map.py, reduce.py, /home/june/data_hadoop/tmp/hadoop-unjar2676221286002400849/] [] /tmp/streamjob8722854685251202950.jar tmpDir=null
12/10/14 21:57:00 INFO mapred.FileInputFormat: Total input paths to process : 1
12/10/14 21:57:00 INFO streaming.StreamJob: getLocalDirs(): [/home/june/data_hadoop/tmp/mapred/local]
12/10/14 21:57:00 INFO streaming.StreamJob: Running job: job_201210141552_0041
12/10/14 21:57:00 INFO streaming.StreamJob: To kill this job, run:
12/10/14 21:57:00 INFO streaming.StreamJob: /home/june/hadoop/hadoop-0.20.203.0/bin/../bin/hadoop job -Dmapred.job.tracker=localhost:9001 -kill job_201210141552_0041
12/10/14 21:57:00 INFO streaming.StreamJob: Tracking URL: http://localhost:50030/jobdetails.jsp?jobid=job_201210141552_0041
12/10/14 21:57:01 INFO streaming.StreamJob: map 0% reduce 0%
12/10/14 21:57:13 INFO streaming.StreamJob: map 67% reduce 0%
12/10/14 21:57:19 INFO streaming.StreamJob: map 100% reduce 0%
12/10/14 21:57:22 INFO streaming.StreamJob: map 100% reduce 22%
12/10/14 21:57:31 INFO streaming.StreamJob: map 100% reduce 100%
12/10/14 21:57:37 INFO streaming.StreamJob: Job complete: job_201210141552_0041
12/10/14 21:57:37 INFO streaming.StreamJob: Output: /data/py
june@deepin:~/hadoop/hadoop-0.20.203.0/tmp>
hadoop fs -cat /data/py/part-00000
...
june@deepin:~/hadoop/hadoop-0.20.203.0/tmp>
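Since this section is about the shell, note that the mapper and reducer can also be plain shell commands, with no script files at all. A minimal sketch (an assumption, not taken from the run above): tr splits each tab-separated line into one word per line, streaming sorts the mapper output before the reduce phase, and uniq -c then counts the adjacent duplicates:

hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming-0.20.203.0.jar \
    -input /data/3.txt -output /data/sh \
    -mapper "tr '\t' '\n'" \
    -reducer "uniq -c"

Note that this prints "count word" pairs rather than "word count".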
A special reminder: some of the methods above ignore trailing spaces after a field while others count them, so examine the results carefully.
Summary: the methods above are listed mainly to offer different ways of thinking about the problem.
When solving a problem, both development efficiency and execution efficiency need to be weighed; don't lock yourself into any single method.