Hadoop_MapReducer_簡單實用與實例

原創

2018-12-13 13:50

Mapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Mapper of an inverted-index job: for every word of every input line it
 * emits the pair (word, name of the file the line came from).
 *
 * <p>Writable types commonly used as keys/values:
 * <ul>
 *   <li>{@code LongWritable} - wraps a {@code long}</li>
 *   <li>{@code IntWritable} - wraps an {@code int}</li>
 *   <li>{@code NullWritable} - placeholder when a key or value carries no data</li>
 *   <li>{@code Text} - string-like type</li>
 * </ul>
 *
 * <p>The four type parameters of {@code Mapper} are, in order:
 * <ol>
 *   <li>input key type - here the offset of the line (assumes the default
 *       text input format; confirm if another input format is configured)</li>
 *   <li>input value type - here the content of the line</li>
 *   <li>output key type sent to the reducer - here the word</li>
 *   <li>output value type sent to the reducer - here the file name</li>
 * </ol>
 */
public class FileMapper extends Mapper<LongWritable, Text, Text, Text> {

  /** Output key, reused across calls so no new object is allocated per word. */
  private final Text word = new Text();

  /** Output value, reused across calls; holds the name of the current input file. */
  private final Text fileName = new Text();

  /**
   * Splits one input line on single spaces and writes (word, file name) for
   * each non-empty word.
   *
   * @param ikey    key of the input record (unused)
   * @param ivalue  one line of input text
   * @param context task context used to look up the input split and to emit output
   * @throws IOException          if writing the output fails
   * @throws InterruptedException if the task is interrupted while writing
   */
  public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
    // The file name (last path component, not the directory) is the same for
    // every word of the line, so resolve it once before the loop.
    // NOTE(review): the cast assumes a file-based input format that hands out FileSplit.
    FileSplit fs = (FileSplit) context.getInputSplit();
    fileName.set(fs.getPath().getName());

    // Work on the line as a String; convert back to Text only when emitting.
    for (String token : ivalue.toString().split(" ")) {
      // Blank lines, leading spaces and repeated spaces produce empty tokens;
      // they are not words and must not become keys of the index.
      if (token.isEmpty()) {
        continue;
      }
      word.set(token);
      context.write(word, fileName);
    }
  }
}

Reducer

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer of the inverted-index job: for each word it writes the set of
 * distinct file names received for that word.
 *
 * <p>The first and second type parameters are the key/value types received
 * from the mapper; the third and fourth are the key/value types written to
 * the output.
 */
public class FileReducer extends Reducer<Text, Text, Text, Text> {

  /**
   * Collapses all file names received for one word into a duplicate-free set
   * and writes (word, string form of that set).
   *
   * @param _key    the word
   * @param values  file names emitted by the mappers for this word
   * @param context task context used to emit the output record
   * @throws IOException          if writing the output fails
   * @throws InterruptedException if the task is interrupted while writing
   */
  public void reduce(Text _key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    // A Set drops the duplicates that arise when a word occurs several times in one file.
    Set<String> distinctFiles = new HashSet<>();

    for (Text fileName : values) {
      // Store the String copy rather than the Text object itself.
      String copy = fileName.toString();
      distinctFiles.add(copy);
    }

    String joined = distinctFiles.toString();
    Text outputValue = new Text(joined);
    context.write(_key, outputValue);
  }

}
//demo 取出key對應的最大的值
//IntWritable max= new IntWritable(0);
////地址複用 max和val公用一個地址
//for (IntWritable val : values) {
// if(max.get() < val.get()){
// max = val; //Hadoop 在迭代時會重用同一個 val 對象，這裏只保存了引用（地址覆蓋），所以取不到最大值；應改爲 max.set(val.get());
// }
//}
//context.write(_key, max);

Driver

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FileDriver {

/**
 * Configures and runs the inverted-index job, then terminates the JVM with
 * exit status 0 when the job succeeds and 1 when it fails.
 *
 * @param args optional overrides: {@code args[0]} is the input path and
 *             {@code args[1]} the output path; the built-in HDFS locations
 *             are used for whichever is missing
 * @throws Exception if the job cannot be configured, submitted or monitored
 */
public static void main(String[] args) throws Exception {
  String inputPath = args.length > 0 ? args[0] : "hdfs://192.168.153.129:9000/demo/txt/invert";
  String outputPath = args.length > 1 ? args[1] : "hdfs://192.168.153.129:9000/demoresult/invert";

  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf, "JobName");
  // Refer to the class by its simple name so this line compiles whatever
  // package the file is placed in.
  job.setJarByClass(FileDriver.class);

  // Mapper and reducer implementations.
  job.setMapperClass(FileMapper.class);
  job.setReducerClass(FileReducer.class);

  // Output key/value types. They are set once here because the mapper and the
  // reducer both emit Text/Text; if the mapper's output types differed, they
  // would additionally be declared with setMapOutputKeyClass / setMapOutputValueClass.
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Where the mappers read their input and where the reducers write their result.
  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  // Report the outcome through the exit status so scripts and schedulers can
  // detect a failed job instead of always seeing 0.
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}

}

序列化/反序列化

有些數據用普通類型已經無法完整表示，因此需要封裝成自定義類型；但 Hadoop 無法直接識別自定義類型，所以該類型必須支持序列化和反序列化，才能用於輸入和輸出。

做法就是寫一個普通的 JavaBean，實現 Writable 接口，並重寫其中的 readFields 和 write 方法：

/**
 * Restores this object's state from {@code in}: first the name as a UTF
 * string, then the age as an int - the same order in which {@code write}
 * emits them.
 *
 * @param in source to read the serialized fields from
 * @throws IOException if reading from {@code in} fails
 */
@Override
public void readFields(DataInput in) throws IOException {
  name = in.readUTF();
  age = in.readInt();
}
/**
 * Serializes this object's state to {@code out}: first the name as a UTF
 * string, then the age as an int - the same order in which
 * {@code readFields} consumes them.
 *
 * @param out sink to write the serialized fields to
 * @throws IOException if writing to {@code out} fails
 */
@Override
public void write(DataOutput out) throws IOException {
  out.writeUTF(this.name);
  out.writeInt(this.age);
}

String類型對應 writeUTF / readUTF

int類型對應 writeInt / readInt

目前只接觸到這兩種

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Hadoop_MapReducer_簡單實用與實例

web開發過程中的路徑問題

for循環優化

JDK1.8 stream 解讀

ObjectMapper

Hadoop全分佈式安裝

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結