準備
LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o
這是一行我們準備清洗的數據,它的每個數據的意思是(依次)
視頻唯一id 視頻上傳者 視頻年齡 視頻類別 視頻長度 觀看次數 視頻評分 流量 評論數 相關視頻id
要注意的是:
- 視頻類別:可能有多個分類,中間要以&分割,但是在有的數據中會以如下形式顯示
People & Blogs(「&」兩側帶有空格),我們要把多餘的空格處理掉
- 相關視頻id是以tab("\t")分割的,我們要將它們改用「&」連接(見下方清洗後的輸出示例)
- 有的電影沒有相關電影,我們要將這些數據過濾掉
瞭解需求後,我們開始做!
環境
IDEA + Maven + Hadoop
相關依賴
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
</dependencies>
代碼
Mapper代碼
public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // Reused output key so we don't allocate a new Text per record.
    Text k = new Text();

    /**
     * Cleans one raw input line via {@code ETLUtil.FormatString} and emits
     * the cleaned line as the key (no value needed, hence NullWritable).
     * Lines the cleaner rejects (null) are silently dropped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String cleaned = ETLUtil.FormatString(value.toString());
        if (cleaned == null) {
            // Dirty record — filter it out.
            return;
        }
        k.set(cleaned);
        context.write(k, NullWritable.get());
    }
}
ETL工具類:
package util;
public class ETLUtil {
    /**
     * Cleans one raw video record.
     *
     * Input format (tab separated):
     *   videoId uploader age category length views rate ratings comments relatedId...
     *
     * Cleaning rules (per the stated requirements):
     *  - records with fewer than 10 fields are dropped by returning null;
     *    this covers both truncated records and videos with NO related
     *    videos (exactly 9 fields), which must also be filtered out.
     *    (The original check was {@code < 9}, which wrongly kept the
     *    no-related-video records.)
     *  - spaces inside the category field are removed,
     *    e.g. "People & Blogs" -> "People&Blogs";
     *  - the related-video ids (field 10 onwards) are re-joined with "&",
     *    while the first 9 fields stay tab separated.
     *
     * @param s one raw input line
     * @return the cleaned line, or null if the record should be filtered out
     */
    public static String FormatString(String s) { // name kept as-is for existing callers
        String[] fields = s.split("\t");
        // Filter dirty records and records with no related videos.
        if (fields.length < 10) {
            return null;
        }
        // Strip the spaces around '&' inside the category field.
        fields[3] = fields[3].replace(" ", "");
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            sb.append(fields[i]);
            if (i < fields.length - 1) {
                // Tab between the 9 fixed fields, '&' between related ids.
                sb.append(i < 9 ? "\t" : "&");
            }
        }
        return sb.toString();
    }
}
驅動類 :
package etl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class ETLDriver implements Tool {
    private Configuration configuration;

    /**
     * Builds and runs the map-only ETL job.
     *
     * @param strings strings[0] = input path, strings[1] = output path
     * @return 0 on success, 1 on failure (standard {@link Tool} convention;
     *         the original code always returned 1 and never waited for the
     *         job, so success/failure was unknown)
     */
    public int run(String[] strings) throws Exception {
        if (strings.length < 2) {
            throw new IllegalArgumentException("Usage: ETLDriver <input path> <output path>");
        }
        // Create the job with the configuration injected by ToolRunner.
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ETLDriver.class);
        // Mapper only — cleaning needs no shuffle/reduce.
        job.setMapperClass(ETLMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));
        // Map-only job: skip the reduce phase entirely.
        job.setNumReduceTasks(0);
        // Block until the job finishes so the return code is meaningful
        // (submit() would return immediately without reporting success).
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public void setConf(Configuration configuration) {
        this.configuration = configuration;
    }

    public Configuration getConf() {
        return configuration;
    }

    // Entry point: delegate to ToolRunner and propagate the job result
    // as the process exit code.
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ETLDriver(), args));
    }
}
測試運行
我們在windows上測試運行了代碼,按照要求完成了相應的任務
取一條數據看看
Gnbls__5gdo ggagnisevidal 699 People&Blogs 132 15 0 0 0 FDz8KaArjOA&O1F8tm0kY44&zq_NPp6-zUY&EvtlRc_G9DA&gL5aFyBlucE&1pGjSJD35AU&QGkOy0_uoOM&NbjQ-lTYgvo&_62f9_ylrjg&SX1FY9pxrhw&ITeraiadbJA&ZZZADbubu0Y&4JhAswOQV1Y&mLeOiDF99Yo&BrdO9GagGoM&gij1PytzQNg&wkvCDCOGzGc&5pdG8PZjVog&l8k-5CA2PKY&_iCmluYaOyI
很nice!
打包,放到集羣上使用
雙擊
jar就在這