HDFS的java-API操作
簡介
hdfs在生產應用中主要是客戶端的開發,其核心步驟是從hdfs提供的api中構造一個hdfs的訪問客戶端對象,然後通過該客戶端對象操作(增刪改查)hdfs上的文件。
導入依賴包(maven)
利用maven導入hadoop開發相關的依賴包,pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Parent POM for the HDFS client examples (module: hdfs). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>hadoop</groupId>
<artifactId>hadoop</artifactId>
<packaging>pom</packaging>
<version>1.0-SNAPSHOT</version>
<modules>
<module>hdfs</module>
</modules>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>3.0.3</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<defaultGoal>compile</defaultGoal>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<!-- BUG FIX: the original snippet was missing the closing project tag,
     making the XML invalid. -->
</project>
注:如需手動引入jar包,hdfs相關的jar包位於hadoop安裝目錄的share目錄下。
獲取API中的客戶端對象
獲取到的fs對象就是DistributedFileSystem的實例,通過fs可以直接對hdfs上的文件進行操作(增刪改查)。
1. 獲取客戶端對象fs(默認配置)
在java中操作hdfs,首先要獲得一個客戶端實例
Configuration conf = getConf(); // loads core-site.xml etc. from the classpath
FileSystem fs = FileSystem.get(conf); // client implementation chosen from fs.defaultFS
2. 獲取客戶端對象fs(set)
Configuration conf = getConf();
conf.set("fs.defaultFS", "hdfs://hadoop1:9000"); // explicitly point at the NameNode
conf.set("dfs.replication", "3"); // replication factor
FileSystem fs = FileSystem.get(conf);
3. 獲取客戶端對象fs(URI)
Configuration conf = getConf();
// cluster address passed explicitly as a URI, plus configuration
fs = FileSystem.get(URI.create("hdfs://172.16.0.4:9000"), conf);
4. get方法是如何判斷具體實例化了哪種客戶端呢?
首先從conf中的參數fs.defaultFS的配置值判斷,如果我們的代碼中沒有指定fs.defaultFS,並且工程classpath下也沒有給定相應的配置,conf中的默認值就來自於hadoop的jar包中的core-default.xml,默認值爲: file:///,則獲取的將不是一個DistributedFileSystem的實例,而是一個本地文件系統的客戶端對象。因此,推薦使用默認值打包到安裝有hadoop客戶端的Linux上運行。
java-API基本操作
- 上傳文件
// Uploads a local file to HDFS (run as: hadoop jar ... -Dinpath=... -Doutpath=...).
// extends Configured: supplies the Hadoop Configuration object.
// implements Tool: lets ToolRunner parse generic -D command-line options.
public class Put extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        // BUG FIX: propagate the tool's return code to the shell (the original
        // ignored it; PutSeq below already does this correctly).
        System.exit(ToolRunner.run(new Put(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path src = new Path(conf.get("inpath"));   // local source path
        Path dst = new Path(conf.get("outpath"));  // HDFS destination path
        fs.copyFromLocalFile(src, dst);
        return 0;
    }
}
- 下載文件
// Downloads a file from HDFS to the local filesystem.
public class Get extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        // BUG FIX: propagate the exit code (0 = success), consistent with
        // the other tools in this file.
        System.exit(ToolRunner.run(new Get(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path src = new Path(conf.get("inpath"));   // HDFS source path
        Path dst = new Path(conf.get("outpath"));  // local destination path
        fs.copyToLocalFile(src, dst);
        return 0;
    }
}
- 目錄操作(增、刪、改)
// Directory operations on HDFS: create, delete, rename.
public class DirTest extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        // BUG FIX: propagate the exit code to the shell.
        System.exit(ToolRunner.run(new DirTest(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path fin = new Path(conf.get("inpath"));
        Path fout = new Path(conf.get("outpath"));
        fs.mkdirs(fin); // create the directory (and any missing parents)
        // fs.delete(fout, true);  // delete; recursive=true is required for non-empty dirs
        // fs.rename(fin, fout);   // rename/move a file or directory
        return 0;
    }
}
- 遞歸查看目錄信息(只顯示文件)
// Recursively lists every file under -Dpath=... (listFiles skips directories),
// printing per-file metadata and the datanode locations of each block.
public class Ls extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Ls(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        // second argument true = recurse into subdirectories
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path(conf.get("path")), true);
        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            System.out.println("blocksize:" + fileStatus.getBlockSize());     // HDFS block size
            System.out.println("owner:" + fileStatus.getOwner());             // owning user
            System.out.println("replication:" + fileStatus.getReplication()); // replica count
            System.out.println("permission:" + fileStatus.getPermission());   // rwx permissions
            System.out.println("name:" + fileStatus.getPath().getName());     // file name
            for (BlockLocation b : fileStatus.getBlockLocations()) {
                // BUG FIX: getNames() returns String[]; concatenating the array
                // printed its identity hash, not the names.
                System.out.println("塊的名字:" + java.util.Arrays.toString(b.getNames()));
                System.out.println("塊的偏移量:" + b.getOffset());
                System.out.println("塊的長度" + b.getLength());
                // datanodes hosting this block
                for (String dn : b.getHosts()) {
                    System.out.println("datanode:" + dn);
                }
            }
            System.out.println("=================");
        }
        return 0;
    }
}
- 遞歸查看目錄信息(顯示文件和文件夾)
// Recursively walks the tree under -Dpath=..., printing metadata for every file.
public class List extends Configured implements Tool {
    FileSystem fs;

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        fs = FileSystem.get(conf);
        FileStatus[] sts = fs.listStatus(new Path(conf.get("path")));
        Stream.of(sts).forEach(this::showDetail);
        return 0;
    }

    /**
     * If st is a file, prints its metadata; if it is a directory, recurses
     * into its children.
     * BUG FIX: the original also required st.getLen() > 0, which silently
     * hid empty files from the listing.
     */
    public void showDetail(FileStatus st) {
        if (st.isFile()) {
            show(st);
        } else if (st.isDirectory()) {
            try {
                Stream.of(fs.listStatus(st.getPath())).forEach(this::showDetail);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Prints path, permission, access time and owner of one entry. */
    public void show(FileStatus st) {
        System.out.println("--------");
        System.out.println(st.getPath());
        System.out.println(st.getPermission());
        System.out.println(st.getAccessTime());
        System.out.println(st.getOwner());
    }

    public static void main(String[] args) throws Exception {
        // Propagate the exit code, consistent with the other tools here.
        System.exit(ToolRunner.run(new List(), args));
    }
}
java-API基本操作(流)
- 上傳文件
// Uploads a file using raw streams: open the local source, create the HDFS
// destination, and pipe the bytes through.
public class Put extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Put(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        // BUG FIX: the input is a LOCAL path ("本地端"), so it must be opened
        // through the local filesystem, not the default (HDFS) one — the same
        // pattern CompressionWriteTest uses below.
        FileSystem local = FileSystem.getLocal(conf);
        FileSystem fs = FileSystem.get(conf);
        Path fin = new Path(conf.get("inpath"));    // local source
        Path fout = new Path(conf.get("outpath"));  // HDFS destination
        FSDataInputStream in = local.open(fin);
        FSDataOutputStream out = fs.create(fout);
        // 128-byte copy buffer; close=true closes both streams when done
        IOUtils.copyBytes(in, out, 128, true);
        return 0;
    }
}
- 下載文件
// Downloads a file using raw streams: open the HDFS source, create the local
// destination, and pipe the bytes through.
public class Get extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Get(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        // BUG FIX: the original referenced an undefined variable `fs2` and
        // never obtained a local filesystem; the output is a LOCAL path, so
        // it must be created through FileSystem.getLocal.
        FileSystem local = FileSystem.getLocal(conf);
        Path fin = new Path(conf.get("inpath"));    // HDFS source
        Path fout = new Path(conf.get("outpath"));  // local destination
        FSDataInputStream in = fs.open(fin);
        FSDataOutputStream out = local.create(fout);
        // close=true closes both streams when the copy finishes
        IOUtils.copyBytes(in, out, 128, true);
        return 0;
    }
}
- 查看文件內容
// Prints the content of an HDFS file (-Dinpath=...) to standard output.
public class Cat extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Cat(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path fin = new Path(conf.get("inpath"));
        FSDataInputStream in = fs.open(fin);
        try {
            // BUG FIX: close=false — the original's close=true would close
            // System.out, breaking any output the process writes afterwards.
            IOUtils.copyBytes(in, System.out, 128, false);
        } finally {
            in.close(); // close only our own stream
        }
        return 0;
    }
}
常見java-API操作
- 壓縮和解壓縮(CompressionCodec)
上傳文件並壓縮
// Uploads a local file (-Dinpath=...) to HDFS (-Doutpath=...), compressing it
// on the fly. The codec is inferred from the output path's extension
// (e.g. ".gz" -> GzipCodec).
public class CompressionWriteTest extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new CompressionWriteTest(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Path inpath = new Path(conf.get("inpath"));
        Path outpath = new Path(conf.get("outpath"));
        // Resolve the codec BEFORE opening any stream so an unrecognized
        // extension fails fast without leaking open files.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(outpath);
        if (codec == null) {
            // BUG FIX: getCodec returns null for unknown extensions; the
            // original then failed with an opaque NullPointerException.
            throw new IllegalArgumentException("No compression codec found for " + outpath);
        }
        FileSystem infs = FileSystem.getLocal(conf); // local source
        FileSystem outfs = FileSystem.get(conf);     // HDFS destination
        FSDataInputStream in = infs.open(inpath);
        FSDataOutputStream out = outfs.create(outpath);
        CompressionOutputStream cout = codec.createOutputStream(out);
        // close=true closes both ends when the copy finishes
        IOUtils.copyBytes(in, cout, 128, true);
        return 0;
    }
}
下載文件並解壓縮
// Downloads a compressed HDFS file (-Dinpath=...) and decompresses it into a
// local file (-Doutpath=...). The codec is inferred from the input extension.
public class CompressionReadTest extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new CompressionReadTest(), args));
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Path inpath = new Path(conf.get("inpath"));
        Path outpath = new Path(conf.get("outpath"));
        // Resolve the codec first so an unknown extension fails fast
        // without leaking open streams.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inpath);
        if (codec == null) {
            // BUG FIX: getCodec returns null for unknown extensions; the
            // original then failed with an opaque NullPointerException.
            throw new IllegalArgumentException("No compression codec found for " + inpath);
        }
        FileSystem infs = FileSystem.get(conf);        // HDFS source
        FileSystem outfs = FileSystem.getLocal(conf);  // local destination
        FSDataInputStream in = infs.open(inpath);
        FSDataOutputStream out = outfs.create(outpath);
        CompressionInputStream cin = codec.createInputStream(in);
        IOUtils.copyBytes(cin, out, 128, true);
        return 0;
    }
}
- SequenceFile
SequenceFile文件是Hadoop用來存儲二進制形式的key-value對而設計的一種平面文件。SequenceFile文件可支持三種壓縮類型NONE:對records不進行壓縮;RECORD:僅壓縮每一個record中的value值;BLOCK:將一個block中的所有records壓縮在一起。
FileKey.java
/**
 * SequenceFile key: a file path plus its length.
 * Ordering is by file name first, then by length.
 */
public class FileKey implements WritableComparable<FileKey> {
    private Text fileName = new Text();
    private LongWritable length = new LongWritable();

    /** Deserializes the key; field order must mirror write(). */
    @Override
    public void readFields(DataInput input) throws IOException {
        fileName.readFields(input);
        length.readFields(input);
    }

    /** Serializes the key: file name first, then length. */
    @Override
    public void write(DataOutput output) throws IOException {
        fileName.write(output);
        length.write(output);
    }

    /**
     * Compares by file name, breaking ties by length.
     * (Simplified from the original three-branch version; same ordering.)
     */
    @Override
    public int compareTo(FileKey other) {
        int byName = fileName.compareTo(other.fileName);
        return byName != 0 ? byName : length.compareTo(other.length);
    }

    // equals/hashCode added so the key behaves correctly in hash-based
    // collections and stays consistent with compareTo.
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof FileKey)) return false;
        return compareTo((FileKey) o) == 0;
    }

    @Override
    public int hashCode() {
        return 31 * fileName.hashCode() + length.hashCode();
    }

    public String getFileName() {
        return fileName.toString();
    }

    public void setFileName(String fileName) {
        this.fileName.set(fileName);
    }

    public long getLength() {
        return length.get();
    }

    public void setLength(long length) {
        this.length.set(length);
    }

    @Override
    public String toString() {
        return fileName.toString() + ":" + length.get();
    }
}
PutSeq.java
/**
 * Packs a local directory tree (-Dinput=...) into a single SequenceFile
 * (-Doutput=...). Keys are FileKey (path + length), values are the raw file
 * bytes; a sync marker is written at each directory boundary.
 */
public class PutSeq extends Configured implements Tool {

    /** Recursively appends every file under dir to the writer. */
    private void construct(File dir, Writer writer) throws IOException {
        File[] entries = dir.listFiles();
        if (entries == null) {
            // BUG FIX: listFiles() returns null for non-directories or on an
            // I/O error; the original would NPE inside Stream.of(...).forEach.
            return;
        }
        Stream.of(entries).forEach(f -> {
            try {
                if (f.isDirectory()) {
                    writer.sync(); // sync marker = start of a directory
                    construct(f, writer);
                } else {
                    byte[] content = getData(f);
                    FileKey key = new FileKey();
                    BytesWritable value = new BytesWritable();
                    key.setFileName(f.getPath());
                    key.setLength(f.length());
                    value.set(content, 0, content.length);
                    writer.append(key, value);
                }
            } catch (IOException e) {
                // Stream lambdas cannot throw checked exceptions; rethrow unchecked.
                throw new RuntimeException(e);
            }
        });
    }

    /** Reads the whole file into a byte array. */
    private byte[] getData(File file) throws IOException {
        // Replaces the original manual 1 KiB read loop with the stdlib call.
        return java.nio.file.Files.readAllBytes(file.toPath());
    }

    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = getConf();
        SequenceFile.Writer.Option op1 = Writer.file(new Path(conf.get("output")));
        SequenceFile.Writer.Option op2 = Writer.keyClass(FileKey.class);
        SequenceFile.Writer.Option op3 = Writer.valueClass(BytesWritable.class);
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, op1, op2, op3);
        try {
            construct(new File(conf.get("input")), writer);
        } finally {
            // BUG FIX: close the writer even if construct() throws,
            // so a partially-written file is not leaked.
            writer.close();
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new PutSeq(), args));
    }
}
ListSeq.java
/**
 * Prints the directory structure recorded in a SequenceFile produced by
 * PutSeq (-Dinput=...): for each record reached via a sync point it prints
 * the parent directory of the stored file path.
 */
public class ListSeq extends Configured implements Tool {
    private SequenceFile.Reader reader;

    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = getConf();
        Path input = new Path(conf.get("input"));
        SequenceFile.Reader.Option op1 = SequenceFile.Reader.file(input);
        reader = new SequenceFile.Reader(conf, op1);
        try {
            // Instantiate key/value holders of the types recorded in the file.
            // (getDeclaredConstructor().newInstance() replaces the deprecated
            // Class.newInstance().)
            Writable key = (Writable) reader.getKeyClass().getDeclaredConstructor().newInstance();
            Writable value = (Writable) reader.getValueClass().getDeclaredConstructor().newInstance();
            reader.sync(reader.getPosition());
            while (reader.next(key, value)) {
                FileKey file = (FileKey) key;
                System.out.printf("%s\n", new File(file.getFileName()).getParent());
                // Jump to the next sync marker, i.e. the next directory boundary.
                reader.sync(reader.getPosition());
            }
        } finally {
            reader.close(); // BUG FIX: the original leaked the reader
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new ListSeq(), args));
    }
}
GetSeq.java
/**
 * Downloads a SequenceFile produced by PutSeq (-Dinput=...) and re-creates
 * the original directory tree under a local root (-Doutput=...).
 */
public class GetSeq extends Configured implements Tool {
    private SequenceFile.Reader reader;

    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = getConf();
        Path input = new Path(conf.get("input"));
        File output = new File(conf.get("output"));
        SequenceFile.Reader.Option op1 = SequenceFile.Reader.file(input);
        reader = new SequenceFile.Reader(conf, op1);
        try {
            Writable key = (Writable) reader.getKeyClass().getDeclaredConstructor().newInstance();
            Writable value = (Writable) reader.getValueClass().getDeclaredConstructor().newInstance();
            while (reader.next(key, value)) {
                // getFileName() already returns a String; the original's
                // extra toString() was redundant.
                String file = ((FileKey) key).getFileName();
                save(new File(output, file), value);
            }
        } finally {
            reader.close(); // BUG FIX: the original leaked the reader
        }
        return 0;
    }

    /** Writes one record's bytes to the given file, creating parent dirs. */
    private void save(File file, Writable value) throws IOException {
        File dir = file.getParentFile();
        // BUG FIX: guard against a null parent and a failed mkdirs instead of
        // silently proceeding to an unwritable path.
        if (dir != null && !dir.exists() && !dir.mkdirs()) {
            throw new IOException("Could not create directory " + dir);
        }
        byte[] bytes = ((BytesWritable) value).copyBytes();
        // try-with-resources: the stream is closed even if write() fails
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(bytes, 0, bytes.length);
        }
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new GetSeq(), args));
    }
}
- serialization(序列化)
hadoop序列化和反序列化
/** Example of Hadoop serialization: a Writable with Writable-typed fields. */
public class Student implements Writable {
    // Fields use Hadoop's Writable wrapper types.
    private IntWritable id;
    private Text name;
    private IntWritable age;

    /**
     * No-arg constructor required for reflective instantiation during
     * deserialization.
     * BUG FIX: the original left the fields null, so readFields() threw a
     * NullPointerException on the first deserialization attempt — the very
     * path this constructor exists for. Initialize empty wrappers instead.
     */
    public Student() {
        this.id = new IntWritable();
        this.name = new Text();
        this.age = new IntWritable();
    }

    // Defensive copies: never alias the caller's mutable Writable instances.
    public Student(IntWritable id, Text name, IntWritable age) {
        this.id = new IntWritable(id.get());
        this.name = new Text(name.toString());
        this.age = new IntWritable(age.get());
    }

    public IntWritable getId() {
        return id;
    }

    public void setId(IntWritable id) {
        this.id = new IntWritable(id.get());
    }

    public Text getName() {
        return name;
    }

    public void setName(Text name) {
        this.name = new Text(name.toString());
    }

    public IntWritable getAge() {
        return age;
    }

    public void setAge(IntWritable age) {
        this.age = new IntWritable(age.get());
    }

    /** Serialization order: id, name, age — readFields must mirror it. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        id.write(dataOutput);
        name.write(dataOutput);
        age.write(dataOutput);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id.readFields(dataInput);
        name.readFields(dataInput);
        age.readFields(dataInput);
    }
}