Reading HFiles with MapReduce

This post is mainly about using the HFileScanner that HBase provides.
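
Before wrapping it in an InputFormat, here is a minimal standalone sketch of reading an HFile with HFileScanner, assuming the same HBase API version as the code below; the class name HFileScanDemo and the command-line path argument are placeholders, not part of the original post.

package hadoop.hbase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.util.Bytes;

public class HFileScanDemo {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Path path = new Path(args[0]); // path to an HFile, e.g. under a region directory
		HFile.Reader reader = HFile.createReader(
				FileSystem.get(conf), path, new CacheConfig(conf), conf);
		reader.loadFileInfo();                    // must happen before seeking
		HFileScanner scanner = reader.getScanner(false, false); // no block cache, no pread
		if (scanner.seekTo()) {                   // returns false if the file is empty
			do {
				KeyValue kv = scanner.getKeyValue();
				System.out.println(Bytes.toString(kv.getRow()));
			} while (scanner.next());
		}
		reader.close();
	}
}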

The following code uses HFileScanner to implement an InputFormat:

package hadoop.hbase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.StringUtils;

/**
 * This is a direct port (hopefully) of the Scala version of this class available
 * on https://gist.github.com/1120311
 * 
 * @author yuankang
 */
public class HFileInputFormat extends FileInputFormat<ImmutableBytesWritable, KeyValue> {

	private class HFileRecordReader extends
			RecordReader<ImmutableBytesWritable, KeyValue> {

		private HFile.Reader reader;
		private final HFileScanner scanner;
		private int entryNumber = 0;
		// seekTo() in the constructor leaves the scanner positioned on the first
		// entry; these two fields make sure that entry is not skipped and that
		// an empty file yields no records.
		private boolean hasData;
		private boolean firstRowReturned = false;

		public HFileRecordReader(FileSplit split, Configuration conf)
				throws IOException {
			final Path path = split.getPath();
			reader = HFile.createReader(FileSystem.get(conf), path,
					new CacheConfig(conf), conf);
			scanner = reader.getScanner(false, false);
			reader.loadFileInfo(); // required, or seekTo() throws an NPE
			hasData = scanner.seekTo(); // position at the first entry; false if the file is empty
		}

		@Override
		public void close() throws IOException {
			if (reader != null) {
				reader.close();
			}
		}


		@Override
		public ImmutableBytesWritable getCurrentKey() throws IOException,
				InterruptedException {
			// Use the row key of the current KeyValue as the map input key.
			return new ImmutableBytesWritable(scanner.getKeyValue().getRow());
		}

		@Override
		public KeyValue getCurrentValue() throws IOException,
				InterruptedException {
			// Return a copy of the KeyValue the scanner is positioned on, so
			// later scanner movement cannot affect it.
			return new KeyValue(scanner.getKeyValue());
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			entryNumber++;
			// seekTo() already positioned the scanner on the first entry, so the
			// first call must return that entry instead of stepping past it.
			if (!firstRowReturned) {
				firstRowReturned = true;
				return hasData;
			}
			return scanner.next();
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			if (reader != null) {
				// Cast to float; integer division would report 0 until the end.
				return (float) entryNumber / reader.getEntries();
			}
			return 1;
		}

		@Override
		public void initialize(InputSplit arg0, TaskAttemptContext arg1)
				throws IOException, InterruptedException {
			// Nothing to do: all setup happens in the constructor, which
			// createRecordReader() below calls with the split and configuration.
		}

	}
	
	@Override
	protected boolean isSplitable(JobContext context, Path filename) {
		// An HFile must be read sequentially from the start, so never split it.
		return false;
	}

	@Override
	public RecordReader<ImmutableBytesWritable, KeyValue> createRecordReader(InputSplit split,
			TaskAttemptContext context) throws IOException,
			InterruptedException {
		return new HFileRecordReader((FileSplit) split,
				context.getConfiguration());
	}

}
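
To use this InputFormat, a job only needs to set it as the input format class. Here is a minimal map-only driver sketch; HFileDriver, RowKeyMapper, and the argument paths are illustrative names, not part of the original repository.

package hadoop.hbase;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HFileDriver {

	// Illustrative mapper: writes out the row key of every KeyValue.
	public static class RowKeyMapper
			extends Mapper<ImmutableBytesWritable, KeyValue, Text, NullWritable> {
		@Override
		protected void map(ImmutableBytesWritable key, KeyValue value, Context context)
				throws IOException, InterruptedException {
			String row = Bytes.toString(key.get(), key.getOffset(), key.getLength());
			context.write(new Text(row), NullWritable.get());
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "read-hfile");
		job.setJarByClass(HFileDriver.class);
		job.setInputFormatClass(HFileInputFormat.class);
		job.setMapperClass(RowKeyMapper.class);
		job.setNumReduceTasks(0); // map-only job
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));  // directory of HFiles
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}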
The full code is here: https://github.com/willwill1101/hadoop