```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Copyright (C), 2015
 *
 * @author
 * @version 0.0.1
 * @desc Imports Hive data into HBase. Register with:
 *       CREATE TEMPORARY FUNCTION hive2HBase as 'xxx.etl.hive2hbase.UDFHbaseMerge';
 * @date 2/23/16
 */
@Description(name = "hive2HBase",
        value = "FUNC(zookeeperQuorum, hbaseTable, CF, rowKey, columnNames, v1, v2, ...) - reads data from Hive and "
                + "overwrites the same rows in HBase, returns success of the import.",
        extended = "The first argument is the ZooKeeper quorum, "
                + "the second argument is the HBase table, "
                + "the third argument is the column family, "
                + "the fourth argument is the row key, "
                + "the fifth argument is the comma-separated list of column names, "
                + "and the remaining arguments are the corresponding column values. "
                + "Example: select FUNC('zookeeperQuorum', 'tableName', 'columnFamily', key, 'columnName1,columnName2', columnName1Value, columnName2Value) from dual;")
@UDFType(deterministic = false)
public class UDFHbaseMerge extends GenericUDF {

    private static final Logger logger = LoggerFactory.getLogger(UDFHbaseMerge.class);

    // ObjectInspectors for the input arguments
    protected transient ObjectInspector[] argumentOI;
    protected transient String hbaseTable;
    protected BufferedMutator mutator;
    protected Connection connection;
    protected static String cf = "F";
    protected static String[] cols;
    protected final static String NULL_FLAG = "";
    protected final Text result = new Text();
    protected String zookeeperQuorum;

    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        argumentOI = objectInspectors;
        // The first three arguments (quorum, table, column family) must be strings; otherwise throw.
        for (int i = 0; i < 3; i++) {
            if (objectInspectors[i].getCategory() == ObjectInspector.Category.PRIMITIVE) {
                PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objectInspectors[i];
                if (poi.getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
                    throw new UDFArgumentTypeException(i, "The argument of function should be \""
                            + serdeConstants.STRING_TYPE_NAME + "\", but \""
                            + objectInspectors[i].getTypeName() + "\" is found");
                }
            }
        }
        // The fourth and later arguments (row key, column names, values) must be primitive types; otherwise throw.
        for (int i = 3; i < objectInspectors.length; i++) {
            if (objectInspectors[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentTypeException(i, "The argument of function should be primitive"
                        + ", but \"" + objectInspectors[i].getTypeName() + "\" is found");
            }
        }
        // The UDF returns a string.
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        try {
            // Lazily create the HBase connection and BufferedMutator on the first row.
            if (mutator == null) {
                zookeeperQuorum = getDeferredObject(deferredObjects, 0);
                hbaseTable = getDeferredObject(deferredObjects, 1);
                cf = getDeferredObject(deferredObjects, 2);
                cols = getDeferredObject(deferredObjects, 4).split(",");
                Configuration conf = HBaseConfiguration.create();
                conf.set("hbase.zookeeper.quorum", zookeeperQuorum);
                conf.set("hbase.zookeeper.property.clientPort", "2181");
                conf.set("mapred.task.timeout", "3600000");
                conf.set("dfs.socket.timeout", "3600000");
                conf.set("dfs.datanode.socket.write.timeout", "3600000");
                connection = ConnectionFactory.createConnection(conf);
                mutator = connection.getBufferedMutator(TableName.valueOf(hbaseTable));
            }
            // Buffer one Put per input row; the mutator batches the actual RPCs.
            Put put = getPut(deferredObjects);
            try {
                mutator.mutate(put);
            } catch (IOException e) {
                logger.error(Bytes.toString(mutator.getName().getName()) + " put error " + e.getMessage());
            }
            result.set("success");
        } catch (Exception ex) {
            logger.error("evaluate error", ex);
            result.set(ex.toString());
            this.close();
        }
        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        StringBuilder sb = new StringBuilder();
        sb.append("hive2HBase(");
        if (children.length > 0) {
            sb.append(children[0]);
            for (int i = 1; i < children.length; i++) {
                sb.append(",");
                sb.append(children[i]);
            }
        }
        sb.append(")");
        return sb.toString();
    }

    protected String getDeferredObject(DeferredObject[] arguments, int index) throws HiveException {
        if (arguments[index].get() == null) {
            return NULL_FLAG;
        }
        return ((PrimitiveObjectInspector) argumentOI[index]).getPrimitiveJavaObject(arguments[index].get()).toString();
    }

    protected Put getPut(DeferredObject[] arguments) throws Exception {
        String rowKey = getDeferredObject(arguments, 3);
        Put put = new Put(Bytes.toBytes(rowKey));
        for (int i = 0; i < cols.length; i++) {
            put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cols[i]), Bytes.toBytes(getDeferredObject(arguments, i + 5)));
        }
        return put;
    }

    @Override
    public void close() {
        try {
            super.close();
            if (mutator != null) {
                // Flush any buffered mutations before shutting down.
                mutator.flush();
                mutator.close();
                connection.close();
            }
        } catch (Exception e) {
            logger.error(Bytes.toString(mutator.getName().getName()) + " close error " + e.getMessage());
        }
    }

    @Override
    public String[] getRequiredFiles() {
        return super.getRequiredFiles();
    }
}
```

The UDF extends GenericUDF and implements initialize, evaluate, and the other required methods; each call to evaluate builds a Put for the current row and hands it to the BufferedMutator, which batches the writes to HBase.

It is used as follows:

```sql
set mapred.reduce.tasks=20;
add jar /home/xxx/xxx/hive2hbase/xxx-etl-0.0.1.jar;
CREATE TEMPORARY FUNCTION hive2HBase as 'xxx.etl.hive2hbase.UDFHbaseMerge';
```

Create the target table in HBase.
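A minimal sketch of creating that table with the HBase Admin API, assuming the table name `mobile_nature` and column family `n` used by the example query further below; the ZooKeeper quorum string is a placeholder. The same table can also be created from the hbase shell with `create 'mobile_nature', 'n'`.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateTargetTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "zookeeperQuorum"); // placeholder: same quorum passed to the UDF
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("mobile_nature"));
            desc.addFamily(new HColumnDescriptor("n")); // column family used by the example query
            if (!admin.tableExists(desc.getTableName())) {
                admin.createTable(desc);
            }
        }
    }
}
```

With the table in place, run the import from Hive: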
```sql
drop table if exists hive2hbase_tmp;
create table hive2hbase_tmp as select hive2HBase('zookeeperQuorum', 'mobile_nature', 'n', mobile, 'n1,n2', 11_, 58_) from mobile_nature;
drop table if exists hive2hbase_tmp;
```
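Once the job finishes, the import can be spot-checked with a plain Get. This is only a sketch: the row key below is a hypothetical mobile number assumed to exist in the source table, and it reads back the two qualifiers written by the example query.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyImport {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "zookeeperQuorum"); // placeholder quorum
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("mobile_nature"))) {
            Get get = new Get(Bytes.toBytes("13800000000")); // hypothetical mobile number used as row key
            Result result = table.get(get);
            // Read back the two qualifiers written by the example query.
            System.out.println("n1 = " + Bytes.toString(result.getValue(Bytes.toBytes("n"), Bytes.toBytes("n1"))));
            System.out.println("n2 = " + Bytes.toString(result.getValue(Bytes.toBytes("n"), Bytes.toBytes("n2"))));
        }
    }
}
```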
A comparison of the old and the new API:

| | Old API | New API |
| --- | --- | --- |
| Read requests | 10k-20k/s | 3k-4k/s |
| Write requests | 4k/s | 40k/s |
| Elapsed time | 40 min | 4 min |
| Impact on the cluster | Introduces some risk to the cluster | No impact on the cluster |
The results show that the new batch API should be used; with it, our read requests do not even reach 3k/s.
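For reference, the buffered write path that the UDF relies on (presumably the "new" batch API in the table above) looks roughly like this in isolation. Table name, column family, row key, and value are illustrative placeholders, not values from the benchmark.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class BufferedWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "zookeeperQuorum"); // placeholder quorum
        try (Connection connection = ConnectionFactory.createConnection(conf);
             // A BufferedMutatorParams can be passed instead to tune the client write buffer size.
             BufferedMutator mutator = connection.getBufferedMutator(TableName.valueOf("mobile_nature"))) {
            Put put = new Put(Bytes.toBytes("13800000000")); // hypothetical row key (a mobile number)
            put.addColumn(Bytes.toBytes("n"), Bytes.toBytes("n1"), Bytes.toBytes("someValue"));
            mutator.mutate(put); // buffered on the client, sent to HBase in batches
            mutator.flush();     // push any remaining buffered mutations (close() also flushes)
        }
    }
}
```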
Reference: http://blog.csdn.net/zzuiezhangqihui/article/details/47259465