1.編寫flume-hbase.conf
a1.sources=r1
a1.channels= hbaseC
a1.sinks= hbaseS2
a1.sources.r1.type = avro
a1.sources.r1.bind = spark1
#spark1端的flume source的端口必須與8989保持一致
a1.sources.r1.port = 8989
a1.sources.r1.threads = 5
#*********************flume+hbase**************************
a1.sources.r1.channels = hbaseC
a1.sinks.hbaseS2.channel = hbaseC
a1.channels.hbaseC.type = memory
a1.channels.hbaseC.capacity = 10000
a1.channels.hbaseC.transactionCapacity = 10000
a1.channels.hbaseC.keep-alive = 20
a1.sinks.hbaseS2.type = asynchbase
#weblogs爲hbase的表名
a1.sinks.hbaseS2.table = weblogs
#info爲hbase表的列族名,UserDfAsyncHbaseEventSerializer爲自定義類名
a1.sinks.hbaseS2.columnFamily = info
a1.sinks.hbaseS2.serializer = org.apache.flume.sink.hbase.UserDfAsyncHbaseEventSerializer
#payloadColumn爲flume監聽數據的列名,將會映射到hbase的列名
a1.sinks.hbaseS2.serializer.payloadColumn=datetime,userid,searchname,retorder,cliorder,cliurl
2.在hbase中創建表:create 'weblogs','info'
3.自定義UserDfAsyncHbaseEventSerializer類:下載flume的源碼,加載源碼中的flume-ng-hbase-sink模塊,在該包下自定義一個UserDfAsyncHbaseEventSerializer類,實現AsyncHbaseEventSerializer接口,具體方法如下:
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.flume.sink.hbase;

import com.google.common.base.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.hbase.async.AtomicIncrementRequest;
import org.hbase.async.PutRequest;

import java.util.ArrayList;
import java.util.List;

/**
 * Custom AsyncHBase event serializer for the Flume {@code asynchbase} sink.
 *
 * <p>Each event body is expected to be a single comma-separated record whose
 * field count matches the comma-separated {@code payloadColumn} list from the
 * sink configuration. Each field is written to the configured column family
 * under the corresponding column qualifier. The row key is derived from the
 * first two fields (datetime, userid) via {@link SimpleRowKeyGenerator}.
 * Malformed events (wrong field count) are dropped without writing anything.
 */
public class UserDfAsyncHbaseEventSerializer implements AsyncHbaseEventSerializer {
  // Target HBase table name (set by initialize()).
  private byte[] table;
  // Target column family (set by initialize()).
  private byte[] colFam;
  // Event currently being serialized (set by setEvent()).
  private Event currentEvent;
  // Column qualifiers parsed from the "payloadColumn" config property.
  private byte[][] columnNames;
  // Reused request buffers for batched writes to HBase.
  private final List<PutRequest> puts = new ArrayList<PutRequest>();
  private final List<AtomicIncrementRequest> incs = new ArrayList<AtomicIncrementRequest>();
  // Row key computed for the current event.
  private byte[] currentRowKey;
  // Counter column used by getIncrements() to track total received events.
  private final byte[] eventCountCol = "eventCount".getBytes(Charsets.UTF_8);

  /**
   * Reads the comma-separated column-qualifier list from the sink's
   * {@code payloadColumn} property.
   *
   * @throws FlumeException if the property is missing or empty, instead of
   *     failing later with an opaque NullPointerException.
   */
  @Override
  public void configure(Context context) {
    String cols = context.getString("payloadColumn");
    if (cols == null || cols.trim().isEmpty()) {
      throw new FlumeException("Required sink property 'payloadColumn' is missing or empty");
    }
    String[] names = cols.split(",");
    columnNames = new byte[names.length][];
    for (int i = 0; i < names.length; i++) {
      columnNames[i] = names[i].getBytes(Charsets.UTF_8);
    }
  }

  /** Releases references so the serializer can be garbage-collected cleanly. */
  @Override
  public void cleanUp() {
    table = null;
    colFam = null;
    currentEvent = null;
    columnNames = null;
    currentRowKey = null;
  }

  /**
   * Splits the current event body into fields and builds one PutRequest per
   * column. Events whose field count does not match the configured column
   * list are skipped (an empty list is returned).
   *
   * @return the put requests for the current event; empty if malformed
   * @throws FlumeException if row-key generation fails
   */
  @Override
  public List<PutRequest> getActions() {
    // Decode explicitly as UTF-8 to match the encoding used below;
    // the no-arg String(byte[]) constructor uses the platform charset.
    String eventStr = new String(currentEvent.getBody(), Charsets.UTF_8);
    String[] cols = eventStr.split(",");
    puts.clear();
    // Guard BEFORE indexing: accessing cols[0]/cols[1] unconditionally would
    // throw ArrayIndexOutOfBoundsException on short/malformed events.
    if (columnNames != null && cols.length == columnNames.length) {
      String datetime = cols[0];
      String userid = cols[1];
      // Row key = f(userid, datetime) so rows are unique per user and time.
      try {
        currentRowKey = SimpleRowKeyGenerator.getHbaseRowKey(userid, datetime);
      } catch (Exception e) {
        throw new FlumeException("Could not get row key!", e);
      }
      // One put per column: same row key, same family, per-field qualifier.
      for (int i = 0; i < cols.length; i++) {
        PutRequest putReq = new PutRequest(table, currentRowKey, colFam,
            columnNames[i], cols[i].getBytes(Charsets.UTF_8));
        puts.add(putReq);
      }
    }
    return puts;
  }

  /**
   * Increments the "totalEvents"/eventCount counter cell once per event so
   * the sink can track how many events were received.
   */
  @Override
  public List<AtomicIncrementRequest> getIncrements() {
    incs.clear();
    incs.add(new AtomicIncrementRequest(table, "totalEvents".getBytes(Charsets.UTF_8),
        colFam, eventCountCol));
    return incs;
  }

  /** Stores the destination table name and column family. */
  @Override
  public void initialize(byte[] table, byte[] colFam) {
    this.table = table;
    this.colFam = colFam;
  }

  /** Stores the event to be serialized by the next getActions() call. */
  @Override
  public void setEvent(Event event) {
    this.currentEvent = event;
  }

  /** No component-level configuration is needed for this serializer. */
  @Override
  public void configure(ComponentConfiguration conf) {
    // Intentionally empty: all configuration comes from configure(Context).
  }
}
4.打包上傳到flume的lib下。
5.啓動hbase、flume服務。
6.配置hive集成hbase方法:
a.在hive-site.xml文件中配置zookeeper,hive通過這些參數去連接hbase
<property>
<name>hbase.zookeeper.quorum</name>
<value>spark1</value>
</property>
b.將hbase的下面的依賴包拷貝到hive的lib下面,如果cdh同版本的,就不需要拷貝
hbase-server-1.2.9.jar
hbase-client-1.2.9.jar
hbase-protocol-1.2.9.jar
hbase-it-1.2.9.jar
htrace-core-3.1.0-incubating.jar
hbase-hadoop2-compat-1.2.9.jar
hbase-hadoop-compat-1.2.9.jar
c.在hive中創建表
create external table weblogs(
id string,
datetime string,
userid string,
searchname string,
retorder string,
cliorder string,
cliurl string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ("hbase.columns.mapping" = ":key,info:datetime,info:userid,info:searchname,info:retorder,info:cliorder,info:cliurl") tblproperties("hbase.table.name"="weblogs");
最後可以從hbase中加載數據到hive表中進行離線分析。