sqoop 導入 mysql 數據到 hbase 效率不高, 並且不能實時導入數據
這裏使用 kafka-thrift
方案
原理:mysql 需要開啓 binlog,配置 maxwell 監聽 binlog,這樣 mysql 任何數據變動都會反應在 binlog 文件中,由 maxwell 解析成 json 格式,傳入消息隊列 kafka,然後通過自己寫的 python 程序作爲 kafka 消費者,將數據通過 happybase 包存儲在 hbase 中(happybase 包依賴 thrift)。這種方式可以實時將 mysql 數據導入 hbase 中。
Phoenix安裝 (便於查詢 hbase, 安裝可選)
由於HBase數據庫只提供了Java和shell兩種接口,並不支持SQL,所以誕生了Phoenix 工具
# 在 ubuntu1 上
cd /opt
wget http://mirror.bit.edu.cn/apache/phoenix/apache-phoenix-4.15.0-HBase-1.4/bin/apache-phoenix-4.15.0-HBase-1.4-bin.tar.gz
tar zvxf apache-phoenix-4.15.0-HBase-1.4-bin.tar.gz
mv apache-phoenix-4.15.0-HBase-1.4-bin phoenix-4.15.0
# 將 phoenix-5.0.0-HBase-2.0-server.jar 拷貝到所有節點的 hbase 的 lib 目錄下
cp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar /opt/hbase-1.4.12/lib/
scp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar hadoop-2:/opt/hbase-1.4.12/lib/
scp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar hadoop-3:/opt/hbase-1.4.12/lib/
# 配置環境變量
vim /etc/profile.d/hbase-1.4.12.sh
export PHOENIX_HOME=/opt/phoenix-4.15.0
export PATH=$PATH:$PHOENIX_HOME/bin
source /etc/profile
# 重啓 hbase
stop-hbase.sh
start-hbase.sh
# 啓動 phoenix
sqlline.py
輸入 !table
驗證
kafka 安裝
# 在所有節點上
cd /opt
wget https://mirrors.tuna.tsinghua.edu.cn/apache/kafka/2.4.0/kafka_2.13-2.4.0.tgz
tar zvxf kafka_2.13-2.4.0.tgz
mv kafka_2.13-2.4.0 kafka-2.4.0
# 分別修改 kafka 配置
vim kafka-2.4.0/config/server.properties
broker.id=1 # 所有節點 id 均不一樣,隨便設置一個整數即可
log.dirs=/opt/kafka-2.4.0/kafka-logs
listeners=PLAINTEXT://10.0.0.10:9092 # 改爲各節點內網ip
zookeeper.connect=hadoop-1:2181,hadoop-2:2181,hadoop-3:2181
# 啓動
cd /opt/kafka-2.4.0
./bin/kafka-server-start.sh -daemon config/server.properties
# 創建 topic, 注意根據實際消費者數目修改分區數
./bin/kafka-topics.sh --create --bootstrap-server 10.0.0.10:9092 --replication-factor 2 --partitions 4 --topic mysql2hbase
# 修改分區數
./kafka-topics.sh --bootstrap-server 10.0.0.10:9092 --alter --partitions 8 --topic mysql2hbase
# 分區數要大於等於消費者的數量,這樣多個消費者才能並行消費,如果分區數爲1,那麼只會有一個消費者在消費
# 驗證
./bin/kafka-topics.sh --list --bootstrap-server 10.0.0.10:9092
# 查看 kafka 消費情況
./kafka-consumer-groups.sh --bootstrap-server hadoop-1:9092 --describe --group testgroup
mysql 開啓 binlog
vim /etc/mysql/my.cnf
[mysqld]
server-id=1
log-bin=/data/mysql/mysql-bin
binlog_format=row
systemctl restart mysql
# 創建 maxwell 使用的賬戶並設置權限
mysql> CREATE USER 'maxwell'@'%' IDENTIFIED BY 'maxwell';
mysql> GRANT ALL ON maxwell.* TO 'maxwell'@'%';
mysql> GRANT SELECT, REPLICATION CLIENT,REPLICATION SLAVE on *.* to 'maxwell'@'%';
mysql> flush privileges;
maxwell 部署
# hadoop-1 上
cd /opt
wget https://github.com/zendesk/maxwell/releases/download/v1.24.1/maxwell-1.24.1.tar.gz
tar zvxf maxwell-1.24.1.tar.gz
cd maxwell-1.24.1
cp config.properties.example config.properties
vim config.properties
kafka.bootstrap.servers=hadoop-1:9092,hadoop-2:9092,hadoop-3:9092
# mysql login info
host=10.0.0.5
user=maxwell
password=maxwell
kafka_topic=mysql2hbase
kafka.acks=all
#######修改partition_by,解決kafka數據傾斜######
# 詳情參考: https://www.jianshu.com/p/a102577bfc8f
# zabbix 中 history_uint 表沒有主鍵,可以使用 transaction_id
kafka_partition_hash=murmur3
producer_partition_by=primary_key
# 啓動, maxwell 默認使用當前工作目錄的配置文件
./bin/maxwell --daemon
thrift 安裝
apt install -y automake bison flex g++ git libboost1.55 libevent-dev libssl-dev libtool make pkg-config
cd /opt
wget http://mirrors.hust.edu.cn/apache/thrift/0.13.0/thrift-0.13.0.tar.gz
tar zvxf thrift-0.13.0.tar.gz
cd thrift-0.13.0
./configure --with-cpp --with-boost --with-python --without-csharp --with-java --without-erlang --without-perl --with-php --without-php_extension --without-ruby --without-haskell --without-go
make && make install
# 修改 thrift 超時時間
vim /opt/hbase-1.4.12/conf/hbase-site.xml
<property>
<name>hbase.thrift.server.socket.read.timeout</name>
<value>6000000</value>
<description>eg:milisecond</description>
</property>
# 啓動
hbase-daemon.sh start thrift
測試表結構
測試表總共 342169 條數據,itemid 與 clock 爲聯合主鍵。該表數據不會新增,我們要將該表的全部數據都導入到 hbase 中。
我們將該表備份,然後刪除表(drop table 不會寫 binlog,還原表會),再還原表,這樣 binlog 就能產生記錄,maxwell 能解析並生產到 kafka 中。
如果該表數據在不斷新增,那麼這種方法只能導入新增的數據,並且下面的腳本中,要將 consumer_timeout_ms 刪掉。
mysql to hbase
# 安裝依賴
apt-get install libsnappy-dev
# yum install libsnappy-devel
# 創建 python 虛擬環境略
pip install python-snappy pykafka happybase
python 消費者
# 使用腳本前, 先配置好 mysql->maxwell->kafka 數據流, hbase 安裝並運行 thrift
# 然後將目標數據庫導入mysql, 這樣全部數據在 kafka 中, 覈對 kafka 中 lag 是否與mysql表總數一樣
# 執行該腳本, 腳本會持續消費 kafka 消息, 如果30s沒有收到新的消息, 表示已全部消費完, 則表示全部數據已導入 hbase
# 該腳本可以執行多次, 以達到多進程多消費者同時消費的目的, 也可以使用下面的多進程版本
import json
import time
from pykafka import KafkaClient
import happybase
class mysqlToHbase():
def __init__(self):
self.client = KafkaClient(
hosts="hadoop-1:9092,hadoop-2:9092,hadoop-3:9092")
self.topic = self.client.topics['mysql2hbase']
self.consumer = self.topic.get_balanced_consumer(
consumer_group='testgroup',
consumer_timeout_ms=30000, # 30s kafka 沒有 lag, 則停止程序, 如果需要實時導入數據,則去掉該參數
auto_commit_enable=True,
zookeeper_connect='hadoop-1:2181,hadoop-2:2181,hadoop-3:2181')
self.conn = happybase.Connection(host="127.0.0.1",
port=9090,
autoconnect=True)
def batchTokafka(self, mysql_table_name, mysql_table_id, include_column,
table_name, families):
start_time = time.time()
table = self.conn.table(table_name)
i = 1
with table.batch(batch_size=1024 * 1024) as bat:
# batch_size 要根據實際數據大小設置, 如果設置成 1024 * 1024 , zabbix 的 trends 表 26200 條數據才提交一次
for m in self.consumer:
database = json.loads(m.value.decode('utf-8'))["database"]
name = json.loads(m.value.decode('utf-8'))["table"]
row_data = json.loads(m.value.decode('utf-8'))["data"]
if database == 'zabbix' and name == mysql_table_name:
table_id = ''.join(
[str(row_data[i]) for i in mysql_table_id])
for id in mysql_table_id:
del row_data[id]
row_data = add_prefix(row_data, families, include_column)
bat.put(
table_id,
row_data) # put 內部檢測是否達到 batch_size, 達到則調用 send() 發送
if i % 1000 == 0:
print("===========插入了" + str(i) + "數據!============")
print("===========累計耗時:" +
str(time.time() - start_time) + "s=============")
i += 1
print('********** 總共插入: %d *************' % (i - 1))
print('********** mysql表總共: %d *************' % mysql_table_counts)
def createTable(self, table_name, families):
if bytes(table_name, 'utf-8') in self.conn.tables():
return
self.conn.create_table(table_name, families)
print("==========create table %s successful==========" % table_name)
def deletTable(self, table_name, flag):
self.conn.delete_table(table_name, flag)
print("==========delete table %s successful==========" % table_name)
def insertData(self, table_name, row, data):
table = self.conn.table(table_name)
table.put(row=row, data=data)
def getRow(self, table_name):
table = self.conn.table(table_name)
print(table.scan())
i = 0
for key, value in table.scan():
print('key: ', key, 'value: ', value)
i += 1
print("==========counts: %d ==========" % i)
def close(self):
self.conn.close()
del self.consumer
def add_prefix(row_data, prefix, include_column):
new_raw_data = {}
if '*' in include_column:
columns = row_data
else:
columns = include_column
for column in columns:
new_raw_data[prefix + ':' + column] = str(row_data.get(column))
return new_raw_data
if __name__ == '__main__':
htb = mysqlToHbase()
mysql_table_name = 'trends'
mysql_table_id = ['clock', 'itemid']
# 表中含有多個主鍵, 最終 rowkey 爲 'clock + itemid'
mysql_table_counts = 342169
include_column = ['*']
# 如果想包含全部列, 配置 include_column = ['*']; 多列: include_column = ['column1', 'column2']
table_name = 'mysql_hbase_trends'
families = 'info'
htb.createTable(table_name, {families: {}})
# htb.getRow(table_name)
htb.batchTokafka(mysql_table_name, mysql_table_id, include_column,
table_name, families)
# htb.deletTable(table_name, True)
htb.close()
測試
單進程測試,30w 條記錄大概耗時 1分鐘
342169
條數據全部導入 hbase 中,hbase 中驗證:
多進程版本
# 使用腳本前, 先配置好 mysql->maxwell->kafka 數據流, hbase 安裝並運行 thrift
# thrift 要設置超時時間 https://blog.51cto.com/13103353/2107257
# 然後將目標數據庫導入mysql, 這樣全部數據在 kafka 中, 覈對 kafka 中 lag 是否與mysql表總數一樣
# 執行該腳本, 腳本會持續消費 kafka 消息, 如果 5min 沒有收到新的消息, 表示該消費者已無法從 kafka 獲取消息, 終止相關進程
# 該腳本爲多進程版本,每個進程一個消費者,kafka 相應 topic 的分區必須大於等於消費者總數纔行
# 並且一定要確保生產者是在同時往 kafka 多個分區中同時寫數據
# 進程數不能超過 kafka 中lag 不爲0的分區數, 否則程序報錯 RuntimeError: cannot join current thread, 該報錯可忽略
import concurrent.futures
import json
from multiprocessing import cpu_count
import os
import time
from pykafka import KafkaClient
from pykafka.exceptions import ConsumerStoppedException
import happybase
class mysqlToHbase():
def __init__(self):
self.client = KafkaClient(
hosts="hadoop-1:9092,hadoop-2:9092,hadoop-3:9092")
self.topic = self.client.topics['mysql2hbase']
self.consumer = self.topic.get_balanced_consumer(
consumer_group='testgroup',
# 5min kafka 沒有 lag, 則停止程序, 數據量小於百萬, 可以設置爲 30s, 數據量越大,建議設置長一點, 但不能超過 thrift 超時時間
consumer_timeout_ms=5 * 60 * 1000,
auto_commit_enable=True,
zookeeper_connect='hadoop-1:2181,hadoop-2:2181,hadoop-3:2181')
self.conn = happybase.Connection(host="127.0.0.1",
port=9090,
autoconnect=True)
def batchTokafka(self, mysql_table_name, mysql_table_id, include_column,
table_name, families, batch_size):
start_time = time.time()
table = self.conn.table(table_name)
i = 1
with table.batch() as bat:
try:
for m in self.consumer:
database = json.loads(m.value.decode('utf-8'))["database"]
name = json.loads(m.value.decode('utf-8'))["table"]
row_data = json.loads(m.value.decode('utf-8'))["data"]
if database == 'zabbix' and name == mysql_table_name:
table_id = ''.join(
[str(row_data[i]) for i in mysql_table_id])
for id in mysql_table_id:
del row_data[id]
row_data = add_prefix(row_data, families,
include_column)
bat.put(table_id, row_data)
if i % batch_size == 0:
bat.send()
print(
"[%d]: " % os.getpid(),
"===========插入了" + str(i) + "數據!============")
print(
"[%d]: " % os.getpid(), "===========累計耗時:" +
str(time.time() - start_time) +
"s=============")
i += 1
except ConsumerStoppedException:
pass
print("[%d]: " % os.getpid(),
'********** 總共插入: %d *************' % (i - 1))
def createTable(self, table_name, families):
if bytes(table_name, 'utf-8') in self.conn.tables():
return
self.conn.create_table(table_name, families)
print("==========create table %s successful==========" % table_name)
def deletTable(self, table_name, flag):
self.conn.delete_table(table_name, flag)
print("==========delete table %s successful==========" % table_name)
def insertData(self, table_name, row, data):
table = self.conn.table(table_name)
table.put(row=row, data=data)
def getRow(self, table_name):
table = self.conn.table(table_name)
print(table.scan())
i = 0
for key, value in table.scan():
print('key: ', key, 'value: ', value)
i += 1
print("==========counts: %d ==========" % i)
def close(self):
self.conn.close()
del self.consumer
def add_prefix(row_data, prefix, include_column):
new_raw_data = {}
if '*' in include_column:
columns = row_data
else:
columns = include_column
for column in columns:
new_raw_data[prefix + ':' + column] = str(row_data.get(column))
return new_raw_data
def execute(mysql_table_name, mysql_table_id, include_column, table_name,
families, batch_size):
htb = mysqlToHbase()
htb.batchTokafka(mysql_table_name, mysql_table_id, include_column,
table_name, families, batch_size)
htb.close()
if __name__ == '__main__':
cpus = cpu_count()
mysql_table_name = 'trends'
# 表中含有多個主鍵, 最終 rowkey 爲 'clock + itemid'
mysql_table_id = ['clock', 'itemid']
mysql_table_counts = 342169
# 多少條記錄批量提交一次
batch_size = 10000
# 如果想包含全部列, 配置 include_column = ['*']; 多列: include_column = ['column1', 'column2']
include_column = ['*']
table_name = 'mysql_hbase_trends'
families = 'info'
# 創建 hbase 表
htb = mysqlToHbase()
htb.createTable(table_name, {families: {}})
htb.close()
with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor:
submits = [
executor.submit(execute, mysql_table_name, mysql_table_id,
include_column, table_name, families, batch_size)
for i in range(cpus)
]
for future in concurrent.futures.as_completed(submits):
future.result()
print('********** mysql表總共: %d *************' % mysql_table_counts)
8核服務器運行,耗時大概10s