1.腳本
get_hdfs_ha_state.sh
#!/bin/bash
#
# get_hdfs_ha_state.sh
#
# Prints host name, service id and HA state of the active and the standby
# HDFS NameNode, stores both lines in HDFS_HA.log and mails a warning when the
# active NameNode host differs from the one stored by the previous run in
# HDFS_HA_LAST.log. Both files are resolved against the current working
# directory, so the script has to be started from the same directory each time.
#
# Exit status: 1 when the HA configuration cannot be read through the hdfs
# command (no mail is sent and the stored history is left untouched in that
# case); otherwise the status of the final history update.
#
# `set -e` is deliberately not used: every command whose failure matters is
# checked explicitly below.

# Filled in by collect_namenode_states: NN1_* describe the active NameNode,
# NN2_* the standby NameNode.
NN1_HOSTNAME=""
NN2_HOSTNAME=""
NN1_SERVICEID=""
NN2_SERVICEID=""
NN1_SERVICESTATE=""
NN2_SERVICESTATE=""
# Mailbox that receives the failover warning.
# NOTE(review): the value looks like a redacted placeholder - confirm the real
# recipient address. It is quoted because it contains glob characters.
EMAIL='[email protected]'
#CDH_BIN_HOME=/opt/cloudera/parcels/CDH/bin
# Directory that contains the hadoop binaries.
CDH_BIN_HOME=/home/hadoop/app/hadoop/bin

#######################################
# Runs the hdfs command line tool located in CDH_BIN_HOME.
# Globals:   CDH_BIN_HOME (read)
# Arguments: passed through to hdfs unchanged
# Outputs:   whatever hdfs prints
# Returns:   exit status of hdfs
#######################################
hdfs_cmd() {
  "${CDH_BIN_HOME}/hdfs" "$@"
}

#######################################
# Queries the nameservice, its NameNode ids and the HA state of every id.
# Globals:   NN1_*, NN2_* (written), CDH_BIN_HOME (read, for messages)
# Outputs:   one line on stdout per NameNode whose state is neither
#            "active" nor "standby"; errors on stderr
# Returns:   1 when the nameservice or the NameNode id list cannot be read
#######################################
collect_namenode_states() {
  local ha_name namenode_serviceids node state rpc_address
  local -a nodes=()

  NN1_HOSTNAME="" NN1_SERVICEID="" NN1_SERVICESTATE=""
  NN2_HOSTNAME="" NN2_SERVICEID="" NN2_SERVICESTATE=""

  # Read the nameservice. A failing or empty lookup means the tool or the
  # configuration is broken; that must not be mistaken for a failover.
  if ! ha_name=$(hdfs_cmd getconf -confKey dfs.nameservices) \
      || [[ -z "${ha_name}" ]]; then
    echo "ERROR: cannot read dfs.nameservices via ${CDH_BIN_HOME}/hdfs" >&2
    return 1
  fi

  # Read the NameNode ids of that nameservice.
  if ! namenode_serviceids=$(hdfs_cmd getconf -confKey "dfs.ha.namenodes.${ha_name}") \
      || [[ -z "${namenode_serviceids}" ]]; then
    echo "ERROR: cannot read dfs.ha.namenodes.${ha_name} via ${CDH_BIN_HOME}/hdfs" >&2
    return 1
  fi

  # Split the id list on commas (and blanks) without globbing.
  IFS=', ' read -r -a nodes <<< "${namenode_serviceids}"

  for node in "${nodes[@]}"; do
    [[ -n "${node}" ]] || continue
    state=$(hdfs_cmd haadmin -getServiceState "${node}")
    case "${state}" in
      active)
        NN1_SERVICEID="${node}"
        NN1_SERVICESTATE="${state}"
        rpc_address=$(hdfs_cmd getconf -confKey "dfs.namenode.rpc-address.${ha_name}.${node}")
        # Keep the host part in front of the first ':'.
        NN1_HOSTNAME="${rpc_address%%:*}"
        ;;
      standby)
        # With more than one standby NameNode the last one listed wins.
        NN2_SERVICEID="${node}"
        NN2_SERVICESTATE="${state}"
        rpc_address=$(hdfs_cmd getconf -confKey "dfs.namenode.rpc-address.${ha_name}.${node}")
        NN2_HOSTNAME="${rpc_address%%:*}"
        ;;
      *)
        # Message text (including its spelling) is kept as it has always been
        # printed.
        echo "hdfs haadmin -getServiceState ${node}: unkown"
        ;;
    esac
  done
}

#######################################
# Prints the collected state table and saves the two data lines.
# Globals:   NN1_*, NN2_* (read)
# Outputs:   table on stdout; HDFS_HA.log is overwritten
#######################################
report_and_save_states() {
  local active_line="${NN1_HOSTNAME} ${NN1_SERVICEID} ${NN1_SERVICESTATE}"
  local standby_line="${NN2_HOSTNAME} ${NN2_SERVICEID} ${NN2_SERVICESTATE}"

  echo " "
  echo "Hostname Namenode_Serviceid Namenode_State"
  echo "${active_line}"
  echo "${standby_line}"

  # Save the current NN1/NN2 state.
  {
    echo "${active_line}"
    echo "${standby_line}"
  } > HDFS_HA.log
}

#######################################
# Mails a warning when the active host stored by the previous run differs
# from the current one. Does nothing when no previous run was recorded.
# Globals:   NN1_HOSTNAME, EMAIL (read)
# Outputs:   "send a mail" on stdout when a mail is sent
#######################################
alert_on_failover() {
  local history_hostname

  # Without a previous record there is nothing to compare against.
  [[ -f HDFS_HA_LAST.log ]] || return 0

  # First column of the first line = active host of the previous run.
  history_hostname=$(awk 'NR == 1 { print $1; exit }' HDFS_HA_LAST.log)

  if [[ "${history_hostname}" != "${NN1_HOSTNAME}" ]]; then
    echo "send a mail"
    printf '%s : Please to check namenode log.\n' "$(date "+%Y-%m-%d %H:%M:%S")" \
      | mail \
        -r "From: alertAdmin <[email protected]>" \
        -s "Warn: CDH HDFS HA Failover!." "${EMAIL}"
  fi
}

#######################################
# Entry point: collect, report, alert, then remember the current state for
# the next run.
# Returns:   1 when the HA configuration cannot be read
#######################################
main() {
  collect_namenode_states || return 1
  report_and_save_states
  alert_on_failover
  cat HDFS_HA.log > HDFS_HA_LAST.log
}

main "$@"
2.思路
讀取當前active的namenode節點,並將其主機名、service id和狀態保存到HDFS_HA.log,同時讀取上一次運行留下的記錄HDFS_HA_LAST.log,對比active節點的主機名是否發生變化,若變化了(如active從nn1變成了nn2),說明發生了failover,立即發送郵件預警;最後用本次的HDFS_HA.log覆蓋HDFS_HA_LAST.log,供下一次運行對比。
3.Centos發送郵件配置
可參考我之前的博文: CentOS 通過465端口發送QQ郵件
4.執行腳本
執行腳本
[hadoop@ruozedata001 hadoop]$ ./get_hdfs_ha_state.sh
# 此時hdfs ha運行正常沒有產生failover,所以僅僅是打印了集羣hdfs ha狀態
Hostname Namenode_Serviceid Namenode_State
ruozedata002 nn2 active
ruozedata001 nn1 standby
模擬failover
[hadoop@ruozedata001 ~]$ hdfs haadmin -failover nn2 nn1
Failover to NameNode at ruozedata001/172.24.102.253:8020 successful
再執行腳本
[hadoop@ruozedata001 hadoop]$ ./get_hdfs_ha_state.sh
# 檢測到failover,成功發送郵件
Hostname Namenode_Serviceid Namenode_State
ruozedata001 nn1 active
ruozedata002 nn2 standby
send a mail
效果