上篇文章發了salt-minion的監控代碼 http://6252961.blog.51cto.com/6242961/1710977 。監控跑完並列出所有有問題的客戶端之後,如果手動一個個去修復會很費事,而且服務器數量太多,所以寫了這個自動修復的代碼,解放雙手。
代碼邏輯:
1、首先從數據庫讀取minion端有問題的服務器,如果數量超過100,則停止自動修復,沒有則繼續(這個沒有在代碼中實現,不過也很簡單,只需要判斷一下列表長度即可)
2、檢測服務器的ping,如果ping通,繼續,否則保存錯誤信息,停止自動修復
3、檢測服務器的ssh登陸狀態,如果可以登錄並命令‘date’執行成功,繼續,否則保存錯誤信息,停止自動修復
4、檢查服務器的nfs掛載狀態,如果掛載異常,先卸載nfs,再繼續執行(因爲服務器幾千臺,會經常出現服務器掛載的nfs的ip不通的問題,造成yum在執行的過程中卡死,無法完成任務也無法退出任務,具體原因沒有細究),如果nfs掛載正常,則繼續下一步,如果卸載失敗,則停止修復
5、對服務器yum進行修復,就是初始化yum的過程,初始化完之後執行yum list| grep salt如果執行成功,則繼續,否則保存錯誤信息,停止自動修復
6、卸載服務器原有salt-minion客戶端,卸載之後檢查有沒有卸載成功,如果成功,則繼續,否則保存錯誤信息,停止自動修復
7、重新安裝最新salt-minion客戶端,檢查有沒有安裝成功,如果成功,則繼續,否則保存錯誤信息,停止自動修復
8、啓動salt-minion客戶端,檢查啓動狀態,如果成功,則繼續,否則保存錯誤信息,停止自動修復
9、登陸master端執行簡單命令,確認master與修復後的minion通信是否成功;如果成功,則更新數據庫中該服務器的最新記錄,如果失敗,則把對應的報錯信息更新到該記錄中(入庫部分在下面的代碼中尚未實現,目前只打印結果)
注:
代碼中很多地方通過公司的通道機執行遠程命令(函數run_cmd),通道機返回的是json格式的數據,如:
{"RETURN":"{\"sub_task_id\":\"******\",\"ip\":\"10.75.4.43\",\"user\":\"****\",\"result\":\"10.75.19.1**\\n\"}"}
代碼:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Automatically repair broken salt-minion clients (Python 2 script).

For every host that the monitoring table reports as broken, the script runs
these steps in order and stops at the first one that fails:

1. ping the host
2. check that an SSH login works
3. check the NFS mounts and lazily unmount them when a server is unreachable
4. re-initialise yum
5. remove the installed salt packages
6. install salt / salt-minion again
7. restart salt-minion
8. ask the master to run a command on the minion

All step functions return 0 on success and 1 on failure unless their
docstring says otherwise.
"""
__author__ = 'mujibin'

# import python lib
import random
import urllib
import datetime
import time
import MySQLdb
import os
import re
import urllib2
import json
import string
import subprocess
import sys
import paramiko

# add path
sys.path.append("/data1/salt/mysqlapi/salt/")

# import salt repair functions
from multiprocessing import *
import logging
from salt_minion_list import *
from init_server import *
from check_salt import *
#from check_salt_bak import *
from salt_repair_ssh import *

reload(sys)
sys.setdefaultencoding('utf8')

H3303='*****.cn'
H3304m='******.cn'
P3303=3303
P3304=3304
dp_admin='dp_admin'
HOST_PORT='3303'
HOST_USER = 'mysqlha'
HOST_PASSED = '********'
db='test'
port='*******'
c_date = time.strftime("%Y%m%d",time.localtime())
c_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())

# File logging setup, intentionally left disabled (kept as a string literal).
'''
log_path = "/data1/dbatemp/salt/logs"
is_path=os.path.exists(log_path)
if not is_path:
    os.makedirs(log_path)
log_name = "salt_reparie.log"
logger = logging.getLogger()
handler = logging.FileHandler(os.path.join(log_path,log_name))
formater = logging.Formatter("%(asctime)s %(levelname)s [%(funcName)s :%(lineno)d] %(message)s")
handler.setFormatter(formater)
logger.addHandler(handler)
logger.setLevel(logging.NOTSET)
#logger.setLevel(logging.INFO)
#logger.setLevel(logging.DEBUG)
#logger.setLevel(logging.ERROR)
'''
##########################################################
salt_yes = datetime.date.today()
##########################################################
# ssh api arguments
method = "sync"
output = "json"
ignore_error = "true"
timeout = "28"
##########################################################
slat_minion_check_CONSTANT="salt-minion"
##########################################################
SALT = "salt"
VERSION = "5.3"
# Port the SSH daemons listen on (shared by every SSH helper below).
SSH_PORT = 26387


###########################################################
# master dns transform to ip
###########################################################
def getIp(domain):
    """Resolve *domain* and return the first address getaddrinfo reports."""
    import socket
    return socket.getaddrinfo(domain, 'http')[0][4][0]


MASTERDNS= "******.cn"
MASTERIP = getIp(MASTERDNS)
##########################################################


def ssh_connect_bak(host):
    """Open an SSH connection as root using the key in /root/.ssh/id_rsa.

    Returns the connected paramiko client; the caller must close it.
    """
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    privatekeyfile = os.path.expanduser('/root/.ssh/id_rsa')
    mykey = paramiko.RSAKey.from_private_key_file(privatekeyfile)
    client.connect(host.strip(), SSH_PORT, username='root', timeout=2, pkey=mykey)
    return client


def ssh_connect(host):
    """Open an SSH connection as root and return the paramiko client.

    The caller must close the returned client.
    """
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(host.strip(), SSH_PORT, username='root', timeout=10)
    return client


def ssh_cmd(host, cmd):
    """Run *cmd* on *host* over SSH.

    Returns the stripped standard output, or None when the connection or
    the command execution raised.
    """
    client = None
    try:
        client = ssh_connect(host)
        _stdin, stdout, _stderr = client.exec_command(cmd)
        return stdout.read().strip()
    except Exception:
        logging.warning("The host:%s and cmd:%s execute exception.", host, cmd)
        return None
    finally:
        # Always release the connection; the original never closed it.
        if client is not None:
            client.close()


def ssh_cmd_check(host, cmd1):
    """Check whether an SSH login to *host* works by running *cmd1*.

    Returns 0 when connecting and running the command raised nothing,
    1 otherwise. The command's exit status is not inspected.
    """
    client = None
    try:
        client = paramiko.SSHClient()
        client.load_system_host_keys()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(host.strip(), SSH_PORT, username='root', timeout=20)
        _stdin, stdout, _stderr = client.exec_command(cmd1)
        stdout.read()
        return 0
    except Exception:
        return 1
    finally:
        if client is not None:
            client.close()


def run_cmd(ips, cmd, method, output, ignore_error, timeout):
    """Run *cmd* on *ips* through the company channel machine (HTTP API).

    Returns the raw response body (JSON text when output is "json"), or
    None when the HTTP call failed.
    """
    _user_='***'
    _key_='*****'
    url='*****p.php'
    argument = {
        'user': _user_,
        'method': method,
        'output': output,
        'ignore_error': ignore_error,
        'key': _key_,
        'timeout': timeout,
        'ip': ips,
        'cmd': cmd,
    }
    try:
        data = urllib.urlencode(argument)
        response = urllib2.urlopen(url, data)
        try:
            return response.read()
        finally:
            response.close()
    except Exception:
        # The original fell through to `response.read()` with `response`
        # unbound; report the failure explicitly instead.
        logging.exception("Call the api function error!")
        return None


def _channel_result_lines(raw):
    """Return the non-empty lines of the 'result' field of a channel reply.

    The reply is a JSON object whose "RETURN" value is itself a JSON string
    containing "result". Raises on a missing or malformed reply.
    """
    envelope = json.loads(raw)
    result = json.loads(envelope["RETURN"])['result']
    return [line.strip() for line in result.split('\n') if line.strip()]


def select_in_3303(sql, host, user, port, passwd, db):
    """Run a query and return the first column of every row as strings.

    Returns an empty list when connecting or querying fails.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=host, user=user, port=port, passwd=passwd,
                               db=db, connect_timeout=5, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        return [str(row[0]) for row in cursor.fetchall()]
    except Exception:
        logging.exception("select_in_3303 failed")
        return []
    finally:
        if conn is not None:
            conn.close()


# Insert helper; not used yet because the write-back functions are not written.
def sql_insert(sql, port=3304, domain='*****', db='*****'):
    """Execute *sql* and commit; roll back when execution fails.

    NOTE: as in the original, the connection always uses the 'swordfish'
    schema; the *db* parameter is accepted but not used.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=domain, user=HOST_USER, port=port,
                               passwd=HOST_PASSED, db='swordfish',
                               connect_timeout=3, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    except Exception:
        logging.exception("sql_insert failed")
        # Only roll back when a connection was actually established.
        if conn is not None:
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()


def fix_list_salt():
    """Return the IPs whose latest monitoring record is not 'ok'.

    Returns an empty list when the query fails.
    """
    saltsql="select ip_in from salt_mon where salt_mon_value != 'ok' and salt_mon_info not like '%None%' and ctime = (select ctime from salt_mon order by ctime desc limit 1);"
    try:
        return select_in_3303(sql=saltsql, host=H3304m, user=HOST_USER,
                              port=P3304, passwd=HOST_PASSED, db='swordfish')
    except Exception as e:
        print(e)
        return []


def salt_exist_check(host):
    """Report whether any salt RPM is installed on *host*.

    Returns 0 when no salt package is installed, 1 when at least one is
    (the process may still not be running), and None when the package
    count could not be read.
    """
    versionCmd = "rpm -qa | grep salt | wc -l"
    try:
        count = int(ssh_cmd(host, versionCmd))
    except (TypeError, ValueError):
        logging.warning("The function salt_exist_check execute failed with host:%s", host)
        return None
    return 0 if count == 0 else 1


def salt_minion_restart(host):
    """Remove the cached master public key on *host* and restart salt-minion.

    Removing the minion's key on the master (salt_remove_key) is disabled
    here. Returns 0 when a salt-minion process is running afterwards,
    1 otherwise.
    """
    try:
        #salt_remove_key(host)
        Cmd1 = """sudo rm -f /etc/salt/pki/minion/minion_master.pub"""
        Cmd2 = """sudo /etc/init.d/salt-minion restart"""
        run_cmd(host, Cmd1, method="sync", output="text", ignore_error="true", timeout=10)
        time.sleep(5)
        run_cmd(host, Cmd2, method="sync", output="text", ignore_error="true", timeout=10)
        time.sleep(5)
        return 0 if salt_check(host) == 0 else 1
    except Exception:
        logging.exception("The host:%s restart minion failed!", host)
        return 1


def remove_salt_minion(host):
    """Stop salt-minion and remove every installed salt package with yum.

    Returns 0 when no salt package is left afterwards, 1 otherwise.
    """
    try:
        versionCmd = "sudo rpm -qa | grep salt| grep -v grep"
        versionRes = run_cmd(host, versionCmd, method="sync", output="json",
                             ignore_error="true", timeout=10)
        saltList = _channel_result_lines(versionRes)
        ssh_cmd(host, '/etc/init.d/salt-minion stop > /dev/null 2>&1 ')
        for package in saltList:
            rmCmd = "sudo yum remove -y %s > /dev/null 2>&1 " % package
            rmRes = ssh_cmd(host, rmCmd)
            time.sleep(4)
            print(rmRes)
        res = 0 if salt_exist_check(host) == 0 else 1
        print('res:%s' % res)
        return res
    except Exception:
        logging.exception("The function remove_salt_minion_qa execute failed with host:%s", host)
        return 1


def yum_check(host):
    """Check that yum lists a salt package on *host*.

    Returns 0 when `yum list` shows at least one matching line, 1 otherwise.
    NOTE: the command filters on the literal "2015", as in the original.
    """
    checkCmd = "sudo yum list | grep salt | grep 2015 | wc -l"
    try:
        # ssh_cmd returns text; the original compared it with the int 0,
        # which made the check always succeed.
        matches = int(ssh_cmd(host, checkCmd))
    except (TypeError, ValueError):
        logging.warning("The host:%s check the yum error!", host)
        return 1
    return 0 if matches > 0 else 1


def yum_repaire(host):
    """Re-initialise yum on *host* and verify it with yum_check.

    Returns 0 when yum_check succeeds afterwards, 1 otherwise.
    """
    yumCmd1 = (
        " ([ `ps -ef | grep yum | grep -v grep | wc -l` -ne 0 ] "
        "&& sudo ps -ef | grep '/usr/bin/yum' | grep -v grep "
        "| awk '{print $2}' | xargs kill -9 || echo '') "
        "&& (cd /var/lib/rpm/ && sudo rm -f __db.00*) "
        "&& (sudo rpm --rebuilddb) "
        "&& (sudo yum clean all) "
        "&& (sudo chattr -i /etc/yum.conf) "
        "&& (sudo echo 'include=http://****/conf/yumconf.php' > /etc/yum.conf) "
        "&& (sudo rm -rf /etc/yum.repos.d/*) "
        "&& (sudo yum -y remove ****dbp > /dev/null 2>&1) "
        "&& (sudo yum -y install ****dbp > /dev/null 2>&1) "
    )
    try:
        ssh_cmd(host, yumCmd1)
        time.sleep(60)
        return 0 if yum_check(host) == 0 else 1
    except Exception:
        logging.exception("The host:%s try to repaire yum failed!", host)
        return 1


def salt_check(host):
    """Check whether a salt-minion process is running on *host*.

    Returns 0 when at least one process is found, 1 otherwise.
    """
    checkCmd = "ps -ef | grep salt-minion | grep -v grep | wc -l"
    try:
        # Same string-vs-int fix as in yum_check.
        processes = int(ssh_cmd(host, checkCmd))
    except (TypeError, ValueError):
        logging.warning("The host:%s salt check error!", host)
        return 1
    return 0 if processes > 0 else 1


def install_salt_minion(host):
    """Install salt and salt-minion on *host* with yum.

    Returns 0 when a salt package is installed afterwards, 1 otherwise.
    """
    inSaltCmd = (
        "([ `ps -ef | grep yum | grep -v grep | wc -l` -ne 0 ] "
        "&& sudo ps -ef | grep '/usr/bin/yum' | grep -v grep "
        "| awk '{print $2}' | xargs kill -9 || echo '') "
        "&& (sudo yum clean all) "
        "&& (sudo yum -y install salt.noarch salt-minion.noarch)"
    )
    try:
        ssh_cmd(host, inSaltCmd)
        time.sleep(20)
        return 0 if salt_exist_check(host) == 1 else 1
    except Exception:
        logging.exception("The host:%s install minion failed!", host)
        return 1


def ping_mon_by_host(host):
    """Ping *host* once from this machine.

    Returns 0 when ping exits successfully, 1 otherwise.
    """
    try:
        # Argument list instead of a shell string: nothing in *host* is
        # interpreted by a shell.
        with open(os.devnull, 'w') as devnull:
            ret = subprocess.call(
                ["ping", "-c", "1", "-w", "2", str(host).strip()],
                stdout=devnull)
    except Exception:
        logging.exception("The host %s: ping_mon_by_host failed!", host)
        return 1
    return 0 if ret == 0 else 1


def check_salt_minion(host):
    """Ask the master to run `uptime` on the minion *host*.

    Always returns {'status': 0 or 1, 'message': text}; status 0 means the
    master received an uptime line back from the minion.
    """
    cmd = "salt '%s' -t 7 cmd.run 'uptime'" % host
    ret = ssh_cmd(MASTERIP, cmd)
    if ret and 'load' in ret:
        return {'status': 0, 'message': 'ok'}
    try:
        msg = ret.split(':')[1].strip()
    except (AttributeError, IndexError):
        # No reply at all, or a reply without a "minion: message" shape.
        msg = ret
    return {'status': 1, 'message': msg}


def nfs_check(host):
    """Check the 'knfs' mounts on *host*; unmount them when a server is dead.

    When any NFS server IP does not answer a ping from *host*, every knfs
    mount point is lazily unmounted (as the original did). Returns 0 when
    the mounts are healthy or were all unmounted, 1 when mounts remain or
    any step failed.
    """
    count_cmd = "mount | grep 'knfs'| wc -l"
    mount_data_cmd = "mount | grep 'knfs' | awk -F ' ' '{print $3}'"
    mount_ip_cmd = "mount | grep 'knfs' | awk -F ':' '{print $1}'"
    try:
        if int(ssh_cmd(host, count_cmd)) == 0:
            return 0

        mount_ip = run_cmd(host, mount_ip_cmd, method="sync", output="json",
                           ignore_error="true", timeout=10)
        print(mount_ip)

        unreachable = False
        for ip in _channel_result_lines(mount_ip):
            ping_Cmd = "ping -c 1 -w 1 %s | grep '0 received' | wc -l" % ip
            if int(ssh_cmd(host, ping_Cmd)) != 0:
                unreachable = True
                break
        if not unreachable:
            return 0

        umount = run_cmd(host, mount_data_cmd, method="sync", output="json",
                         ignore_error="true", timeout=10)
        for mount_point in _channel_result_lines(umount):
            ssh_cmd(host, "umount -l %s > /dev/null 2>&1 " % mount_point)
        time.sleep(2)
        return 0 if int(ssh_cmd(host, count_cmd)) == 0 else 1
    except Exception:
        logging.exception("nfs_check failed with host:%s", host)
        return 1


def _run_repair_steps(host):
    """Run the repair steps in order; return the result message for *host*."""
    if ping_mon_by_host(host) != 0:
        return "%s: The host can not ping!" % host
    # Check that we can log in.
    if ssh_cmd_check(host, 'date') != 0:
        return "%s: bad ssh,go failed!" % host
    # Check the NFS mounts.
    if nfs_check(host) != 0:
        return "%s:nfs err" % host
    # Repair the yum repositories.
    print('yum_repair')
    if yum_repaire(host) != 0:
        return "%s: yum occur error!" % host
    # Remove the salt minion client.
    print('remove salt')
    if remove_salt_minion(host) != 0:
        return "%s:remove salt minion failed!" % host
    # Install the salt minion client.
    print('install salt')
    if install_salt_minion(host) != 0:
        return "%s:install salt minion failed!" % host
    # Start the salt minion client.
    print('start salt')
    if salt_minion_restart(host) != 0:
        return "%s:salt minion restart error!" % host
    print('master-minion check')
    minionStatus = check_salt_minion(host)
    if minionStatus["status"] == 0:
        return '%s:ok' % host
    return '%s:%s' % (host, minionStatus["message"])


def salt_repaire(host):
    """Repair salt-minion on *host*; print and return the result message.

    Writing the result back to the database is not implemented yet.
    """
    try:
        msg = _run_repair_steps(host)
    except Exception:
        logging.exception("Salt repaire failed with host:%s ", host)
        msg = "Salt repaire failed with host:%s " % host
    print(msg)
    return msg


def scheduler_repaire(max_hosts=None):
    """Repair every broken minion using a pool of 8 worker processes.

    max_hosts: when given, skip the automatic repair if more hosts than
    this are broken. The default (None) applies no limit.
    """
    minionList = fix_list_salt()
    if not minionList:
        return
    if max_hosts is not None and len(minionList) > max_hosts:
        print("Too many broken minions (%d > %d); automatic repair aborted."
              % (len(minionList), max_hosts))
        return
    pool = Pool(8)
    try:
        pool.map(salt_repaire, minionList)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    scheduler_repaire()