结合Ansible技术监控Storm集群
#!/bin/bash
# Storm cluster watchdog configuration.
# NOTE: the interpreter line above only takes effect when it is the very first
# line of the file. bash is required because the script uses [[ ]], the
# `function` keyword and C-style for loops.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin
export PATH
# Load the system-wide profile when it is available.
if [ -r /etc/profile ]; then
  . /etc/profile
fi
## Address and port of the monitoring page
MON_SRV_IPADDR="192.168.1.103"
MON_SRV_PORT="8080"
## Set to 1 once the monitoring port has been scanned successfully
SCAN_FLAG=0
## Base working directory
BASE_PATH="/data/scripts"
## File that collects the addresses of failed storm supervisor hosts
FAIL_SUPERVISOR_LIST="${BASE_PATH}/fail_supervisor.txt"
#---------------------------------------------------------------------------------------------------
## Restart the storm nimbus service (and the storm UI) on this host.
## Kills every process whose `ps aux` line contains both "java" and "storm",
## starts `storm nimbus` and `storm ui` in the background, then waits 30 seconds.
## Arguments: none
function restart_storm_nimbus_server()
{
  local storm_pids
  # Take ONE snapshot of the process table. Listing the processes twice (once
  # for the test, once for the kill) can leave `kill -9` without any PID when
  # the processes exit in between.
  # The grep chain is kept on purpose: neither grep's own command line contains
  # both words, so the pipeline never matches itself.
  storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
  if [[ -n "${storm_pids}" ]]; then
    # Word splitting is intentional: one argument per PID.
    # shellcheck disable=SC2086
    kill -9 ${storm_pids}
  fi
  nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
  nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &
  sleep 30
}
#---------------------------------------------------------------------------------------------------
## 1. Check that the monitoring page port is reachable (covers the case where port 8080 is down)
## Up to 3 scans, 10 seconds apart; if none finds the port open, nimbus is restarted.
## The former one-liner `cond && SCAN_FLAG=1;break || sleep 10` parsed as two
## statements, so `break` always ran on the first pass and no retry ever happened.
SCAN_FLAG=0
for ((i=0; i<3; i++)); do
  RETVAL=$(/usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep open)
  if [[ -n "${RETVAL}" ]]; then
    SCAN_FLAG=1
    break
  fi
  sleep 10
done
if [[ ${SCAN_FLAG} -ne 1 ]]; then
  restart_storm_nimbus_server
fi
#---------------------------------------------------------------------------------------------------
## 2. Compare the supervisor names scraped from the monitoring page with the
##    expected list to find storm supervisors that are missing from the page.
# Scrape: put each <td> cell on its own line, keep cells starting with
# "storm_", drop the nimbus entry, and store the sorted host names.
curl -s "http://${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
  | sed 's/<td>/<td>\n/g' \
  | awk -F '<' '/^storm_/{print $1}' \
  | awk '!/nimbus/{print}' \
  | sort > "${BASE_PATH}/supervisor_list_from_page.txt"
## An empty result from the nimbus monitoring page means the nimbus service is abnormal
if ! grep -q . "${BASE_PATH}/supervisor_list_from_page.txt"; then
  restart_storm_nimbus_server
  # NOTE(review): the page is not fetched again after the restart, so every
  # expected supervisor is treated as failed below - confirm this is intended.
fi
# Keep only hosts that are in the expected list but NOT on the page.
# `sort | uniq -u` produced a symmetric difference, which also flagged hosts
# that are on the page (i.e. alive) but absent from the expected list.
LC_ALL=C comm -13 \
  <(LC_ALL=C sort -u "${BASE_PATH}/supervisor_list_from_page.txt") \
  <(LC_ALL=C sort -u "${BASE_PATH}/supervisor_list.txt") \
  > "${BASE_PATH}/supervisor_list_for_failed.txt"
# Nothing failed: clean up and stop here.
if ! grep -q . "${BASE_PATH}/supervisor_list_for_failed.txt"; then
  rm -f "${BASE_PATH}/supervisor_list_for_failed.txt" && exit 0
fi
#---------------------------------------------------------------------------------------------------
## 3. Resolve the failed storm supervisor names to IP addresses and write them
##    to the inventory file used by ansible.
# Start the inventory from scratch with the group header. The group name must
# match the host pattern passed to ansible (fail_supervisor); without the
# header the inventory defines no such group. Truncating (instead of
# appending) drops stale entries left behind by an aborted earlier run.
echo "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"
for SUPERVISOR_NAMEADDR in $(cat "${BASE_PATH}/supervisor_list_for_failed.txt")
do
  # Last non-comment /etc/hosts line that contains the name as a whole word.
  TEMP_IPADDR=$(grep -w -- "${SUPERVISOR_NAMEADDR}" /etc/hosts | grep -v '#' | awk '{print $1}' | tail -1)
  # Hosts that cannot be resolved would only add blank inventory lines.
  [[ -z "${TEMP_IPADDR}" ]] && continue
  echo "${TEMP_IPADDR}" >> "${FAIL_SUPERVISOR_LIST}"
  IPLIST="${IPLIST} ${TEMP_IPADDR}"
done
#---------------------------------------------------------------------------------------------------
## 4. Remotely restart the storm supervisor service
# Run the restart script through ansible's shell module on every host of the
# fail_supervisor group, using the generated list as inventory; afterwards
# remove the list file.
/usr/local/bin/ansible \
  -i ${FAIL_SUPERVISOR_LIST} \
  fail_supervisor \
  -m shell \
  -a '/data/scripts/restart_storm_service.sh'
rm -f ${FAIL_SUPERVISOR_LIST}
页:
[1]