saundy 发表于 2018-7-30 12:29:55

结合Ansible技术监控Storm集群

#!/bin/bash
# Storm cluster watchdog: probes the monitoring page, restarts the local
# nimbus/ui services when the page is unavailable, and restarts failed
# supervisors remotely through Ansible.
#
# The interpreter must be bash: the script relies on [[ ]], the `function`
# keyword and C-style for loops, none of which are guaranteed under /bin/sh.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

# Pull in the system-wide environment (NOTE(review): presumably for JAVA_HOME
# and similar settings when run from cron - confirm).
. /etc/profile

## Address and port of the monitoring page
MON_SRV_IPADDR="192.168.1.103"
MON_SRV_PORT="8080"

## Set to 1 once the port scan has seen the monitoring port open
SCAN_FLAG=0

## Base working directory
BASE_PATH="/data/scripts"

## File that collects the host addresses of failed Storm supervisors
FAIL_SUPERVISOR_LIST="${BASE_PATH}/fail_supervisor.txt"
  

  
#---------------------------------------------------------------------------------------------------
  
## Restart the Storm nimbus service (and the Storm UI) on this host.
##
## Force-kills every process whose `ps aux` line mentions both "java" and
## "storm", then launches `storm nimbus` and `storm ui` in the background and
## waits 30 seconds before returning.
## NOTE(review): the match is not limited to nimbus/ui - any other Storm JVM
## running on this host is killed as well; confirm that is intended.
## Arguments: none
## Returns:   exit status of the final `sleep`
function restart_storm_nimbus_server()
{
    local storm_pids

    # Take a single snapshot so the emptiness check and the kill operate on
    # the same PID set; a second `ps` could come back empty and make `kill`
    # fail with a usage error.
    storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')

    if [[ -n "${storm_pids}" ]]; then
        # Word splitting is intentional here: one PID per word.
        # shellcheck disable=SC2086
        kill -9 ${storm_pids}
    fi

    # Detach both daemons from this script and discard their output.
    nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
    nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &

    # Fixed pause before the caller continues (NOTE(review): presumably to let
    # the services come up - 30 s is a guess carried over, not a readiness check).
    sleep 30
}
  

  
#---------------------------------------------------------------------------------------------------
  
## 1. Check that the monitoring page is reachable (covers the case where the
##    monitoring port does not answer). Probe up to three times, pausing 10
##    seconds after each miss, and stop at the first successful probe.
for ((i=0; i<3; i++)); do
    # Non-empty only when nmap reports the port as "open".
    RETVAL=$(/usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep open)

    # An explicit if-block keeps `break` tied to the success case; written as
    # `test && flag=1; break || sleep`, the `;` ends the AND-list and `break`
    # would run on the very first pass regardless of the probe result.
    if [[ -n "${RETVAL}" ]]; then
        SCAN_FLAG=1
        break
    fi

    sleep 10
done

# No probe saw the port open: restart nimbus and the UI.
if [[ ${SCAN_FLAG} -ne 1 ]]; then
    restart_storm_nimbus_server
fi
  

  
#---------------------------------------------------------------------------------------------------
  
## 2. Compare the supervisor names scraped from the monitoring page with the
##    locally maintained list to find Storm supervisors that are missing.
SUPERVISOR_PAGE_LIST="${BASE_PATH}/supervisor_list_from_page.txt"
SUPERVISOR_FAILED_LIST="${BASE_PATH}/supervisor_list_for_failed.txt"

## Scrape the monitoring page into ${SUPERVISOR_PAGE_LIST}: put every table
## cell on its own line, keep the text of cells starting with "storm_", drop
## entries mentioning "nimbus", and sort the result.
## Timeouts stop an unresponsive page from hanging the whole watchdog run.
function fetch_supervisor_list_from_page()
{
    curl -s --connect-timeout 10 --max-time 60 "http://${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
        | sed 's/<td>/<td>\n/g' \
        | awk -F '<' '/^storm_/{print $1}' \
        | awk '!/nimbus/{print}' \
        | sort > "${SUPERVISOR_PAGE_LIST}"
}

fetch_supervisor_list_from_page

## An empty result means the nimbus service is not serving the page properly:
## restart it, then scrape again so the comparison below does not run against
## the stale empty list (which would flag every supervisor as failed).
if [[ -z $(sed '/^$/d' "${SUPERVISOR_PAGE_LIST}") ]]; then
    restart_storm_nimbus_server
    fetch_supervisor_list_from_page
fi

# Lines that occur in only one of the two lists are treated as failed.
# NOTE(review): `uniq -u` yields a symmetric difference, so a host shown on
# the page but absent from supervisor_list.txt is flagged too - confirm.
sort -nr "${SUPERVISOR_PAGE_LIST}" "${BASE_PATH}/supervisor_list.txt" | uniq -u > "${SUPERVISOR_FAILED_LIST}"

# Nothing differs: clean up and finish successfully.
if [[ -z $(sed '/^$/d' "${SUPERVISOR_FAILED_LIST}") ]]; then
    rm -f -- "${SUPERVISOR_FAILED_LIST}"
    exit 0
fi
  

  
#---------------------------------------------------------------------------------------------------
  
## 3. Resolve the failed Storm supervisors to IP addresses and write them
##    into the Ansible inventory file.

# Start a fresh inventory with the group header that the later
# `ansible ... fail_supervisor` host pattern selects; without a group of that
# name the pattern matches no hosts. Truncating also discards leftovers from
# an interrupted earlier run.
printf '%s\n' "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"

# Read line by line instead of word-splitting `cat` output, so names are
# never glob-expanded; the `|| [[ -n ... ]]` handles a missing final newline.
while read -r SUPERVISOR_NAMEADDR || [[ -n "${SUPERVISOR_NAMEADDR}" ]]; do
    [[ -n "${SUPERVISOR_NAMEADDR}" ]] || continue

    # Last /etc/hosts line that contains the name as a whole word and has no
    # '#' anywhere on it; the first field is the IP address.
    TEMP_IPADDR=$(grep -w -- "${SUPERVISOR_NAMEADDR}" /etc/hosts | grep -v '#' | awk '{print $1}' | tail -1)

    # Skip names that do not resolve rather than writing blank inventory lines.
    if [[ -z "${TEMP_IPADDR}" ]]; then
        printf 'warn: no /etc/hosts entry for %s\n' "${SUPERVISOR_NAMEADDR}" >&2
        continue
    fi

    echo "${TEMP_IPADDR}" >> "${FAIL_SUPERVISOR_LIST}"

    # Space-separated list of the resolved addresses.
    IPLIST="${IPLIST} ${TEMP_IPADDR}"
done < "${BASE_PATH}/supervisor_list_for_failed.txt"
  

  
#---------------------------------------------------------------------------------------------------
  
## 4. Restart the Storm supervisor service remotely: run the restart script
##    through Ansible's shell module on every host of the fail_supervisor
##    group in the inventory file, then remove that file.
/usr/local/bin/ansible \
    -i "${FAIL_SUPERVISOR_LIST}" \
    fail_supervisor \
    -m shell \
    -a "/data/scripts/restart_storm_service.sh"

# The inventory is removed after each run (step 3 writes to it again).
rm -f -- "${FAIL_SUPERVISOR_LIST}"
页: [1]
查看完整版本: 结合Ansible技术监控Storm集群