系统发生故障时会向 NOC 发出警告,NOC 可以在 Nagios Web 界面上看到闪烁的红点。如果节点异常或者温度太高,NOC 还将收到电子邮件通知。
系统工程师可以绘制数据,报告集群利用率,制定未来采购硬件的决策。
回页首
安装 Nagios
在机器上安装 Nagios 的方法可以从 Internet 查询。因为我经常需要在不同的环境中安装,所以为此编写了一个脚本。
首先需要下载两个包:
Nagios(测试了 3.0.6 版本)
Nagios-plugins(测试了 1.4.13 版本)
插件包括:
Nagios Event Log,可以监视 Windows 事件日志
NRPE,提供了许多 Ganglia 功能
获取源代码并放在目录中。为了演示,我在 /tmp 中放置了三个文件:
nagios-3.0.6.tar.gz
nagios-plugins-1.4.13.tar.gz
naginstall.sh
清单 1 展示了 naginstall.sh 安装脚本:
清单 1. naginstall.sh 脚本
#!/bin/ksh
# naginstall.sh configuration.

# Base names of the source trees; each is expected as <name>.tar.gz in the
# directory the script is run from.
NAGIOSSRC="nagios-3.0.6"
NAGIOSPLUGINSRC="nagios-plugins-1.4.13"

# Files inside the Nagios installation that configNagios edits or creates.
NAGIOSCONTACTSCFG="/usr/local/nagios/etc/objects/contacts.cfg"
NAGIOSPASSWD="/usr/local/nagios/etc/htpasswd.users"

# Password piped to passwd for the nagios account in preNagios.
# NOTE(review): hard-coded credential - consider supplying it at run time.
PASSWD="cluster"

# Platform tag. "foo" is a placeholder; the main sequence switches it to "rh"
# when /etc/redhat-release exists.
OS="foo"
#######################################
# Unpack, configure, build and install the Nagios plugins.
# Globals:   NAGIOSPLUGINSRC (read) - base name of the plugin source tree;
#            $NAGIOSPLUGINSRC.tar.gz must be in the current directory.
# Outputs:   progress and error messages on stdout; configure output goes to
#            $NAGIOSPLUGINSRC/config.LOG.<pid>, make output to
#            $NAGIOSPLUGINSRC/make.LOG.<pid>.
# Returns:   0 on success; exits the script with status 1 on any failure.
#######################################
function buildNagiosPlug {
  if [ -e "$NAGIOSPLUGINSRC.tar.gz" ]
  then
    echo "found $NAGIOSPLUGINSRC.tar.gz building and installing Nagios"
  else
    echo "could not find $NAGIOSPLUGINSRC.tar.gz in current directory."
    echo "Please run $0 in the same directory as the source files."
    exit 1
  fi

  echo "Extracting Nagios Plugins..."
  tar zxf "$NAGIOSPLUGINSRC.tar.gz" || {
    echo "could not extract $NAGIOSPLUGINSRC.tar.gz"
    exit 1
  }
  # Never run configure/make in the wrong directory if extraction produced
  # something unexpected.
  cd "$NAGIOSPLUGINSRC" || {
    echo "could not change into $NAGIOSPLUGINSRC"
    exit 1
  }

  echo "Configuring Nagios Plugins..."
  # The whole configure invocation must be ONE command: the line continuation
  # keeps --prefix and the log redirection attached to ./configure.
  if ./configure --with-nagios-user=nagios --with-nagios-group=nagios \
    --prefix=/usr/local/nagios > "config.LOG.$$" 2>&1
  then
    echo "Making Nagios Plugins..."
    if make -j8 > "make.LOG.$$" 2>&1
    then
      # Append so the build output collected above is not thrown away.
      make install >> "make.LOG.$$" 2>&1 || {
        echo "Make install failed of Nagios plugins. See $NAGIOSPLUGINSRC/make.LOG.$$"
        exit 1
      }
    else
      echo "Make failed of Nagios plugins. See $NAGIOSPLUGINSRC/make.LOG.$$"
      exit 1
    fi
  else
    echo "configure of Nagios plugins failed. See $NAGIOSPLUGINSRC/config.LOG.$$"
    exit 1
  fi

  echo "Successfully built and installed Nagios Plugins!"
  cd ..
}
#######################################
# Unpack, configure, build and install Nagios itself.
# Globals:   NAGIOSSRC (read) - base name of the Nagios source tree;
#            $NAGIOSSRC.tar.gz must be in the current directory.
# Outputs:   progress and error messages on stdout; configure output goes to
#            $NAGIOSSRC/config.LOG.<pid>, all make output (build and every
#            install target) to $NAGIOSSRC/make.LOG.<pid>.
# Returns:   0 on success; exits the script with status 1 on any failure.
#######################################
function buildNagios {
  typeset target

  if [ -e "$NAGIOSSRC.tar.gz" ]
  then
    echo "found $NAGIOSSRC.tar.gz building and installing Nagios"
  else
    echo "could not find $NAGIOSSRC.tar.gz in current directory."
    echo "Please run $0 in the same directory as the source files."
    exit 1
  fi

  echo "Extracting Nagios..."
  tar zxf "$NAGIOSSRC.tar.gz" || {
    echo "could not extract $NAGIOSSRC.tar.gz"
    exit 1
  }
  cd "$NAGIOSSRC" || {
    echo "could not change into $NAGIOSSRC"
    exit 1
  }

  echo "Configuring Nagios..."
  if ./configure --with-command-group=nagcmd > "config.LOG.$$" 2>&1
  then
    echo "Making Nagios..."
    if make all -j8 > "make.LOG.$$" 2>&1
    then
      # Run the install targets in order. Append to the log so earlier output
      # survives, and stop at the first failing target instead of reporting
      # success for a partial installation.
      for target in install install-init install-config \
        install-commandmode install-webconf
      do
        make "$target" >> "make.LOG.$$" 2>&1 || {
          echo "make $target failed. See log:"
          echo "$NAGIOSSRC/make.LOG.$$"
          exit 1
        }
      done
    else
      echo "make all failed. See log:"
      echo "$NAGIOSSRC/make.LOG.$$"
      exit 1
    fi
  else
    echo "configure of Nagios failed. Please read $NAGIOSSRC/config.LOG.$$ for details."
    exit 1
  fi

  echo "Done Making Nagios!"
  cd ..
}
#######################################
# Interactively ask who receives alerts, write that contact into the Nagios
# contacts file and create the web password file.
# Globals:   NAGIOSCONTACTSCFG (read) - contacts file that is edited in place
#            NAGIOSPASSWD (read)      - htpasswd file that is (re)created
#            OS (read)                - "rh" triggers an httpd restart
#            NAME, EMAIL, YN, LOOP (written) - prompt state
# Inputs:    userid, email and a y/N confirmation read from stdin.
# Outputs:   prompts and status messages on stdout.
# Returns:   exits the script with status 1 if stdin ends before the answers
#            are confirmed, if the contacts file is not readable, or if it
#            cannot be updated.
#######################################
function configNagios {
  echo "We'll now configure Nagios."
  LOOP=1
  while [[ $LOOP -eq 1 ]]
  do
    echo "You'll need to put in a user name. This should be the person"
    echo "who will be receiving alerts. This person should have an account"
    echo "on this server. "
    print "Type in the userid of the person who will receive alerts (e.g. bob)> \c"
    # Bail out on end of input; otherwise the loop would spin forever.
    read -r NAME || { echo; echo "No more input while reading the userid."; exit 1; }
    print "What is ${NAME}'s email?> \c"
    read -r EMAIL || { echo; echo "No more input while reading the email."; exit 1; }
    echo
    echo
    echo "Nagios alerts will be sent to $NAME at $EMAIL"
    print "Is this correct? [y/N] \c"
    read -r YN || { echo; echo "No more input while reading the confirmation."; exit 1; }
    if [[ "$YN" = "y" ]]
    then
      if [[ -n "$NAME" && -n "$EMAIL" ]]
      then
        LOOP=0
      else
        echo "The userid and the email must not be empty."
      fi
    fi
  done

  if [ -r "$NAGIOSCONTACTSCFG" ]
  then
    # Hand the answers to perl through the environment instead of pasting
    # them into the program text: characters such as '/', '@' or '$' in the
    # input then cannot break or change the substitution.
    NAGIOS_CONTACT_NAME="$NAME" NAGIOS_CONTACT_EMAIL="$EMAIL" \
      perl -pi \
        -e 's/nagiosadmin/$ENV{NAGIOS_CONTACT_NAME}/g;' \
        -e 's/nagios\@localhost/$ENV{NAGIOS_CONTACT_EMAIL}/g;' \
        "$NAGIOSCONTACTSCFG" || {
      echo "could not update $NAGIOSCONTACTSCFG"
      exit 1
    }
  else
    echo "$NAGIOSCONTACTSCFG does not exist"
    exit 1
  fi

  echo "setting ${NAME}'s password to be 'cluster' in Nagios"
  echo " you can change this later by running: "
  echo " htpasswd -c $NAGIOSPASSWD $NAME"
  htpasswd -bc "$NAGIOSPASSWD" "$NAME" cluster

  if [ "$OS" = "rh" ]
  then
    service httpd restart
  fi
}
#######################################
# Install build prerequisites and create the accounts Nagios needs.
# Does nothing unless OS is "rh".
# Globals:   OS (read)     - platform tag set by the main sequence
#            PASSWD (read) - password given to the nagios account
# Outputs:   a status line plus whatever yum/useradd/passwd print.
# Returns:   status of the last command run (0 when OS is not "rh").
#######################################
function preNagios {
  if [ "$OS" = "rh" ]
  then
    echo "making sure prereqs are installed"
    yum -y install httpd gcc glibc glibc-common gd gd-devel perl-TimeDate
    /usr/sbin/useradd -m nagios
    # Quote the password and use printf so it reaches passwd verbatim, with
    # no word splitting, globbing or echo option/escape handling.
    printf '%s\n' "$PASSWD" | passwd --stdin nagios
    /usr/sbin/groupadd nagcmd
    # Both nagios and the web server user (apache) are added to nagcmd.
    /usr/sbin/usermod -a -G nagcmd nagios
    /usr/sbin/usermod -a -G nagcmd apache
  fi
}
#######################################
# Final step: on Red Hat ("rh") register Nagios as a boot-time service and
# start it; on every platform print the URL of the web interface.
# Globals:   OS (read)
# Outputs:   two informational lines on stdout.
#######################################
function postNagios {
  case "$OS" in
    rh)
      chkconfig --add nagios
      chkconfig nagios on
      # touch this file so that if it doesn't exist we won't get errors
      touch /var/www/html/index.html
      service nagios start
      ;;
  esac
  printf '%s\n' \
    "You may now be able to access Nagios at the URL below:" \
    "http://localhost/nagios"
}
# Main sequence: detect the platform, insist on root, then run the install
# steps in order.

# /etc/redhat-release marks a Red Hat style system; the rh tag enables the
# yum/chkconfig/service steps in the functions above.
if [ -e /etc/redhat-release ]
then
  echo "installing monitoring on Red Hat system"
  OS=rh
fi

# make sure you're root:
ID=$(id -u)
if [ "$ID" != "0" ]
then
  echo "Must run this as root!"
  # Exit non-zero: a bare exit would return echo's status (0) and make the
  # refusal look like a successful install to callers.
  exit 1
fi

preNagios
buildNagios
buildNagiosPlug
configNagios
postNagios
# Service group for the Ganglia-backed checks; the ganglia-service definition
# below joins it through its service_groups directive.
define servicegroup {
servicegroup_name ganglia-metrics
alias Ganglia Metrics
}
# Command that runs check_ganglia.py from the $USER1$ directory.
# -h receives the host name macro; -m, -w and -c receive the first, second and
# third '!'-separated argument of a service's check_command (presumably the
# metric name, warning threshold and critical threshold - confirm against
# check_ganglia.py).
define command {
command_name check_ganglia
command_line $USER1$/check_ganglia.py -h $HOSTNAME$ -m $ARG1$ -w $ARG2$ -c $ARG3$
}
# Template shared by all Ganglia checks: inherits generic-service, applies to
# every host in the dallas-cloud-servers host group, joins the ganglia-metrics
# service group and keeps notifications switched off.
# register 0 marks this as a template only. It has no service_description or
# check_command of its own, so it must not be registered as a real service;
# the register directive is not inherited from generic-service.
define service {
use generic-service
name ganglia-service
hostgroup_name dallas-cloud-servers
service_groups ganglia-metrics
notifications_enabled 0
register 0
}
# load_one check built on ganglia-service: passes load_one, 4 and 5 to
# check_ganglia (presumably metric, warning and critical threshold).
define service {
use ganglia-service
service_description load_one
check_command check_ganglia!load_one!4!5
}
# ambient_temp check built on ganglia-service: passes AmbientTemp, 20 and 30
# to check_ganglia (presumably metric, warning and critical threshold).
define service {
use ganglia-service
service_description ambient_temp
check_command check_ganglia!AmbientTemp!20!30
}
# disk_free check built on ganglia-service: passes disk_free, 10 and 5 to
# check_ganglia. Here the second number is smaller than the first, so the
# plugin presumably alerts when the value drops below the threshold - confirm
# against check_ganglia.py.
define service {
use ganglia-service
service_description disk_free
check_command check_ganglia!disk_free!10!5
}
[root@redhouse libexec]# ./check_pbs.pl -Q dque -tw 20 -tm 50
check_pbs.pl Critical: dque on localhost checked, Total number of jobs
higher than 50. Total jobs:518, Jobs Queued:518, Jobs Waiting:0, Jobs
Halted:0 |exectime=9340us