Nagios监控自动化

xiaodouya33 · 发表于 2019-1-14 08:00:03

　　一、安装
　　nagios下载http://download.chinaunix.net/download.php?id=25308&ResourceID=7183
　　tar fvxz nagios-3.2.0.tar.gz

　　cd nagios-3.2.0
　　
./configure --prefix=/usr/local/nagios --with-nagios-user=apache --with-nagios-group=apache --with-command-user=apache --with-command-group=apache
　　注：默认使用nagios用户，如果使用别的用户，必须在这里指定
　　
useradd nagios
　　make all
make install
make install-init
　　make install-commandmode
make install-config
make install-webconf

　　vim /etc/httpd/conf/httpd.conf
　　User nagios
　　Group nagios
　　

　　service httpd restart
　　

　　由于nagios的web控制台必须使用用户验证登陆，所以

　　htpasswd -c /usr/local/nagios/etc/htpasswd.users nagios
　　以后再添加用户就不用加-c了
　　vim /usr/local/nagios/etc/cgi.cfg
　　在所有的nagiosadmin后面添加nagios
　　service nagios start

　　访问：http://localhost/nagios
　　但此时的nagios监控到的localhost居然是down状态
　　原因是nagios是通过/usr/local/nagios/libexec下的各种插件来获取主机信息的，而此时此路径下没有安装任何的插件
tar fvxz  /tmp/nagios-plugins-1.4.13.tar.gz
cd nagios-plugins-1.4.13
　　./configure --prefix=/usr/local/nagios/
　　make
　　make install
　　再访问：http://localhost/nagios就会发现localhost是up状态了
　　

　　二、配置
　　/usr/local/nagios/etc/nagios.cfg是nagios的主配置文件，通过配置此文件指定各个单独配置文件的路径使得其生效
　　
　　cfg_file=/usr/local/nagios//etc/objects/commands.cfg(用什么监控？)
　　cfg_file=/usr/local/nagios//etc/objects/contacts.cfg（有问题联系谁？）
　　cfg_file=/usr/local/nagios//etc/objects/timeperiods.cfg（什么时间监控？默认即可）
　　cfg_file=/usr/local/nagios//etc/objects/localhost.cfg（监控谁？包括主机和服务两种）
　　为了方便管理，我们需要在此文件中为每一个被监控的主机单独指定一个配置文件
　　cfg_file=/usr/local/nagios//etc/objects/10.8.8.140.cfg
　　cfg_file=/usr/local/nagios//etc/objects/10.8.8.142.cfg
　　注：关于10.8.8.140.cfg的书写，请参考附件中的脚本
　　注：commands.cfg文件中
$USER1$代表/usr/local/nagios/libexec
$HOSTADDRESS$是系统自带的变量，自动调用localhost.cfg文件中定义的主机define host区域中的 address
$ARG1$是用户自定义的变量，需要在localhost.cfg文件中的check_command后添加!变量值
　　localhost.cfg中的check_command调用的就是commands.cfg中的command_name。command_name调用/usr/local/nagios/libexec中的check_*
　　

　　但是，此时的nagios还只能监控本机，不能够监控远程主机。
　　如果要监控远程主机，需要配合使用nrpe。
　　原理：nagios服务端通过check_nrpe -H 被监控的IP -c "command"把"command"命令传送给被监控端也就是nrpe服务端，nrpe服务端接收到命令后查找主配置文件nrpe.cfg中command["command"]匹配"command"，然后去执行对应的本地插件，把执行结果返回给 nagios服务端
　　注：nrpe的服务端是安装在被监控主机上的，可以理解为nrpe是一个代理程序
　　被监控端安装nrpe

　　tar fvxz nrpe-2.12.tar.gz
　　cd nrpe-2.12
　　./configure --prefix=/usr/local/nagios --with-nrpe-user=apache --with-nrpe-group=apache --with-nagios-user=apache --with-nagios-group=apache
　　make all
　　make install-daemon
　　make install-daemon-config
　　make install-xinetd
　　make all
　　echo "nrpe 5666/tcp" >> /etc/services
　　tar fvxz /home/yuchunyun/libexec.tgz -C /usr/local/nagios（把nagios服务端的插件拷贝过来）
　　

　　注：nrpe服务有两种启动方式。1:依赖于xinetd服务。2:单独配置文件方式启动。
　　方式1：请确保xinetd服务已安装
　　sed -i s/127.0.0.1/10.8.8.141/g /etc/xinetd.d/nrpe
　　/etc/init.d/xinetd restart
　　netstat -nutlp | grep xinetd 会发现nrpe开启的默认5666端口是以xinetd核心守护进程方式启动
　　方式2：
　　/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
　　ps -ef | grep nrpe
　　会发现nrpe是一个单独的进程
　　

　　
日志
tail -f  /usr/local/nagios/var/nagios.log
　　检查错误
　　/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
　　Nagios的check_*插件是通过返回值来判断的
　　0 成功  1 警告  2 严重错误  3 未知
　　

　　三、自动化添加被监控主机
　　先执行host.sh
　　
#!/bin/bash
# Author:  yuchunyun
# Date:    2014/03/14
# Purpose: run on the Nagios server to add the host definition for a new
#          monitored host.
# Usage:   before running, make sure the NRPE service is installed on the
#          monitored host and that this machine is allowed to connect to it.
#
# The script prompts for a host name, an alias and an IP address, writes
# $cfg_path/<hostname>.cfg containing one "define host" object, and then
# appends the matching cfg_file= line to $nagios_cfg. A host whose config
# file is already referenced in $nagios_cfg is left untouched.
cfg_path=/usr/local/nagios/etc/objects
nagios_cfg=/usr/local/nagios/etc/nagios.cfg

read -r -p "please input the hostname(like 10.8.8.141):" hostname
read -r -p "please input the alise(like 141):" alise
read -r -p "please input the ipaddress(like 10.8.8.141):" ip

# The host name becomes a file name and a search key, so refuse anything that
# is empty or contains characters other than letters, digits, '.', '_' or '-'.
if ! [[ "$hostname" =~ ^[A-Za-z0-9._-]+$ ]]; then
  echo "the hostname:$hostname is invalid" >&2
  exit 1
fi

host_cfg="$cfg_path/$hostname.cfg"

# Fixed-string search for "/<hostname>.cfg": the dots are literal and the
# surrounding "/" and ".cfg" stop 10.8.8.14 from matching 10.8.8.141.
if grep -Fq -- "/${hostname}.cfg" "$nagios_cfg"; then
  echo "the host:$hostname had already existed!!!"
else
  # Write the object file first so nagios.cfg never references a file that
  # could not be created.
  if ! cat > "$host_cfg" <<EOF
define host {
host_name    $hostname
alias          $alise
address       $ip
check_command check-host-alive
notification_options d,u,r
check_interval  1
max_check_attempts    2
contact_groups  admins
notification_interval 10
notification_period    24x7
}
EOF
  then
    echo "cannot write $host_cfg" >&2
    exit 1
  fi

  if ! echo "cfg_file=$host_cfg" >> "$nagios_cfg"; then
    echo "cannot update $nagios_cfg" >&2
    exit 1
  fi

  echo "#############################"
  echo "the host:$hostname had add ok"
  echo "#############################"
fi

　　再执行service.sh
　　
　　
　　
　　
#!/bin/bash
# Author:  yuchunyun
# Date:    2014/03/14
# Purpose: run on the Nagios server to add service checks for a monitored
#          host.
# Usage:   the host must already have been defined with host.sh. Only the
#          service templates listed below are available, because the set of
#          monitoring plugins is limited.
#
# The script prompts for a host name and, if that host's config file is
# referenced in $nagios_cfg, repeatedly shows a menu and appends one
# "define service" object per selection to $cfg_path/<hostname>.cfg.
# It always finishes by running "$nagios_bin -v $nagios_cfg".
#
# NOTE(review): in the published copy of this script the loop header, the
# menu, the prompt for the selection and the whole branch for option 1 were
# lost. They are reconstructed here: the option-1 description "Total User"
# comes from its surviving success message, while the check name check_users
# and the menu/prompt wording are assumptions - confirm against the original
# attachment and the monitored hosts' nrpe.cfg.
cfg_path=/usr/local/nagios/etc/objects
nagios_cfg=/usr/local/nagios/etc/nagios.cfg
nagios_bin=/usr/local/nagios/bin/nagios

# Service templates; menu option N uses index N-1 of both arrays.
# service_names holds the service_description, service_checks the command
# name passed to check_nrpe. Spellings are kept exactly as published so they
# keep matching existing Nagios objects and remote command names.
service_names=(
  "Total User"
  "System Load"
  "Total Proces"
  "Disk"
  "Memory"
  "Contect"
  "Httpd process"
  "Nginx process"
  "Php process"
  "Mysql process"
  "Java process"
)
service_checks=(
  check_users
  check_load
  check_total_procs
  check_disk
  check_free_memory
  check_contect
  check_httpd_proces
  check_nginx_proces
  check_php_proces
  check_mysql_proces
  check_java_proces
)
# Menu number that leaves the loop (12 with the eleven templates above).
exit_choice=$(( ${#service_names[@]} + 1 ))

# Print the numbered list of templates followed by the exit entry.
print_menu() {
  local i
  for ((i = 0; i < ${#service_names[@]}; i++)); do
    printf '%d) %s\n' "$((i + 1))" "${service_names[i]}"
  done
  printf '%d) exit\n' "$exit_choice"
}

# Append one "define service" object to the host's config file.
# Arguments: $1 - service_description, $2 - command name run through check_nrpe
# Returns:   non-zero if the file could not be written
append_service() {
  local description=$1 check=$2
  cat >> "$cfg_path/$hostname.cfg" <<EOF
define service {
host_name    $hostname
service_description    $description
check_period 24x7
normal_check_interval 2
retry_check_interval 1
max_check_attempts    5
notification_period    24x7
notification_options w,u,c,r
check_command check_nrpe!$check
}
EOF
}

read -r -p "please input the hostname you has been added:" hostname

# Fixed-string search for "/<hostname>.cfg" so the dots are literal and a
# shorter name cannot match a longer one; an empty name is never defined.
if [[ -z "$hostname" ]] || ! grep -Fq -- "/${hostname}.cfg" "$nagios_cfg"; then
  echo "the host:$hostname is undefind!!! "
else
  while true; do
    print_menu
    # Stop on end of input instead of looping forever.
    read -r -p "please input the number of the service to monitor:" id || break

    if ! [[ "$id" =~ ^[0-9]+$ ]]; then
      echo "your input values is invalid"
      continue
    fi
    # Force base 10 so input such as "08" is not parsed as octal.
    choice=$((10#$id))

    if ((choice == exit_choice)); then
      break
    elif ((choice >= 1 && choice < exit_choice)); then
      name=${service_names[choice - 1]}
      if append_service "$name" "${service_checks[choice - 1]}"; then
        echo "$name had monitored OK!"
      else
        echo "cannot write $cfg_path/$hostname.cfg" >&2
      fi
    else
      echo "your input values is invalid"
    fi
  done
fi

# Check the configuration files for syntax errors.
"$nagios_bin" -v "$nagios_cfg"

　　

　　四、常见错误
　　
问题：被监控端执行check_login有正确的返回信息，但是在监控端上执行check_nrpe -H IP -c check_login却没有输出，提示can not read output！
解决：由于被监控端nrpe插件的执行者是apache/nagios，本地执行脚本可能是用的root用户来执行，但远程的脚本中执行某些命令时没有权限，因此需要给予apache/nagios执行sudo的权限，然后在脚本内命令前加sudo
vi /etc/sudoers  添加apache 10.8.8.249=(root) NOPASSWD: /bin/cat,/bin/sed
或者把apache用户加入到root中
排查测试：A:给予apache一个shell：su - apache -s /bin/bash
               B:用此shell调试执行脚本 sh -x /usr/local/nagios/libexec/check_login（查看shell脚本语法错误用sh -n）
               C:根据返回的错误信息解决权限问题

问题：CHECK_NRPE: Socket timeout after 10 seconds
　　解决：原因是命令执行的超时时间有限制
　　1.vi commands.cfg --->  command_line /usr/local/nagios/libexec/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -t 60 (默认命令后面加：-t 60)
2.vi nrpe.cfg ---> command_timeout=60
3. 使用命令测试。---> ./check_nrpe -H 192.168.1.203 -c check_raid -t 30 (使用命令时后面加：-t 30)
注意：-t 后面的时间自由添加。
　　

附件：http://down.运维网.com/data/2364213

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

Red Hat RHCE 8 (EX294) Cert Guide

c++ size_t 和 int 的区别

HERE 使用 AWS EF 和 JFrog Artifactory 打

C++ 指针大全：从基础到进阶，一篇快速上手

wirelessnetview好用的无线分析工具

亿图图示专家(EDraw Max) V7.9 中文破解版

Nagios监控自动化

浏览过的版块

扫码加入运维网微信交流群