# Build and install nagios-plugins 2.1.1 under /usr/local/nagios.
# The steps are chained with && so a failed cd/download/build stops the
# sequence instead of running the remaining commands in the wrong directory.
# chown uses the POSIX "owner:group" separator; "owner.group" is deprecated.
cd /home/huang/tools/ &&
  wget http://www.nagios-plugins.org/download/nagios-plugins-2.1.1.tar.gz &&
  tar xfz nagios-plugins-2.1.1.tar.gz &&
  cd nagios-plugins-2.1.1 &&
  ./configure --with-nagios-user=nagios --with-nagios-group=nagios --prefix=/usr/local/nagios &&
  make &&
  make install &&
  chown -R nagios:nagios /usr/local/nagios/
# Install NRPE 3.0 from source (daemon, check_nrpe plugin and sample config).
# Chained with && so a failure at any step stops the sequence.
cd /home/huang/tools/ &&
  wget https://github.com/NagiosEnterprises/nrpe/archive/3.0.tar.gz &&
  tar xf 3.0.tar.gz &&
  cd nrpe-3.0/ &&
  ./configure &&
  make all &&
  make install &&
  make install-plugin &&
  make install-daemon &&
  make install-config
# Edit the NRPE config so the Nagios host (192.168.1.155) may query this daemon.
# The pattern is anchored and its dots are escaped so that only the real
# allowed_hosts setting is rewritten.
sed -i 's#^allowed_hosts=127\.0\.0\.1#allowed_hosts=127.0.0.1,192.168.1.155#' /usr/local/nagios/etc/nrpe.cfg
## Edit hosts.cfg and add the Aliyun server's host entry.
## An absolute path is used because the previous step leaves the shell in the
## NRPE source directory, where ../objects/hosts.cfg does not exist.
vim /usr/local/nagios/etc/objects/hosts.cfg
# 1. Host definition for the new client (client01).
define host{
    use         linux-server
    host_name   client01
    alias       client01
    address     120.26.68.152
}
2、加入组
# client01 is the newly added member of this host group.
define hostgroup{
    hostgroup_name  linux-servers
    alias           Linux Servers
    members         nagios_server,client01
}
3、定义主动模式(不需要nrpe服务) 此时是主动拉取数据,所以并没有用到nrpe,也就是不使用check_nrpe,也不需要在客户端开启nrpe服务
# Create a dedicated directory for service definitions, owned by nagios.
cd /usr/local/nagios/etc/
mkdir services
chown -R nagios:nagios services
cd services
# port-client.cfg is created in the Nagios server's services directory.
# NOTE(review): nagios.cfg presumably needs
# cfg_dir=/usr/local/nagios/etc/services for this directory to be loaded -
# confirm it is set.
vim port-client.cfg
# Active port check of client01, run from the Nagios server (no NRPE involved).
# check_tcp is used to match the manual check_tcp test of the same port.
# Pass only the port number: the check_tcp command in the sample commands.cfg
# already supplies "-p $ARG1$", so "check_tcp!-p 123" would expand to
# "-p -p 123" and fail with "Port must be a positive integer".
define service{
    use                     generic-service
    host_name               client01
    service_description     ntp_port
    check_period            24x7
    check_interval          5
    retry_interval          1
    max_check_attempts      3
    check_command           check_tcp!123
    notification_period     24x7
    notification_interval   30
    notification_options    w,u,c,r
    contact_groups          admins
}
4、手动测试数据是否有收集
[root@RS1 services]# /usr/local/nagios/libexec/check_tcp -H 120.26.68.152 -p 123
CRITICAL - Socket timeout after 10 seconds
报错如上
原因:防火墙的123端口没有向外开放(由于不想开放这个端口,所以不理会这个报错,重新监控其他端口)
注意:NTP服务使用的是UDP 123端口,而check_tcp只能检测TCP端口;即使防火墙放行,也应改用check_ntp_time之类的插件来监控NTP
这里改为监控sshd服务的端口22389,手动测试结果如下:
[root@RS1 services]# /usr/local/nagios/libexec/check_tcp -H 120.26.68.152 -p 22389
TCP OK - 0.092 second response time on 120.26.68.152 port 22389|time=0.092055s;;;0.000000;10.000000
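修改配置文件的123端口为22389
检测配置语法并重启服务后,在web页面观察到如下错误:
check_tcp: Port must be a positive integer
也就是说,手动能够收集数据,但是nagios的web界面报错:check_tcp: Port must be a positive integer
原因:默认commands.cfg中check_tcp命令的定义已经带有"-p $ARG1$",如果在check_command里再写"-p 端口",实际执行时就变成"-p -p 22389";只需要传端口号即可,即 check_command check_tcp!22389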
[root@node2 ~]# /usr/local/nagios/libexec/check_tcp -H 192.168.1.11 -p 80
TCP OK - 0.001 second response time on 192.168.1.11 port 80|time=0.000810s;;;0.000000;10.000000
手动抓取数据正常,于是配置服务器端:
# client02 is already defined in the hosts file (192.168.1.11 is client02).
# The check is executed on the client through NRPE.
define service{
    use                     generic-service
    host_name               client02
    service_description     web_port
    check_command           check_nrpe!check_tcp
}
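然后重启服务,出现如下错误:
NRPE: Command 'check_tcp' not defined
原因:客户端(192.168.1.11)的nrpe.cfg中没有定义check_tcp命令。在客户端的/usr/local/nagios/etc/nrpe.cfg中添加如下一行,并重启客户端的nrpe服务:
command[check_tcp]=/usr/local/nagios/libexec/check_tcp -H 192.168.1.11 -p 80
然后在服务端进行手动测试抓取数据: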
[root@RS1 etc]# /usr/local/nagios/libexec/check_nrpe -H 192.168.1.11 -c check_tcp
TCP OK - 0.001 second response time on 192.168.1.11 port 80|time=0.000670s;;;0.000000;10.000000
成功解决
然后被动模式成功
#########对每个服务进行分组(好几台服务器监控的服务都是load负载)
新建:vim /usr/local/nagios/etc/services/servicegroup.cfg
# Service group collecting the "load" service of several hosts.
# members is a list of host,service pairs; the service part of each pair must
# equal the service_description used in that host's service definition
# ("load" here).
define servicegroup{
    servicegroup_name   load
    alias               Linux Servers
    members             nagios_server,load,client01,load,client02,load
}
将三个主机(nagios_server、client01、client02)监控的load服务定义到一个服务组中,
并修改services.cfg中的service_description如下:
vim ../objects/services.cfg
# Load check executed through NRPE.
# Only one host (nagios_server) is listed in this definition, so only one
# host is still shown for it.
# service_description is changed to "load" so that it matches the name used
# in the service group definition.
define service{
    use                     generic-service
    host_name               nagios_server
    service_description     load
    check_command           check_nrpe!check_load
}