Nagios
Nagios is an open-source monitoring system that tracks hosts and services, providing alerts when issues are detected.
Installation
Linux (Ubuntu/Debian)
# Install Nagios Core 4, the plugin bundle and the check_nrpe client plugin
sudo apt update
sudo apt install nagios4 nagios4-common nagios-plugins nagios-nrpe-plugin
# Enable at boot and start now
sudo systemctl enable nagios4
sudo systemctl start nagios4
# Web login user is conventionally "nagiosadmin"
# NOTE(review): a preset "nagiosadmin" password is not guaranteed by the package - confirm, and set one explicitly before exposing the UI
# Access: http://localhost/nagios4
CentOS/RHEL
# The Nagios packages live in EPEL, so enable that repository first
sudo yum install epel-release
sudo yum install nagios nagios-plugins-all nagios-plugins-nrpe nrpe
# Start the daemon now and register it to start at boot
sudo systemctl enable --now nagios
Docker
# Run Nagios Core in a container and publish its web UI on host port 80.
# NAGIOS_FQDN is the environment variable the jasonrivers/nagios image reads
# for the server name (NAGIOS_HOSTNAME is not recognised by the image).
docker run -d -p 80:80 \
  -e NAGIOS_FQDN=nagios.example.com \
  --name nagios \
  jasonrivers/nagios:latest
# Login: nagiosadmin / nagios
Configuration Files
Main Config Structure
# Primary configuration
/etc/nagios4/nagios.cfg
# Host/service definitions
/etc/nagios4/objects/
# Resource file (macros, passwords)
/etc/nagios4/resource.cfg
# Validate config
sudo nagios4 -v /etc/nagios4/nagios.cfg
nagios.cfg Key Settings
# Main configuration file
log_file=/var/log/nagios4/nagios.log
object_cache_file=/var/cache/nagios4/objects.cache
status_file=/var/lib/nagios4/status.dat
# Retention data (update interval is in minutes)
retention_update_interval=60
retention_data_file=/var/lib/nagios4/retention.dat
# External commands and check results
command_file=/var/lib/nagios4/rw/nagios.cmd
# Check result files older than this many seconds are discarded
max_check_result_file_age=3600
check_result_path=/var/lib/nagios4/spool/checkresults
# Event handlers and notification logging (program-wide switches).
# Note: event_handler_enabled is a host/service object directive; the
# nagios.cfg equivalents are enable_event_handlers and log_notifications.
enable_event_handlers=1
log_notifications=1
Host and Service Definitions
Define Hosts
# /etc/nagios4/objects/hosts.cfg
# One monitored host; unspecified settings are inherited from the linux-server template.
define host{
    use                     linux-server
    host_name               web-server-01
    alias                   Production Web Server 1
    address                 192.168.1.10
    check_command           check-host-alive
    # Declare the host DOWN (hard state) after 3 consecutive failed checks
    max_check_attempts      3
    # Intervals are in interval_length units (minutes with the default of 60 seconds)
    check_interval          5
    retry_interval          1
    notification_interval   60
    notification_period     24x7
    contacts                sysadmin
    # Custom variable, available to commands as $_HOSTLOCATION$.
    # The value is used verbatim, so it is written without quotes
    # (quotes would become part of the value).
    _location               Data Center 1
}
Define Services
# /etc/nagios4/objects/services.cfg
# CPU load check for web-server-01; defaults come from the local-service template.
define service{
    use                     local-service
    host_name               web-server-01
    service_description     CPU Load
    # Arguments after "!" are passed to the command as $ARG1$ (warning) and $ARG2$ (critical)
    check_command           check_local_load!5.0,4.0!10.0,6.0
    check_interval          5
    retry_interval          1
    max_check_attempts      3
}
# HTTP availability check for web-server-01, run every 5 intervals.
define service{
    use                     local-service
    host_name               web-server-01
    service_description     HTTP
    check_command           check_http
    check_interval          5
}
# Free-space check for the root filesystem of web-server-01.
define service{
    use                     local-service
    host_name               web-server-01
    service_description     Disk Usage
    # $ARG1$ = warning, $ARG2$ = critical, $ARG3$ = path.
    # The "%" matters: check_disk treats a bare number as megabytes free,
    # so "20" / "10" would only alert with less than 20 MB / 10 MB left.
    check_command           check_local_disk!20%!10%!/
}
Host Groups
# Groups the web server hosts so they can be viewed and referenced together.
define hostgroup{
    hostgroup_name  web-servers
    alias           Web Servers
    # Comma-separated host_name values
    members         web-server-01,web-server-02,web-server-03
}
# Groups related services across hosts.
define servicegroup{
    servicegroup_name   web-services
    alias               Web Services
    # Members are listed as host,service pairs: host1,svc1,host2,svc2,...
    members             web-server-01,HTTP,web-server-01,HTTPS
}
Check Commands
Local Checks (executed on Nagios server)
# Disk check run on the Nagios server itself.
# $USER1$ is the plugin directory macro from resource.cfg;
# $ARG1$ = warning threshold, $ARG2$ = critical threshold, $ARG3$ = path.
define command{
    command_name    check_local_disk
    command_line    $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}
# CPU check run on the Nagios server; $ARG1$ = warning, $ARG2$ = critical.
# NOTE(review): check_cpu does not appear to be part of the standard plugin bundle - confirm it is installed under $USER1$
define command{
command_name check_local_cpu
command_line $USER1$/check_cpu -w $ARG1$ -c $ARG2$
}
# Memory check run on the Nagios server; $ARG1$ = warning, $ARG2$ = critical.
# NOTE(review): check_memory does not appear to be part of the standard plugin bundle - confirm it is installed under $USER1$
define command{
command_name check_local_memory
command_line $USER1$/check_memory -w $ARG1$ -c $ARG2$
}
Remote Checks (NRPE - Nagios Remote Plugin Executor)
# Asks the NRPE daemon on the monitored host to run one of its configured commands.
# $ARG1$ is the command name as declared in the remote nrpe.cfg (command[<name>]=...).
define command{
    command_name    check_nrpe
    command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
# Usage in service
# (abbreviated: a complete definition also names a host and, typically, a template)
define service{
    service_description     Remote Disk
    # "check_disk" is handed to check_nrpe as $ARG1$
    check_command           check_nrpe!check_disk
}
Notifications
Define Contacts
# Primary administrator contact, notified around the clock by e-mail.
define contact{
    contact_name                    sysadmin
    alias                           System Administrator
    email                           sysadmin@example.com
    host_notification_period        24x7
    service_notification_period     24x7
    # d = down, r = recovery, u = unreachable
    host_notification_options       d,r,u
    # w = warning, u = unknown, c = critical, r = recovery
    service_notification_options    w,u,c,r
    # Notification commands are mandatory for a contact; config verification
    # fails without them. These two are defined in the stock sample commands.cfg.
    host_notification_commands      notify-host-by-email
    service_notification_commands   notify-service-by-email
    # Allow this contact to submit external commands from the web UI
    can_submit_commands             1
}
# On-call engineer contact.
define contact{
    contact_name                    oncall
    alias                           On-Call Engineer
    email                           oncall@example.com
    pager                           +1-555-0123
    # Notification periods and options are mandatory for a contact unless
    # inherited from a template, so they are stated explicitly here.
    host_notification_period        24x7
    service_notification_period     24x7
    # d = down, r = recovery, u = unreachable
    host_notification_options       d,r,u
    # w = warning, u = unknown, c = critical, r = recovery
    service_notification_options    w,u,c,r
    host_notification_commands      notify-by-email
    service_notification_commands   notify-by-email
}
Notification Commands
# E-mail notification: printf builds the message body from notification macros
# and pipes it to mail, addressed to the contact's e-mail ($CONTACTEMAIL$).
# The body and subject use $SERVICE...$ macros, so this is written for service notifications.
# NOTE(review): the mail binary path differs between distributions (/bin/mail vs /usr/bin/mail) - confirm
define command{
command_name notify-by-email
command_line /usr/bin/printf "%b" "***** Nagios Alert *****\n\nService: $SERVICEDESC$\nHost: $HOSTNAME$\nState: $SERVICESTATE$\nTime: $DATETIME$\n\nDetails:\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
# Slack notification: passes notification type, host, service, state and plugin
# output as five positional arguments to a site-local script.
# NOTE(review): /usr/local/bin/send-slack.sh is not shown here - it must exist and be executable by the Nagios user
define command{
command_name notify-by-slack
command_line /usr/local/bin/send-slack.sh "$NOTIFICATIONTYPE$" "$HOSTNAME$" "$SERVICEDESC$" "$SERVICESTATE$" "$SERVICEOUTPUT$"
}
Monitoring Common Services
Web Services
# HTTP check against a site protected by Basic authentication.
# $ARG1$ = user name, $ARG2$ = password, $ARG3$ = TCP port.
define command{
    command_name    check_http_basic_auth
    # -a takes the credentials as user:password.
    # (-u is the URL path to request, so it must not carry the credentials.)
    command_line    $USER1$/check_http -H $HOSTADDRESS$ -a $ARG1$:$ARG2$ -p $ARG3$
}
# Daily TLS certificate expiry check.
# (abbreviated: a complete definition also names a host and, typically, a template)
define service{
    service_description     HTTPS Certificate
    # -S connects over TLS; -C 30,14 warns when the certificate has fewer than
    # 30 days left and goes critical below 14 days. Assumes the check_http
    # command definition forwards $ARG1$ to the plugin.
    check_command           check_http!-S -C 30,14
    # 1440 minutes = once a day (with the default interval_length of 60 seconds).
    # Object files only allow "#" comments at the start of a line; a trailing
    # "# ..." would become part of the value.
    check_interval          1440
}
Database Monitoring
# MySQL connectivity check against the monitored host.
# $ARG1$ = database user, $ARG2$ = password.
define command{
    command_name    check_mysql
    command_line    $USER1$/check_mysql -H $HOSTADDRESS$ -u $ARG1$ -p $ARG2$
}
# MySQL check every 5 intervals; "nagios" and "password" are passed as $ARG1$ / $ARG2$.
# NOTE(review): the password is stored in plain text in the object file - consider moving it to a $USERn$ macro in resource.cfg
define service{
service_description MySQL
check_command check_mysql!nagios!password
check_interval 5
}
Log File Monitoring
# Log file check; $ARG1$ and $ARG2$ are supplied by the service definition.
# NOTE(review): check_logfiles is a third-party plugin; confirm that "-f <host>:<path>" and "-s" match its actual options
# NOTE(review): a plugin executed on the Nagios server cannot read a file on $HOSTADDRESS$ directly - this probably needs to run via NRPE
define command{
command_name check_logfiles
command_line $USER1$/check_logfiles -f $HOSTADDRESS$:$ARG1$ -s $ARG2$
}
# Watches the application log for errors.
# (abbreviated: a complete definition also names a host and, typically, a template)
define service{
    service_description     Application Errors
    # The two "!"-separated paths reach the command as $ARG1$ and $ARG2$
    check_command           check_logfiles!/var/log/app.log!/var/lib/nagios/searches/errors.log
}
NRPE Configuration (Remote Monitoring)
On Remote Host
# Install NRPE
sudo apt install nagios-nrpe-server nagios-plugins
# Configure /etc/nagios/nrpe.cfg
# allowed_hosts is the Nagios server IP. Keep comments on their own line:
# nrpe.cfg has no trailing-comment syntax, so text after the value would be
# read as part of the address list.
allowed_hosts=192.168.1.5
# Thresholds need "%" to mean percent free; bare numbers are megabytes
command[check_disk]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
command[check_load]=/usr/lib/nagios/plugins/check_load -w 5.0,4.0 -c 10.0,6.0
# NOTE(review): check_memory does not appear to be in the standard plugin bundle - confirm it is installed
command[check_memory]=/usr/lib/nagios/plugins/check_memory -w 80 -c 90
# Restart so the edited configuration is loaded (a plain "start" does nothing
# if the package already started the daemon), and enable it at boot
sudo systemctl restart nagios-nrpe-server
sudo systemctl enable nagios-nrpe-server
# Test from Nagios server
/usr/lib/nagios/plugins/check_nrpe -H remote-host -c check_disk
Monitoring Web UI
Access Nagios Web Interface
# URL: http://localhost/nagios4/
# Web login user: nagiosadmin
# The password is whatever was set at install time; there is no guaranteed
# default, so set one explicitly.
# Set or change the admin password. The Debian/Ubuntu nagios4 package uses
# Apache digest authentication (realm "Nagios4", user file htdigest.users);
# add -c only when creating the file for the first time.
sudo htdigest /etc/nagios4/htdigest.users Nagios4 nagiosadmin
# Key Sections:
# - Tactical Overview: Status summary
# - Current Status: All hosts/services
# - Service Problems: Alert listing
# - Hosts: Individual host details
Command-Line Tools
Check Status
# Check Nagios is running
systemctl status nagios4
# Follow the service log
tail -f /var/log/nagios4/nagios.log
# Check for errors (-i because Nagios logs them as "Error: ...", which a
# lower-case-only match would miss)
grep -i error /var/log/nagios4/nagios.log
# Verify configuration (run with sudo so root-only files such as resource.cfg can be read)
sudo nagios4 -v /etc/nagios4/nagios.cfg
Submit Commands
# External commands are written to the Nagios command pipe, one per line:
#   [<unix time>] COMMAND;arg;arg;...
cmd_file=/var/lib/nagios4/rw/nagios.cmd
# Take the timestamp once so the entry time and the start time cannot straddle a second boundary
now=$(date +%s)
# Schedule downtime for host: fixed window (1), no trigger (0), starting now for one hour
# SCHEDULE_HOST_DOWNTIME;<host>;<start>;<end>;<fixed>;<trigger_id>;<duration>;<author>;<comment>
echo "[$now] SCHEDULE_HOST_DOWNTIME;web-server-01;$now;$((now + 3600));1;0;0;nagios;Maintenance Window" >> "$cmd_file"
# Acknowledge problem
# ACKNOWLEDGE_SVC_PROBLEM;<host>;<service>;<sticky>;<notify>;<persistent>;<author>;<comment>
echo "[$now] ACKNOWLEDGE_SVC_PROBLEM;web-server-01;CPU Load;1;0;1;nagios;Acknowledged" >> "$cmd_file"
# Force service check
# SCHEDULE_FORCED_SVC_CHECK;<host>;<service>;<check_time>
echo "[$now] SCHEDULE_FORCED_SVC_CHECK;web-server-01;HTTP;$now" >> "$cmd_file"
Advanced Features
Custom Event Handlers
# Event handler: logs in to the monitored host over SSH as "nagios" and restarts httpd via sudo.
# NOTE(review): as written this runs on every state change it is invoked for, including recoveries - consider a wrapper script that checks $SERVICESTATE$ / $SERVICESTATETYPE$ first
# NOTE(review): presumably requires key-based SSH and a passwordless sudo rule for the nagios user on the target - confirm
define command{
command_name restart_http
command_line ssh -l nagios $HOSTADDRESS$ '/usr/bin/sudo /bin/systemctl restart httpd'
}
# Attaches the restart_http event handler to the HTTP service.
# (abbreviated: a complete definition also names a host and a check command)
define service{
    service_description     HTTP
    event_handler           restart_http
    # Per-service switch for running the handler
    event_handler_enabled   1
}
Performance Graphing (PNP4Nagios)
# Install PNP4Nagios
sudo apt install pnp4nagios
# Enable performance data processing in nagios.cfg
process_performance_data=1
service_perfdata_command=process-service-perfdata
# Access graphs: http://localhost/pnp4nagios
Best Practices
- Monitor critical systems: databases, web servers, file servers
- Set appropriate check intervals (5-60 min based on importance)
- Use NRPE for remote monitoring instead of SSH
- Group related services into servicegroups
- Configure escalation for unacknowledged problems
- Schedule maintenance windows to prevent false alerts
- Regularly review check thresholds (don’t over-alert)
- Archive logs regularly
- Monitor Nagios itself (meta-monitoring)
- Use templates to reduce configuration duplication