Nagios

Nagios is an open-source monitoring system that tracks hosts and services, providing alerts when issues are detected.

Installation

Linux (Ubuntu/Debian)

# Install Nagios Core
sudo apt update
sudo apt install nagios4 nagios4-common nagios-plugins nagios-nrpe-plugin

# Enable and start
sudo systemctl enable nagios4
sudo systemctl start nagios4

# Default credentials: nagiosadmin / nagiosadmin
# Access: http://localhost/nagios4

CentOS/RHEL

# Install from EPEL
sudo yum install epel-release
sudo yum install nagios nagios-plugins-all nagios-plugins-nrpe nrpe

# Start service
sudo systemctl start nagios
sudo systemctl enable nagios

Docker

docker run -d -p 80:80 \
  -e NAGIOS_HOSTNAME=nagios.example.com \
  --name nagios \
  jasonrivers/nagios:latest

# Login: nagiosadmin / nagios

Configuration Files

Main Config Structure

# Primary configuration
/etc/nagios4/nagios.cfg

# Host/service definitions
/etc/nagios4/objects/

# Resource file (macros, passwords)
/etc/nagios4/resource.cfg

# Validate config
sudo nagios4 -v /etc/nagios4/nagios.cfg

nagios.cfg Key Settings

# Main configuration file
log_file=/var/log/nagios4/nagios.log
object_cache_file=/var/cache/nagios4/objects.cache
status_file=/var/lib/nagios4/status.dat

# Retention data
retention_update_interval=60
retention_data_file=/var/lib/nagios4/retention.dat

# Commands
command_file=/var/lib/nagios4/rw/nagios.cmd
max_check_result_age=3600
check_result_path=/var/lib/nagios4/spool/checkresults

# Event handlers
enable_event_handlers=1
log_notifications=1

Host and Service Definitions

Define Hosts

# /etc/nagios4/objects/hosts.cfg
define host{
  use                     linux-server
  host_name               web-server-01
  alias                   Production Web Server 1
  address                 192.168.1.10
  check_command           check-host-alive
  max_check_attempts      3
  check_interval          5
  retry_interval          1
  notification_interval   60
  notification_period     24x7
  contacts                sysadmin
  _location               "Data Center 1"
}

Define Services

# /etc/nagios4/objects/services.cfg
define service{
  use                     local-service
  host_name               web-server-01
  service_description     CPU Load
  check_command           check_local_load!5.0,4.0!10.0,6.0
  check_interval          5
  retry_interval          1
  max_check_attempts      3
}

define service{
  use                     local-service
  host_name               web-server-01
  service_description     HTTP
  check_command           check_http
  check_interval          5
}

define service{
  use                     local-service
  host_name               web-server-01
  service_description     Disk Usage
  check_command           check_local_disk!20%!10%!/
}

Host and Service Groups

define hostgroup{
  hostgroup_name          web-servers
  alias                   Web Servers
  members                 web-server-01,web-server-02,web-server-03
}

define servicegroup{
  servicegroup_name       web-services
  alias                   Web Services
  members                 web-server-01,HTTP,web-server-01,HTTPS
}

Check Commands

Local Checks (executed on Nagios server)

define command{
  command_name    check_local_disk
  command_line    $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}

define command{
  command_name    check_local_cpu
  command_line    $USER1$/check_cpu -w $ARG1$ -c $ARG2$
}

define command{
  command_name    check_local_memory
  command_line    $USER1$/check_memory -w $ARG1$ -c $ARG2$
}

Remote Checks (NRPE - Nagios Remote Plugin Executor)

define command{
  command_name    check_nrpe
  command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

# Usage in service
define service{
  service_description     Remote Disk
  check_command           check_nrpe!check_disk
}

Notifications

Define Contacts

define contact{
  contact_name            sysadmin
  alias                   System Administrator
  email                   sysadmin@example.com
  host_notification_period 24x7
  service_notification_period 24x7
  host_notification_options d,r,u
  service_notification_options w,u,c,r
  can_submit_commands     1
}

define contact{
  contact_name            oncall
  alias                   On-Call Engineer
  email                   oncall@example.com
  pager                   +1-555-0123
  host_notification_commands notify-by-email
  service_notification_commands notify-by-email
}

Notification Commands

define command{
  command_name    notify-by-email
  command_line    /usr/bin/printf "%b" "***** Nagios Alert *****\n\nService: $SERVICEDESC$\nHost: $HOSTNAME$\nState: $SERVICESTATE$\nTime: $DATETIME$\n\nDetails:\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}

define command{
  command_name    notify-by-slack
  command_line    /usr/local/bin/send-slack.sh "$NOTIFICATIONTYPE$" "$HOSTNAME$" "$SERVICEDESC$" "$SERVICESTATE$" "$SERVICEOUTPUT$"
}

Monitoring Common Services

Web Services

define command{
  command_name    check_http_basic_auth
  command_line    $USER1$/check_http -H $HOSTADDRESS$ -a $ARG1$:$ARG2$ -p $ARG3$
}

define service{
  service_description     HTTPS Certificate
  check_command           check_http!-S
  check_interval          1440  ; Daily
}

Database Monitoring

define command{
  command_name    check_mysql
  command_line    $USER1$/check_mysql -H $HOSTADDRESS$ -u $ARG1$ -p $ARG2$
}

define service{
  service_description     MySQL
  check_command           check_mysql!nagios!password
  check_interval          5
}

Log File Monitoring

define command{
  command_name    check_logfiles
  command_line    $USER1$/check_logfiles -f $HOSTADDRESS$:$ARG1$ -s $ARG2$
}

define service{
  service_description     Application Errors
  check_command           check_logfiles!/var/log/app.log!/var/lib/nagios/searches/errors.log
}

NRPE Configuration (Remote Monitoring)

On Remote Host

# Install NRPE
sudo apt install nagios-nrpe-server nagios-plugins

# Configure /etc/nagios/nrpe.cfg
# Nagios server IP (nrpe.cfg does not support trailing comments on a setting line)
allowed_hosts=192.168.1.5
command[check_disk]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
command[check_load]=/usr/lib/nagios/plugins/check_load -w 5.0,4.0 -c 10.0,6.0
command[check_memory]=/usr/lib/nagios/plugins/check_memory -w 80 -c 90

# Start service
sudo systemctl start nagios-nrpe-server
sudo systemctl enable nagios-nrpe-server

# Test from Nagios server
/usr/lib/nagios/plugins/check_nrpe -H remote-host -c check_disk

Monitoring Web UI

Access Nagios Web Interface

# URL: http://localhost/nagios4/

# Default credentials
Username: nagiosadmin
Password: nagiosadmin

# Change admin password
sudo htpasswd /etc/nagios4/passwd nagiosadmin

# Key Sections:
# - Tactical Overview: Status summary
# - Current Status: All hosts/services
# - Service Problems: Alert listing
# - Hosts: Individual host details

Command-Line Tools

Check Status

# Check Nagios is running
systemctl status nagios4

# Service logs
tail -f /var/log/nagios4/nagios.log

# Check for errors
grep error /var/log/nagios4/nagios.log

# Verify configuration
nagios4 -v /etc/nagios4/nagios.cfg

Submit Commands

# Commands go to command pipe
# Schedule downtime for host
echo "[$(date +%s)] SCHEDULE_HOST_DOWNTIME;web-server-01;$(date +%s);$(($(date +%s) + 3600));1;0;0;nagios;Maintenance Window" >> /var/lib/nagios4/rw/nagios.cmd

# Acknowledge problem
echo "[$(date +%s)] ACKNOWLEDGE_SVC_PROBLEM;web-server-01;CPU Load;1;0;1;nagios;Acknowledged" >> /var/lib/nagios4/rw/nagios.cmd

# Force service check
echo "[$(date +%s)] SCHEDULE_FORCED_SVC_CHECK;web-server-01;HTTP;$(date +%s)" >> /var/lib/nagios4/rw/nagios.cmd

Advanced Features

Custom Event Handlers

define command{
  command_name    restart_http
  command_line    ssh -l nagios $HOSTADDRESS$ '/usr/bin/sudo /bin/systemctl restart httpd'
}

define service{
  service_description     HTTP
  event_handler           restart_http
  event_handler_enabled   1
}

Performance Graphing (PNP4Nagios)

# Install PNP4Nagios
sudo apt install pnp4nagios

# Enable service performance data in nagios.cfg
process_performance_data=1
service_perfdata_command=process-service-perfdata

# Access graphs: http://localhost/pnp4nagios

Best Practices

Monitor critical systems: databases, web servers, file servers
Set appropriate check intervals (5-60 min based on importance)
Use NRPE for remote monitoring instead of SSH
Group related services into servicegroups
Configure escalation for unacknowledged problems
Schedule maintenance windows to prevent false alerts
Regularly review check thresholds (don’t over-alert)
Archive logs regularly
Monitor Nagios itself (meta-monitoring)
Use templates to reduce configuration duplication