Skip to content

Nagios

Nagios is an open-source monitoring system that tracks hosts and services, providing alerts when issues are detected.

Installation

Linux (Ubuntu/Debian)

# Install Nagios Core
sudo apt update
sudo apt install nagios4 nagios4-common nagios-plugins nagios-nrpe-plugin

# Enable and start
sudo systemctl enable nagios4
sudo systemctl start nagios4

# Default credentials: nagiosadmin / nagiosadmin
# Access: http://localhost/nagios4

CentOS/RHEL

# Install from EPEL
sudo yum install epel-release
sudo yum install nagios nagios-plugins-all nagios-plugins-nrpe nrpe

# Start service
sudo systemctl start nagios
sudo systemctl enable nagios

Docker

docker run -d -p 80:80 \
  -e NAGIOS_HOSTNAME=nagios.example.com \
  --name nagios \
  jasonrivers/nagios:latest

# Login: nagiosadmin / nagios

Configuration Files

Main Config Structure

# Primary configuration
/etc/nagios4/nagios.cfg

# Host/service definitions
/etc/nagios4/objects/

# Resource file (macros, passwords)
/etc/nagios4/resource.cfg

# Validate config
sudo nagios4 -v /etc/nagios4/nagios.cfg

nagios.cfg Key Settings

# Main configuration file
log_file=/var/log/nagios4/nagios.log
object_cache_file=/var/cache/nagios4/objects.cache
status_file=/var/lib/nagios4/status.dat

# Retention data
retention_update_interval=60
retention_data_file=/var/lib/nagios4/retention.dat

# Commands
command_file=/var/lib/nagios4/rw/nagios.cmd
max_check_result_age=3600
check_result_path=/var/lib/nagios4/spool/checkresults

# Event handlers
enable_event_handlers=1
log_notifications=1

Host and Service Definitions

Define Hosts

# /etc/nagios4/objects/hosts.cfg
define host{
  use                     linux-server
  host_name               web-server-01
  alias                   Production Web Server 1
  address                 192.168.1.10
  check_command           check-host-alive
  max_check_attempts      3
  check_interval          5
  retry_interval          1
  notification_interval   60
  notification_period     24x7
  contacts                sysadmin
  _location               "Data Center 1"
}

Define Services

# /etc/nagios4/objects/services.cfg
define service{
  use                     local-service
  host_name               web-server-01
  service_description     CPU Load
  check_command           check_local_load!5.0,4.0!10.0,6.0
  check_interval          5
  retry_interval          1
  max_check_attempts      3
}

define service{
  use                     local-service
  host_name               web-server-01
  service_description     HTTP
  check_command           check_http
  check_interval          5
}

define service{
  use                     local-service
  host_name               web-server-01
  service_description     Disk Usage
  check_command           check_local_disk!20%!10%!/
}

Host Groups

define hostgroup{
  hostgroup_name          web-servers
  alias                   Web Servers
  members                 web-server-01,web-server-02,web-server-03
}

define servicegroup{
  servicegroup_name       web-services
  alias                   Web Services
  members                 web-server-01,HTTP,web-server-01,HTTPS
}

Check Commands

Local Checks (executed on Nagios server)

define command{
  command_name    check_local_disk
  command_line    $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}

define command{
  command_name    check_local_cpu
  command_line    $USER1$/check_cpu -w $ARG1$ -c $ARG2$
}

define command{
  command_name    check_local_memory
  command_line    $USER1$/check_memory -w $ARG1$ -c $ARG2$
}

Remote Checks (NRPE - Nagios Remote Plugin Executor)

define command{
  command_name    check_nrpe
  command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}

# Usage in service
define service{
  service_description     Remote Disk
  check_command           check_nrpe!check_disk
}

Notifications

Define Contacts

define contact{
  contact_name            sysadmin
  alias                   System Administrator
  email                   sysadmin@example.com
  host_notification_period 24x7
  service_notification_period 24x7
  host_notification_options d,r,u
  service_notification_options w,u,c,r
  can_submit_commands     1
}

define contact{
  contact_name            oncall
  alias                   On-Call Engineer
  email                   oncall@example.com
  pager                   +1-555-0123
  host_notification_commands notify-by-email
  service_notification_commands notify-by-email
}

Notification Commands

define command{
  command_name    notify-by-email
  command_line    /usr/bin/printf "%b" "***** Nagios Alert *****\n\nService: $SERVICEDESC$\nHost: $HOSTNAME$\nState: $SERVICESTATE$\nTime: $DATETIME$\n\nDetails:\n$SERVICEOUTPUT$" | /bin/mail -s "** $NOTIFICATIONTYPE$ alert - $HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}

define command{
  command_name    notify-by-slack
  command_line    /usr/local/bin/send-slack.sh "$NOTIFICATIONTYPE$" "$HOSTNAME$" "$SERVICEDESC$" "$SERVICESTATE$" "$SERVICEOUTPUT$"
}

Monitoring Common Services

Web Services

define command{
  command_name    check_http_basic_auth
  command_line    $USER1$/check_http -H $HOSTADDRESS$ -a $ARG1$:$ARG2$ -p $ARG3$
}

define service{
  service_description     HTTPS Certificate
  check_command           check_http!-S -C 30
  check_interval          1440  ; Daily
}

Database Monitoring

define command{
  command_name    check_mysql
  command_line    $USER1$/check_mysql -H $HOSTADDRESS$ -u $ARG1$ -p $ARG2$
}

define service{
  service_description     MySQL
  check_command           check_mysql!nagios!password
  check_interval          5
}

Log File Monitoring

define command{
  command_name    check_logfiles
  command_line    $USER1$/check_logfiles -f $HOSTADDRESS$:$ARG1$ -s $ARG2$
}

define service{
  service_description     Application Errors
  check_command           check_logfiles!/var/log/app.log!/var/lib/nagios/searches/errors.log
}

NRPE Configuration (Remote Monitoring)

On Remote Host

# Install NRPE
sudo apt install nagios-nrpe-server nagios-plugins

# Configure /etc/nagios/nrpe.cfg
# Nagios server IP (keep comments on their own line in nrpe.cfg)
allowed_hosts=192.168.1.5
command[check_disk]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /
command[check_load]=/usr/lib/nagios/plugins/check_load -w 5.0,4.0 -c 10.0,6.0
command[check_memory]=/usr/lib/nagios/plugins/check_memory -w 80 -c 90

# Start service
sudo systemctl start nagios-nrpe-server
sudo systemctl enable nagios-nrpe-server

# Test from Nagios server
/usr/lib/nagios/plugins/check_nrpe -H remote-host -c check_disk

Monitoring Web UI

Access Nagios Web Interface

# URL: http://localhost/nagios4/

# Default credentials
Username: nagiosadmin
Password: nagiosadmin

# Change admin password
sudo htpasswd /etc/nagios4/passwd nagiosadmin

# Key Sections:
# - Tactical Overview: Status summary
# - Current Status: All hosts/services
# - Service Problems: Alert listing
# - Hosts: Individual host details

Command-Line Tools

Check Status

# Check Nagios is running
systemctl status nagios4

# Service logs
tail -f /var/log/nagios4/nagios.log

# Check for errors
grep -i error /var/log/nagios4/nagios.log

# Verify configuration
sudo nagios4 -v /etc/nagios4/nagios.cfg

Submit Commands

# Commands go to command pipe
# Schedule downtime for host
echo "[$(date +%s)] SCHEDULE_HOST_DOWNTIME;web-server-01;$(date +%s);$(($(date +%s) + 3600));1;0;0;nagios;Maintenance Window" >> /var/lib/nagios4/rw/nagios.cmd

# Acknowledge problem
echo "[$(date +%s)] ACKNOWLEDGE_SVC_PROBLEM;web-server-01;CPU Load;1;0;1;nagios;Acknowledged" >> /var/lib/nagios4/rw/nagios.cmd

# Force service check
echo "[$(date +%s)] SCHEDULE_FORCED_SVC_CHECK;web-server-01;HTTP;$(date +%s)" >> /var/lib/nagios4/rw/nagios.cmd

Advanced Features

Custom Event Handlers

define command{
  command_name    restart_http
  command_line    ssh -l nagios $HOSTADDRESS$ '/usr/bin/sudo /bin/systemctl restart httpd'
}

define service{
  service_description     HTTP
  event_handler           restart_http
  event_handler_enabled   1
}

Performance Graphing (PNP4Nagios)

# Install PNP4Nagios
sudo apt install pnp4nagios

# Enable service with performance data
# Enable performance data in nagios.cfg
process_performance_data=1
service_perfdata_command=process-service-perfdata

# Access graphs: http://localhost/pnp4nagios

Best Practices

  • Monitor critical systems: databases, web servers, file servers
  • Set appropriate check intervals (5-60 min based on importance)
  • Use NRPE for remote monitoring instead of SSH
  • Group related services into servicegroups
  • Configure escalation for unacknowledged problems
  • Schedule maintenance windows to prevent false alerts
  • Regularly review check thresholds (don’t over-alert)
  • Archive logs regularly
  • Monitor Nagios itself (meta-monitoring)
  • Use templates to reduce configuration duplication