Skip to content

Prometheus Commands

Prometheus is an open-source monitoring and alerting toolkit for recording time-series metrics.

Installation

Docker

# Run Prometheus
docker run -d -p 9090:9090 \
  -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus

# With mounted volume
docker run -d -p 9090:9090 \
  -v prometheus_storage:/prometheus \
  -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \
  prom/prometheus

Linux

# Download
wget https://github.com/prometheus/prometheus/releases/download/v2.40.0/prometheus-2.40.0.linux-amd64.tar.gz
tar xzf prometheus-2.40.0.linux-amd64.tar.gz
cd prometheus-2.40.0.linux-amd64

# Run
./prometheus --config.file=prometheus.yml

Configuration

Basic prometheus.yml

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'my-cluster'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100', 'localhost:9101']

  - job_name: 'docker'
    static_configs:
      - targets: ['localhost:9323']

  - job_name: 'mongodb'
    static_configs:
      # MongoDB does not expose Prometheus metrics itself; scrape
      # mongodb_exporter (default port 9216), not mongod's port 27017
      - targets: ['localhost:9216']

  - job_name: 'kubernetes'
    kubernetes_sd_configs:
      - role: node

Metric Types

Type      | Purpose
----------|--------------------------------------------------
Counter   | Monotonically increasing value
Gauge     | Can increase or decrease
Histogram | Distribution of values in buckets
Summary   | Similar to histogram, with client-side quantiles

PromQL Queries

Basic Queries

# Get instant value
node_cpu_seconds_total

# Get specific label
node_memory_MemFree_bytes{instance="localhost:9100"}

# All series of the metric (empty matcher)
node_memory_MemFree_bytes{}

# Regex matching (requires the =~ operator; = matches the literal string)
http_requests_total{handler=~"/api/.*"}

Range Queries

# Last 5 minutes
rate(http_requests_total[5m])

# Last hour
increase(errors_total[1h])

# Per-second rate aggregated by job
sum(rate(requests_total[5m])) by (job)

# Range query with offset
rate(http_requests_total[5m] offset 1h)

Aggregation Operators

# Sum
sum(http_requests_total)

# Count
count(up)

# Average
avg(node_memory_MemFree_bytes)

# Min/Max
min(temperature_celsius)
max(disk_usage_percent)

# Group by label
sum by (job) (http_requests_total)

# Multiple grouping
sum by (job, instance) (http_requests_total)

# without clause (exclude labels)
sum without (instance) (http_requests_total)

# Top N
topk(5, http_requests_total)

# Bottom N
bottomk(3, error_rate)

Rate Functions

# Rate (requests per second)
rate(http_requests_total[5m])

# Increase over range
increase(http_requests_total[1h])

# Number of times the value changed within the range
changes(counter_total[5m])

# Deriv (derivative)
deriv(temperature[1h])

# Instant rate (based on the last two samples; reacts faster than rate)
irate(http_requests_total[5m])

Comparison Operators

# Equality
up == 1

# Inequality
error_rate != 0

# Greater than
node_memory_MemFree_bytes > 1000000000

# Greater or equal
cpu_usage >= 80

# Range
memory_usage > 500 and memory_usage < 1000

Math Operators

# Addition
(node_memory_MemTotal_bytes - node_memory_MemFree_bytes) / 1000000

# Percentage
(errors_total / requests_total) * 100

# Division
disk_used / disk_total

# Multiplication
cpu_cores * cpu_percent

Binary Operators

# One-to-one matching
http_requests_total / http_requests_created

# Group by left side
sum by (job) (http_requests_total) / sum by (job) (http_requests_created)

# Ignoring labels
rate(http_requests_total[5m]) / ignoring (instance) http_requests_total

# On matching
rate(http_requests_total[5m]) * on(job, instance) group_left() node_up

Functions

# Absolute value
abs(-5)

# Round to the nearest multiple of the second argument (0.01 = two decimals)
round(3.14159, 0.01)

# Ceil/Floor
ceil(3.2)
floor(3.9)

# Sqrt
sqrt(16)

# Log functions
log2(8)
ln(2.718)

# Time functions (with no argument they default to the evaluation time;
# PromQL has no `now` token)
time()
minute()
hour()
day_of_week()

Web Console

# Query HTTP endpoint
curl 'http://localhost:9090/api/v1/query?query=up'

# Range query
curl 'http://localhost:9090/api/v1/query_range?query=up&start=1609459200&end=1609545600&step=60'

# Series matching
curl 'http://localhost:9090/api/v1/series?match[]=up'

# Label values
curl 'http://localhost:9090/api/v1/label/job/values'

# Targets
curl 'http://localhost:9090/api/v1/targets'

Recording Rules

groups:
  - name: compute
    interval: 30s
    rules:
      - record: instance:node_cpu:rate5m
        expr: rate(node_cpu_seconds_total[5m])

      - record: instance:node_memory_usage:ratio
        expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes) / node_memory_MemTotal_bytes

      - record: job:http_requests:rate1m
        expr: sum by (job) (rate(http_requests_total[1m]))

Alert Rules

groups:
  - name: alerts
    interval: 1m
    rules:
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes) / node_memory_MemTotal_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"

      - alert: HighErrorRate
        expr: rate(errors_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          description: "Error rate: {{ $value | humanizePercentage }}"

Alertmanager Integration

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

Example Alertmanager Config

global:
  resolve_timeout: 5m

route:
  receiver: 'default'
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 4h

receivers:
  - name: 'default'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
        channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

Common Queries

CPU Metrics

# CPU usage per instance
100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# CPU per core
rate(node_cpu_seconds_total[5m]) * on(cpu, instance) group_left() node_cpu_info

Memory Metrics

# Memory usage percent
(1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100

# Memory available
node_memory_MemAvailable_bytes / 1024 / 1024

Disk Metrics

# Disk usage percent
(1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100

# Disk read/write rate
rate(node_disk_read_bytes_total[5m])
rate(node_disk_written_bytes_total[5m])

Network Metrics

# Network in/out
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])

# Dropped inbound packets per second
rate(node_network_receive_drop_total[5m])

Service Discovery

File-based

scrape_configs:
  - job_name: 'file_discovery'
    file_sd_configs:
      - files:
          - 'targets/*.json'
        refresh_interval: 30s

Consul

scrape_configs:
  - job_name: 'consul'
    consul_sd_configs:
      - server: 'localhost:8500'

Kubernetes

scrape_configs:
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true

Retention and Storage

# Retention is NOT configured in prometheus.yml — it is set with
# command-line flags when starting the server (default: 15d):
./prometheus --config.file=prometheus.yml \
  --storage.tsdb.retention.time=30d \
  --storage.tsdb.retention.size=50GB \
  --storage.tsdb.max-block-duration=10h

Best Practices

  • Use meaningful metric names following conventions
  • Add descriptive labels for dimensions
  • Implement recording rules for expensive queries
  • Set reasonable scrape intervals (15-60s typical)
  • Use retention policies based on storage
  • Create alerts for critical thresholds
  • Monitor your Prometheus instance itself
  • Use relabel configs to filter metrics
  • Test alert rules before deployment
  • Implement proper backup strategy

Resources


Last updated: 2026-03-30|Prometheus 2.40+