Upright is an open-source synthetic monitoring platform that continuously tests your application’s health and availability from multiple geographic locations. It combines HTTP health checks with Playwright-based browser automation to validate both API endpoints and complex user workflows. It is well suited to SREs and DevOps teams monitoring critical services.
# Pull the Upright image
docker pull upright:latest
# Run with basic configuration.
# The config bind mount uses an absolute host path: Docker releases older
# than 23.0 reject relative host paths such as ./config in -v/--volume.
docker run -d \
  --name upright \
  -p 3000:3000 \
  -v "$(pwd)/config:/app/config" \
  -e UPRIGHT_ENVIRONMENT=production \
  upright:latest
# Verify installation (-f makes curl exit non-zero on an HTTP error status,
# so a failing health endpoint is not mistaken for success)
curl -f http://localhost:3000/health
# Create docker-compose.yml
# Two services: the Upright container and a PostgreSQL 15 database whose
# data directory lives on the named volume "postgres_data".
version: "3.8"

services:
  upright:
    image: upright:latest
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      UPRIGHT_ENVIRONMENT: production
      UPRIGHT_LOG_LEVEL: info
    volumes:
      - ./config:/app/config
      - ./data:/app/data

  postgres:
    image: postgres:15
    environment:
      POSTGRES_PASSWORD: upright_password
    volumes:
      - postgres_data:/var/lib/postgresql/data

volumes:
  postgres_data:
# Start all services in the background
docker-compose up --detach
# Deployment: two replicas of the Upright container, with the
# "upright-config" ConfigMap mounted at /app/config.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: upright
spec:
  replicas: 2
  selector:
    matchLabels:
      app: upright
  template:
    metadata:
      labels:
        app: upright
    spec:
      volumes:
        - name: config
          configMap:
            name: upright-config
      containers:
        - name: upright
          image: upright:latest
          ports:
            - containerPort: 3000
          volumeMounts:
            - name: config
              mountPath: /app/config
          resources:
            requests:
              cpu: "250m"
              memory: "256Mi"
            limits:
              cpu: "500m"
              memory: "512Mi"
---
# Service: routes port 3000 to pods labelled app=upright through a
# LoadBalancer.
apiVersion: v1
kind: Service
metadata:
  name: upright-service
spec:
  type: LoadBalancer
  selector:
    app: upright
  ports:
    - port: 3000
      targetPort: 3000
# config.yaml
# Base settings: application identity, database connection and probe defaults.
application:
  name: "MyApp Monitoring"
  environment: production

database:
  type: postgres
  host: localhost
  port: 5432
  database: upright_db

probes:
  frequency: 60s
  timeout: 30s
  retries: 2
  regions:
    - us-east
    - us-west
    - eu-west
    - ap-southeast
# HTTP check: GET the API health endpoint from two regions with a bearer
# token and a 200 expected status.
checks:
  - name: api-health
    type: http
    method: GET
    url: https://api.example.com/health
    headers:
      Authorization: Bearer YOUR_TOKEN
    expectedStatus: 200
    interval: 60s
    timeout: 10s
    regions:
      - us-east
      - eu-west
# HTTP check with response-body assertions and SSL certificate settings.
checks:
  - name: user-service
    type: http
    method: GET
    url: https://api.example.com/users
    interval: 120s
    timeout: 15s
    # Validate response body
    assertions:
      # $.status must equal "healthy"
      - type: json_path
        path: $.status
        value: "healthy"
      # $.responseTime is compared with the lessThan operator against 500
      - type: json_path
        path: $.responseTime
        operator: lessThan
        value: 500
      # Regex validation
      - type: regex
        pattern: '"version":"[\d.]+"'
    # SSL certificate validation
    checkSSL: true
    minTTL: 30  # days
# HTTP check: POST a JSON payload and assert on the JSON response.
checks:
  - name: api-endpoint-test
    type: http
    method: POST
    url: https://api.example.com/validate
    interval: 300s
    headers:
      Content-Type: application/json
    # Request payload, sent as a literal JSON document
    body: |
      {
        "email": "test@example.com",
        "action": "validate"
      }
    expectedStatus: 200
    assertions:
      - type: json_path
        path: $.valid
        value: true
# Region catalogue: each region has a location label, a provider and a
# weight (the four weights below sum to 1.0).
regions:
  us-east:
    provider: aws
    location: "N. Virginia"
    weight: 0.4
  us-west:
    provider: aws
    location: "Oregon"
    weight: 0.3
  eu-west:
    provider: aws
    location: "Ireland"
    weight: 0.2
  ap-southeast:
    provider: aws
    location: "Singapore"
    weight: 0.1

# Checks choose which regions they run from and carry their own weight.
checks:
  - name: critical-api
    url: https://api.example.com
    weight: 1.0
    regions:
      - us-east
      - us-west
      - eu-west
      - ap-southeast
  - name: optional-api
    url: https://optional.example.com
    weight: 0.5
    regions:
      - us-east
      - eu-west
# Status of the active probes
curl "http://localhost:3000/api/probes/status"
# Per-region statistics
curl "http://localhost:3000/api/probes/regions"
# Probe latency metrics
curl "http://localhost:3000/api/metrics/probe-latency"
# Browser check: load the homepage in Chromium and log its title; the log
# assertion below looks for the "Page loaded" line.
checks:
  - name: homepage-load
    type: playwright
    interval: 300s
    timeout: 60s
    regions:
      - us-east
      - eu-west
    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();
        await page.goto('https://example.com', {
          waitUntil: 'networkidle'
        });
        const title = await page.title();
        console.log('Page loaded:', title);
      } finally {
        // Always close the browser, even when navigation fails, so a failed
        // run does not leave a Chromium process behind.
        await browser.close();
      }
    assertions:
      - type: log
        pattern: "Page loaded"
# Browser check: walk the full purchase flow (product page -> cart ->
# checkout -> payment -> order confirmation) in Chromium.
checks:
  - name: checkout-flow
    type: playwright
    interval: 600s
    timeout: 90s
    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();
        // Navigate to product page
        await page.goto('https://shop.example.com/products/item-123');
        await page.waitForSelector('.add-to-cart');
        // Add to cart
        await page.click('.add-to-cart');
        await page.waitForSelector('.cart-count');
        // Verify cart updated
        const cartCount = await page.locator('.cart-count').textContent();
        if (cartCount !== '1') throw new Error('Cart count incorrect');
        // Proceed to checkout. The navigation wait is registered before the
        // click: waiting only after the click can miss a navigation that has
        // already finished and then hang until the timeout.
        await Promise.all([
          page.waitForNavigation(),
          page.click('[data-testid="checkout-btn"]'),
        ]);
        // Fill payment form. The expiry year is derived from the current date
        // so the test card never becomes an expired card.
        const expiryYear = String((new Date().getFullYear() + 2) % 100).padStart(2, '0');
        await page.fill('input[name="cardNumber"]', '4111111111111111');
        await page.fill('input[name="expiry"]', `12/${expiryYear}`);
        await page.fill('input[name="cvc"]', '123');
        // Submit order
        await page.click('[data-testid="submit-order"]');
        await page.waitForSelector('.order-confirmation');
        const confirmText = await page.locator('.order-confirmation').textContent();
        console.log('Order confirmed:', confirmText);
      } finally {
        // Close the browser on success and on failure alike.
        await browser.close();
      }
# Browser check: take a homepage screenshot and register it as an artifact
# compared against a baseline.
checks:
  - name: homepage-visual
    type: playwright
    interval: 3600s
    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();
        await page.goto('https://example.com');
        await page.screenshot({ path: 'homepage.png' });
      } finally {
        // Close the browser even if navigation or the screenshot fails.
        await browser.close();
      }
    artifacts:
      - type: screenshot
        path: homepage.png
        compare: baseline  # Visual regression detection
# Slack notifications: one incoming webhook, a channel per severity, and a
# message template filled in from check/status/region/error placeholders.
notifications:
  slack:
    enabled: true
    webhook_url: https://hooks.slack.com/services/YOUR/WEBHOOK/URL
    channels:
      critical: "#critical-alerts"
      warning: "#upright-alerts"
    message_template: |
      Check: {{ check.name }}
      Status: {{ status }}
      Region: {{ region }}
      Message: {{ error }}
      Dashboard: https://upright.example.com/checks/{{ check.id }}
# PagerDuty notifications: API token, escalation policy ID, and a mapping
# from severity names to severity values.
notifications:
  pagerduty:
    enabled: true
    api_token: YOUR_PAGERDUTY_TOKEN
    escalation_policy: "P123ABC"
    severity_mapping:
      info: "info"
      warning: "warning"
      critical: "critical"
# Email notifications: SMTP connection settings, sender address, and
# recipient lists per severity.
notifications:
  email:
    enabled: true
    smtp_host: smtp.example.com
    smtp_port: 587
    smtp_user: alerts@example.com
    smtp_password: YOUR_PASSWORD
    from_address: upright-alerts@example.com
    recipients:
      critical:
        - "oncall@example.com"
        - "devops@example.com"
      warning:
        - "devops@example.com"
# Generic webhook notifications: each endpoint lists its URL, retry
# settings, request headers, and the events it subscribes to.
notifications:
  webhook:
    enabled: true
    endpoints:
      - name: custom-handler
        method: POST
        url: https://webhook.example.com/upright
        headers:
          Authorization: Bearer YOUR_TOKEN
          X-Custom-Header: upright-check
        retry_count: 3
        retry_interval: 30s
        events:
          - check_failed
          - check_recovered
          - slo_breached
# Default URL
http://localhost:3000
# Login with admin credentials
# User: admin
# Password: (set via UPRIGHT_ADMIN_PASSWORD env var)
| View | Purpose | Key Metrics |
|---|---|---|
| Overview | System health summary | Up/down status, incident count |
| Checks | Individual check details | Last run, latency, assertions |
| Regions | Geographic performance | Success rate per region, latency |
| Incidents | Alert history | Timestamp, duration, resolution |
| Analytics | Trends over time | Uptime %, response times, failures |
# Dashboard definition: an ordered list of widgets, each with its own type
# and optional time range / check / region selection.
dashboards:
  - name: sre-overview
    title: "SRE Monitoring Dashboard"
    widgets:
      - type: status_grid
        checks:
          - "api-health"
          - "user-service"
          - "checkout-flow"
      - type: uptime_chart
        checks: all
        timerange: 30d
      - type: latency_graph
        timerange: 7d
        regions:
          - us-east
          - eu-west
      - type: incident_timeline
        timerange: 90d
# SLO definition: a monthly availability target over two checks, with an
# error-budget warning threshold.
slos:
  - name: api-availability
    description: "API endpoint availability"
    target: 99.9  # percentage
    period: monthly
    checks:
      - api-health
      - user-service
    error_budget:
      enabled: true
      warning_threshold: 10  # % of remaining budget
# Get SLO status
curl http://localhost:3000/api/slos/status
# Get current error budget
curl http://localhost:3000/api/slos/api-availability/budget
# Monthly SLO report. The URL is quoted because "?" is a shell glob
# character; unquoted, zsh aborts with "no matches found".
curl "http://localhost:3000/api/reports/slo-monthly?month=2026-04"
# Scheduled SLO report: recipients, schedule, and the sections to include.
reporting:
  slo_reports:
    enabled: true
    frequency: monthly
    send_date: "1st of month"
    recipients:
      - stakeholders@example.com
    include:
      - slo_summary
      - error_budget_status
      - major_incidents
      - recommendations
# One HTTP check covering several REST endpoints; every endpoint lists its
# own method, URL and expected status.
checks:
  - name: rest-api-endpoints
    type: http
    interval: 120s
    endpoints:
      - url: https://api.example.com/v1/users
        method: GET
        expectedStatus: 200
      - url: https://api.example.com/v1/products
        method: GET
        expectedStatus: 200
      - url: https://api.example.com/v1/health
        method: GET
        expectedStatus: 200
      - url: https://api.example.com/v1/auth/validate
        method: POST
        body: '{"token":"test"}'
        expectedStatus: 200
# GraphQL check: POST a query as JSON and assert on the returned user id.
checks:
  - name: graphql-api
    type: http
    method: POST
    url: https://api.example.com/graphql
    headers:
      Content-Type: application/json
    # The GraphQL query travels inside a JSON request body
    body: |
      {
        "query": "{ user(id: \"123\") { id name email } }"
      }
    assertions:
      - type: json_path
        path: $.data.user.id
        value: "123"
# SSL check over three domains with Slack and email notifications enabled.
checks:
  - name: ssl-monitoring
    type: ssl
    interval: 86400s  # Daily
    alert_threshold: 30  # days before expiry
    domains:
      - example.com
      - api.example.com
      - www.example.com
    notifications:
      slack: true
      email: true
# Check SSL certificate status
curl http://localhost:3000/api/ssl/status
# Get certificate details
curl http://localhost:3000/api/ssl/certificate/example.com
# Export upcoming expiries. The URL is quoted because "?" is a shell glob
# character; unquoted, zsh aborts with "no matches found".
curl "http://localhost:3000/api/ssl/expiring?days=30"
# Custom JavaScript check: fetch metrics from the API and fail when the
# average latency exceeds 1000 ms.
checks:
  - name: custom-business-logic
    type: custom
    language: javascript
    interval: 600s
    script: |
      const fetch = require('node-fetch');
      async function check() {
        const response = await fetch('https://api.example.com/data');
        // Fail with a clear message on an HTTP error instead of trying to
        // parse an error page as JSON.
        if (!response.ok) {
          throw new Error(`Unexpected HTTP status: ${response.status}`);
        }
        const data = await response.json();
        // Custom business logic
        const metrics = Array.isArray(data.metrics) ? data.metrics : [];
        // An empty list has no average; without this guard reduce() would
        // throw an unrelated TypeError (or yield NaN once given a seed).
        if (metrics.length === 0) {
          throw new Error('No metrics returned');
        }
        const avgLatency = metrics.reduce((a, b) => a + b, 0) / metrics.length;
        if (avgLatency > 1000) {
          throw new Error(`High latency: ${avgLatency}ms`);
        }
        return { success: true, latency: avgLatency };
      }
      check();
# Custom Python check: count active users in PostgreSQL and fail when there
# are fewer than 100.
checks:
  - name: database-check
    type: custom
    language: python
    interval: 300s
    script: |
      import json
      import os

      import psycopg2

      # The password is read from the UPRIGHT_DB_PASSWORD environment
      # variable instead of being stored in this file.
      # NOTE(review): assumes the check runner exposes that variable - confirm.
      conn = psycopg2.connect(
          host="db.example.com",
          database="production",
          user="upright_user",
          password=os.environ["UPRIGHT_DB_PASSWORD"],
      )
      try:
          cursor = conn.cursor()
          cursor.execute("SELECT COUNT(*) FROM users WHERE active = true")
          count = cursor.fetchone()[0]
          if count < 100:
              raise Exception(f"Low active users: {count}")
          print(json.dumps({"success": True, "active_users": count}))
      finally:
          # Release the connection even when the check fails.
          conn.close()
# Create a PagerDuty service named "Upright Monitoring" that references
# escalation policy P123ABC
curl --request POST https://api.pagerduty.com/services \
  --header "Authorization: Token token=YOUR_TOKEN" \
  --header "Content-Type: application/json" \
  --data '{
    "service": {
      "type": "service",
      "name": "Upright Monitoring",
      "escalation_policy": {
        "type": "escalation_policy_reference",
        "id": "P123ABC"
      }
    }
  }'
# Opsgenie notifications: API key, region, and a mapping from severity
# names to P1-P3 values.
notifications:
  opsgenie:
    enabled: true
    region: us
    api_key: YOUR_OPSGENIE_API_KEY
    mapping:
      info: P3
      warning: P2
      critical: P1
# ServiceNow notifications: instance name, API credentials, and the field
# values listed under incident_mapping.
notifications:
  servicenow:
    enabled: true
    instance: dev12345
    api_user: upright_api
    api_password: YOUR_PASSWORD
    incident_mapping:
      impact: high
      urgency: high
      assignment_group: "Platform Team"
| Metric | Description | Unit |
|---|---|---|
| Availability | Percentage of successful checks | % |
| Latency | Response time from probe to endpoint | ms |
| Error Rate | Failed checks over total checks | % |
| MTBF | Mean time between failures | hours |
| MTTR | Mean time to recovery | minutes |
| Throughput | Requests per minute | req/min |
# URLs that carry a query string are quoted: "?" is a shell glob character
# and, unquoted, makes zsh abort with "no matches found".
# Get metrics for specific check
curl "http://localhost:3000/api/metrics/checks/api-health?period=7d"
# Get regional latency metrics
curl "http://localhost:3000/api/metrics/regions/us-east?metric=latency"
# Get uptime statistics
curl "http://localhost:3000/api/metrics/uptime?checks=api-health,user-service"
# Export metrics (Prometheus format)
curl http://localhost:3000/metrics
# Automated reports: a weekly PDF summary and a monthly HTML SLO report.
reporting:
  automated_reports:
    - name: weekly_summary
      format: pdf
      frequency: weekly
      day: monday
      time: "09:00"
      recipients:
        - team@example.com
      sections:
        - uptime_summary
        - incident_list
        - top_failures
        - performance_trends
    - name: monthly_slo
      format: html
      frequency: monthly
      day: 1
      time: "08:00"
      recipients:
        - stakeholders@example.com
| Feature | Upright | Uptime Robot |
|---|---|---|
| Open Source | Yes | No |
| Self-Hosted | Yes | Cloud only |
| Browser Automation | Yes (Playwright) | Limited |
| Custom Scripts | Yes | No |
| Multi-Region | Yes | Yes |
| Incident Management | Built-in | Via integrations |
| Cost | Free (self-hosted) | $4.99/month |
| Feature | Upright | Checkly |
|---|---|---|
| Open Source | Yes | No |
| Playwright Support | Yes | Yes |
| Visual Regression | Yes | Yes |
| Scheduled Checks | Yes | Yes |
| Team Collaboration | Community | Premium |
| Pricing Model | Free (self-hosted) | SaaS subscription |
| Feature | Upright | Datadog Synthetics |
|---|---|---|
| Learning Curve | Moderate | Steep |
| Open Source | Yes | No |
| Custom Metrics | Yes | Yes |
| Multi-Region | Yes | Yes |
| Integration Ecosystem | Limited | Extensive |
| Cost | Free (self-hosted) | $0.10+ per test run |
- Set intervals based on criticality: vital services every 60s, less critical every 300s
- Use multiple regions for customer-facing services
- Include timeouts appropriate for expected response times
- Add context-specific headers (API keys, auth tokens) to requests
- Test checks locally before deploying to production
- Avoid alert fatigue with proper thresholds and deduplication
- Use escalation policies for critical services
- Implement on-call rotation integration
- Include runbooks in alert messages
- Review and tune alert rules monthly
- Start with 99% availability, adjust based on business requirements
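- Calculate error budget: (1 − target / 100) × total seconds per month, with the target given as a percentage (e.g. a 99.9% target over a 30-day month allows 0.001 × 2,592,000 s = 2,592 s ≈ 43 minutes of downtime)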
- Reserve error budget for planned maintenance
- Track and report SLO trends monthly
- Use SLO breaches to drive reliability improvements
- Cache DNS lookups when checking multiple related endpoints
- Use connection pooling for database checks
- Monitor probe resource usage to avoid overload
- Implement result caching for expensive checks
- Adjust timeout values based on historical data
# Test regional connectivity
curl -v http://localhost:3000/api/probes/us-east/test
# Check region-specific logs. The URL is quoted because "?" is a shell glob
# character; unquoted, zsh aborts with "no matches found".
curl "http://localhost:3000/api/logs/regions/us-east?limit=100"
# Adjust retry configuration
checks:
  - name: flaky-endpoint
    retries: 3
    retry_delay: 5s
    # Trigger alert after 2 consecutive failures
    consecutive_failures: 2
# Watch live resource usage of the "upright" container
docker container stats upright
# If usage is too high: reduce check frequency or the number of regions,
# and optimize custom scripts to reduce heap usage