Salta ai contenuti

Upright

Upright is an open-source synthetic monitoring platform that continuously tests your application’s health and availability from multiple geographic locations. It combines HTTP health checks with Playwright-based browser automation to validate both API endpoints and complex user workflows. Perfect for SREs and DevOps teams monitoring critical services.

# Pull the Upright image
docker pull upright:latest

# Run with basic configuration.
# The bind-mount source is an absolute path: older Docker CLI versions do not
# accept a relative host path such as ./config in -v.
docker run -d \
  --name upright \
  -p 3000:3000 \
  -v "$(pwd)/config:/app/config" \
  -e UPRIGHT_ENVIRONMENT=production \
  upright:latest

# Verify installation (--fail makes curl exit non-zero on an HTTP error status)
curl --fail http://localhost:3000/health
# Create docker-compose.yml
# The top-level `version` key is obsolete in the Compose Specification and only
# triggers a warning in Docker Compose v2, so it is omitted.
services:
  upright:
    image: upright:latest
    ports:
      - "3000:3000"
    volumes:
      - ./config:/app/config
      - ./data:/app/data
    environment:
      UPRIGHT_ENVIRONMENT: production
      UPRIGHT_LOG_LEVEL: info
    # Start the database container before the application container.
    depends_on:
      - postgres
    restart: unless-stopped

  postgres:
    image: postgres:15
    environment:
      # Taken from the shell or an .env file; falls back to the sample value.
      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:-upright_password}"
    volumes:
      - postgres_data:/var/lib/postgresql/data

volumes:
  postgres_data:

# Start services (Compose v2 plugin; `docker-compose` is the legacy v1 binary)
docker compose up -d
# Deployment: two replicas of the Upright container, configured from the
# `upright-config` ConfigMap mounted at /app/config.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: upright
spec:
  replicas: 2
  selector:
    matchLabels:
      app: upright
  template:
    metadata:
      labels:
        app: upright
    spec:
      containers:
        - name: upright
          image: upright:latest
          ports:
            - containerPort: 3000
          # Probe the application's /health endpoint so the Service only
          # routes to ready pods and unresponsive pods are restarted.
          readinessProbe:
            httpGet:
              path: /health
              port: 3000
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 20
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          volumeMounts:
            - name: config
              mountPath: /app/config
      volumes:
        - name: config
          configMap:
            name: upright-config
---
# Service: exposes port 3000 of the pods labelled app=upright via a LoadBalancer.
apiVersion: v1
kind: Service
metadata:
  name: upright-service
spec:
  selector:
    app: upright
  ports:
    - port: 3000
      targetPort: 3000
  type: LoadBalancer
# config.yaml
# Application identity
application:
  name: 'MyApp Monitoring'
  environment: production

# Database connection
database:
  type: postgres
  host: localhost
  port: 5432
  database: upright_db

# Probe settings: region list, frequency, timeout and retry count
probes:
  regions: [us-east, us-west, eu-west, ap-southeast]
  frequency: '60s'
  timeout: '30s'
  retries: 2
checks:
  # HTTP GET against the API health URL, expecting status 200
  - name: api-health
    type: http
    method: GET
    url: 'https://api.example.com/health'
    headers:
      Authorization: 'Bearer YOUR_TOKEN'
    expectedStatus: 200
    interval: '60s'
    timeout: '10s'
    regions: [us-east, eu-west]
checks:
  - name: user-service
    type: http
    method: GET
    url: 'https://api.example.com/users'
    interval: '120s'
    timeout: '15s'

    # SSL certificate validation
    checkSSL: true
    minTTL: 30  # days

    # Response body validation
    assertions:
      # JSONPath checks
      - type: json_path
        path: '$.status'
        value: healthy
      - type: json_path
        path: '$.responseTime'
        operator: lessThan
        value: 500
      # Regex validation
      - type: regex
        pattern: '"version":"[\d.]+"'
checks:
  - name: api-endpoint-test
    type: http
    method: POST
    url: 'https://api.example.com/validate'
    interval: '300s'
    expectedStatus: 200

    headers:
      Content-Type: 'application/json'

    # JSON request body, kept verbatim as a literal block scalar
    body: |
      {
        "email": "test@example.com",
        "action": "validate"
      }

    assertions:
      - type: json_path
        path: '$.valid'
        value: true
# Region definitions; the four weights add up to 1.0
regions:
  us-east:
    provider: aws
    location: 'N. Virginia'
    weight: 0.4

  us-west:
    provider: aws
    location: 'Oregon'
    weight: 0.3

  eu-west:
    provider: aws
    location: 'Ireland'
    weight: 0.2

  ap-southeast:
    provider: aws
    location: 'Singapore'
    weight: 0.1
checks:
  # Listed for four regions, weight 1.0
  - name: critical-api
    url: 'https://api.example.com'
    weight: 1.0
    regions: [us-east, us-west, eu-west, ap-southeast]

  # Listed for two regions, weight 0.5
  - name: optional-api
    url: 'https://optional.example.com'
    weight: 0.5
    regions: [us-east, eu-west]
# Base URL of the local Upright instance
upright_url=http://localhost:3000

# Check active probes
curl "${upright_url}/api/probes/status"

# Get region statistics
curl "${upright_url}/api/probes/regions"

# View probe latency
curl "${upright_url}/api/metrics/probe-latency"
checks:
  # Loads the homepage in Chromium and logs its title; the `log` assertion
  # below matches on that log line.
  - name: homepage-load
    type: playwright
    interval: 300s
    regions:
      - us-east
      - eu-west

    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();

        await page.goto('https://example.com', {
          waitUntil: 'networkidle'
        });

        const title = await page.title();
        console.log('Page loaded:', title);
      } finally {
        // Always release the browser, even when navigation fails or times out.
        await browser.close();
      }

    timeout: 60s
    assertions:
      - type: log
        pattern: "Page loaded"
checks:
  # End-to-end purchase: product page -> cart -> checkout -> payment -> confirmation.
  - name: checkout-flow
    type: playwright
    interval: 600s

    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();

        // Navigate to product page
        await page.goto('https://shop.example.com/products/item-123');
        await page.waitForSelector('.add-to-cart');

        // Add to cart
        await page.click('.add-to-cart');
        await page.waitForSelector('.cart-count');

        // Verify cart updated (trimmed: the element text may carry whitespace)
        const cartCount = await page.locator('.cart-count').textContent();
        if ((cartCount ?? '').trim() !== '1') throw new Error('Cart count incorrect');

        // Proceed to checkout. Start waiting for the navigation before the
        // click so a fast navigation cannot finish before the wait begins.
        await Promise.all([
          page.waitForNavigation(),
          page.click('[data-testid="checkout-btn"]'),
        ]);

        // Fill payment form. The expiry year is derived from the current date
        // so the test card never goes out of date.
        const expiryYear = String((new Date().getFullYear() + 1) % 100).padStart(2, '0');
        await page.fill('input[name="cardNumber"]', '4111111111111111');
        await page.fill('input[name="expiry"]', `12/${expiryYear}`);
        await page.fill('input[name="cvc"]', '123');

        // Submit order
        await page.click('[data-testid="submit-order"]');
        await page.waitForSelector('.order-confirmation');

        const confirmText = await page.locator('.order-confirmation').textContent();
        console.log('Order confirmed:', confirmText);
      } finally {
        // Always release the browser, even when a step above throws.
        await browser.close();
      }

    timeout: 90s
checks:
  # Captures a homepage screenshot and registers it as an artifact that is
  # compared against a baseline.
  - name: homepage-visual
    type: playwright
    interval: 3600s

    script: |
      const { chromium } = require('playwright');
      const browser = await chromium.launch();
      try {
        const page = await browser.newPage();

        await page.goto('https://example.com');
        await page.screenshot({ path: 'homepage.png' });
      } finally {
        // Always release the browser, even when the page fails to load.
        await browser.close();
      }

    artifacts:
      - type: screenshot
        path: homepage.png
        compare: baseline  # Visual regression detection
notifications:
  slack:
    enabled: true
    webhook_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'

    # Channel per severity (quoted because a leading # would start a comment)
    channels:
      critical: '#critical-alerts'
      warning: '#upright-alerts'

    # Message body; a literal block scalar, so line breaks are preserved
    message_template: |
      Check: {{ check.name }}
      Status: {{ status }}
      Region: {{ region }}
      Message: {{ error }}
      Dashboard: https://upright.example.com/checks/{{ check.id }}
notifications:
  pagerduty:
    enabled: true
    api_token: YOUR_PAGERDUTY_TOKEN
    escalation_policy: P123ABC

    # Severity name on the left mapped to the value on the right
    severity_mapping:
      critical: critical
      warning: warning
      info: info
notifications:
  email:
    enabled: true
    from_address: upright-alerts@example.com

    # SMTP connection
    smtp_host: smtp.example.com
    smtp_port: 587
    smtp_user: alerts@example.com
    smtp_password: YOUR_PASSWORD

    # Recipient lists keyed by severity
    recipients:
      critical:
        - oncall@example.com
        - devops@example.com
      warning:
        - devops@example.com
notifications:
  webhook:
    enabled: true

    endpoints:
      - name: custom-handler
        method: POST
        url: 'https://webhook.example.com/upright'
        headers:
          Authorization: 'Bearer YOUR_TOKEN'
          X-Custom-Header: upright-check

        # Retry settings
        retry_count: 3
        retry_interval: '30s'

        # Event names configured for this endpoint
        events: [check_failed, check_recovered, slo_breached]
# Default URL
http://localhost:3000

# Login with admin credentials
# User: admin
# Password: (set via UPRIGHT_ADMIN_PASSWORD env var)
| View | Purpose | Key Metrics |
|---|---|---|
| Overview | System health summary | Up/down status, incident count |
| Checks | Individual check details | Last run, latency, assertions |
| Regions | Geographic performance | Success rate per region, latency |
| Incidents | Alert history | Timestamp, duration, resolution |
| Analytics | Trends over time | Uptime %, response times, failures |
dashboards:
  - name: sre-overview
    title: 'SRE Monitoring Dashboard'

    # One entry per widget; `type` selects the widget kind
    widgets:
      - type: status_grid
        checks:
          - api-health
          - user-service
          - checkout-flow

      - type: uptime_chart
        checks: all
        timerange: '30d'

      - type: latency_graph
        timerange: '7d'
        regions: [us-east, eu-west]

      - type: incident_timeline
        timerange: '90d'
slos:
  - name: api-availability
    description: 'API endpoint availability'
    target: 99.9  # percentage
    period: monthly

    # Checks that feed this SLO
    checks: [api-health, user-service]

    error_budget:
      enabled: true
      warning_threshold: 10  # % of remaining budget
# Get SLO status
curl http://localhost:3000/api/slos/status

# Get current error budget
curl http://localhost:3000/api/slos/api-availability/budget

# Monthly SLO report
# (URL quoted: an unquoted ? is a glob character and fails in shells such as zsh)
curl 'http://localhost:3000/api/reports/slo-monthly?month=2026-04'
reporting:
  slo_reports:
    enabled: true

    # Schedule
    frequency: monthly
    send_date: '1st of month'

    recipients:
      - stakeholders@example.com

    # Sections included in the report
    include: [slo_summary, error_budget_status, major_incidents, recommendations]
checks:
  - name: rest-api-endpoints
    type: http
    interval: '120s'

    # One entry per request, each with its own expected status
    endpoints:
      - url: 'https://api.example.com/v1/users'
        method: GET
        expectedStatus: 200

      - url: 'https://api.example.com/v1/products'
        method: GET
        expectedStatus: 200

      - url: 'https://api.example.com/v1/health'
        method: GET
        expectedStatus: 200

      - url: 'https://api.example.com/v1/auth/validate'
        method: POST
        body: '{"token":"test"}'
        expectedStatus: 200
checks:
  - name: graphql-api
    type: http
    method: POST
    url: 'https://api.example.com/graphql'

    headers:
      Content-Type: 'application/json'

    # GraphQL query wrapped in a JSON document, kept verbatim
    body: |
      {
        "query": "{ user(id: \"123\") { id name email } }"
      }

    # The expected id is quoted so it stays a string rather than an integer
    assertions:
      - type: json_path
        path: '$.data.user.id'
        value: '123'
checks:
  - name: ssl-monitoring
    type: ssl
    interval: '86400s'  # Daily
    alert_threshold: 30  # days before expiry

    domains: [example.com, api.example.com, www.example.com]

    # Notification channels switched on for this check
    notifications:
      email: true
      slack: true
# Check SSL certificate status
curl http://localhost:3000/api/ssl/status

# Get certificate details
curl http://localhost:3000/api/ssl/certificate/example.com

# Export upcoming expiries
# (URL quoted: an unquoted ? is a glob character and fails in shells such as zsh)
curl 'http://localhost:3000/api/ssl/expiring?days=30'
checks:
  # Fetches a metrics document and fails when the average of `metrics`
  # exceeds 1000.
  - name: custom-business-logic
    type: custom
    language: javascript
    interval: 600s

    script: |
      const fetch = require('node-fetch');

      async function check() {
        const response = await fetch('https://api.example.com/data');
        if (!response.ok) {
          throw new Error(`Unexpected HTTP status: ${response.status}`);
        }
        const data = await response.json();

        // Custom business logic
        const metrics = Array.isArray(data.metrics) ? data.metrics : [];
        if (metrics.length === 0) {
          // reduce() without an initial value throws on an empty array, and
          // an average over zero samples is meaningless: fail explicitly.
          throw new Error('No latency metrics returned');
        }
        const avgLatency = metrics.reduce((a, b) => a + b, 0) / metrics.length;

        if (avgLatency > 1000) {
          throw new Error(`High latency: ${avgLatency}ms`);
        }

        return { success: true, latency: avgLatency };
      }

      // NOTE(review): the returned promise is not awaited here; confirm how
      // the runner picks up the result or a rejection.
      check();
checks:
  # Counts active users in PostgreSQL and fails when fewer than 100 are found.
  - name: database-check
    type: custom
    language: python
    interval: 300s

    script: |
      import json
      import os

      import psycopg2

      # The password is read from the UPRIGHT_DB_PASSWORD environment variable
      # rather than being embedded in the check definition.
      conn = psycopg2.connect(
        host="db.example.com",
        database="production",
        user="upright_user",
        password=os.environ["UPRIGHT_DB_PASSWORD"]
      )

      try:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM users WHERE active = true")
        count = cursor.fetchone()[0]

        if count < 100:
          raise Exception(f"Low active users: {count}")

        print(json.dumps({"success": True, "active_users": count}))
      finally:
        # Close the connection even when the query or the threshold check fails.
        conn.close()
# Create PagerDuty service (long-form curl options)
curl --request POST https://api.pagerduty.com/services \
  --header "Authorization: Token token=YOUR_TOKEN" \
  --header "Content-Type: application/json" \
  --data '{
    "service": {
      "type": "service",
      "name": "Upright Monitoring",
      "escalation_policy": {
        "type": "escalation_policy_reference",
        "id": "P123ABC"
      }
    }
  }'
notifications:
  opsgenie:
    enabled: true
    region: us
    api_key: YOUR_OPSGENIE_API_KEY

    # Severity name on the left mapped to the priority on the right
    mapping:
      critical: 'P1'
      warning: 'P2'
      info: 'P3'
notifications:
  servicenow:
    enabled: true

    # Instance and API credentials
    instance: dev12345
    api_user: upright_api
    api_password: YOUR_PASSWORD

    # Field values set under incident_mapping
    incident_mapping:
      assignment_group: 'Platform Team'
      impact: high
      urgency: high
| Metric | Description | Unit |
|---|---|---|
| Availability | Percentage of successful checks | % |
| Latency | Response time from probe to endpoint | ms |
| Error Rate | Failed checks over total checks | % |
| MTBF | Mean time between failures | hours |
| MTTR | Mean time to recovery | minutes |
| Throughput | Requests per minute | req/min |
# URLs with a query string are quoted: an unquoted ? is a glob character and
# fails in shells such as zsh.

# Get metrics for specific check
curl 'http://localhost:3000/api/metrics/checks/api-health?period=7d'

# Get regional latency metrics
curl 'http://localhost:3000/api/metrics/regions/us-east?metric=latency'

# Get uptime statistics
curl 'http://localhost:3000/api/metrics/uptime?checks=api-health,user-service'

# Export metrics (Prometheus format)
curl http://localhost:3000/metrics
reporting:
  automated_reports:
    # Weekly PDF report
    - name: weekly_summary
      format: pdf
      frequency: weekly
      day: monday
      time: '09:00'
      recipients: [team@example.com]
      sections:
        - uptime_summary
        - incident_list
        - top_failures
        - performance_trends

    # Monthly HTML report
    - name: monthly_slo
      format: html
      frequency: monthly
      day: 1
      time: '08:00'
      recipients: [stakeholders@example.com]
| Feature | Upright | Uptime Robot |
|---|---|---|
| Open Source | Yes | No |
| Self-Hosted | Yes | Cloud only |
| Browser Automation | Yes (Playwright) | Limited |
| Custom Scripts | Yes | No |
| Multi-Region | Yes | Yes |
| Incident Management | Built-in | Via integrations |
| Cost | Free (self-hosted) | $4.99/month |
| Feature | Upright | Checkly |
|---|---|---|
| Open Source | Yes | No |
| Playwright Support | Yes | Yes |
| Visual Regression | Yes | Yes |
| Scheduled Checks | Yes | Yes |
| Team Collaboration | Community | Premium |
| Pricing Model | Free (self-hosted) | SaaS subscription |
| Feature | Upright | Datadog Synthetics |
|---|---|---|
| Learning Curve | Moderate | Steep |
| Open Source | Yes | No |
| Custom Metrics | Yes | Yes |
| Multi-Region | Yes | Yes |
| Integration Ecosystem | Limited | Extensive |
| Cost | Free (self-hosted) | $0.10+ per test run |
  • Set intervals based on criticality: vital services every 60s, less critical every 300s
  • Use multiple regions for customer-facing services
  • Include timeouts appropriate for expected response times
  • Add context-specific headers (API keys, auth tokens) to requests
  • Test checks locally before deploying to production
  • Avoid alert fatigue with proper thresholds and deduplication
  • Use escalation policies for critical services
  • Implement on-call rotation integration
  • Include runbooks in alert messages
  • Review and tune alert rules monthly
  • Start with 99% availability, adjust based on business requirements
  • Calculate error budget: (1 - target) × total seconds per month
  • Reserve error budget for planned maintenance
  • Track and report SLO trends monthly
  • Use SLO breaches to drive reliability improvements
  • Cache DNS lookups when checking multiple related endpoints
  • Use connection pooling for database checks
  • Monitor probe resource usage to avoid overload
  • Implement result caching for expensive checks
  • Adjust timeout values based on historical data
# Test regional connectivity
curl -v http://localhost:3000/api/probes/us-east/test

# Check region-specific logs
# (URL quoted: an unquoted ? is a glob character and fails in shells such as zsh)
curl 'http://localhost:3000/api/logs/regions/us-east?limit=100'
# Adjust retry configuration
checks:
  - name: flaky-endpoint
    consecutive_failures: 2  # Trigger alert after 2 consecutive failures
    retries: 3
    retry_delay: '5s'
# Monitor Upright process (resource usage of the container named "upright")
docker stats upright

# If usage is too high:
# Reduce check frequency or number of regions
# Optimize custom scripts to reduce heap usage