Skip to content

LitmusChaos Cheat Sheet

Overview

LitmusChaos is a CNCF incubating project that provides a complete chaos engineering framework for Kubernetes environments. It enables SRE and DevOps teams to inject controlled faults into applications, infrastructure, and platforms to identify weaknesses before they cause production outages. LitmusChaos defines chaos experiments as Kubernetes custom resources (CRDs), so they integrate natively with GitOps workflows and CI/CD pipelines.

The framework includes a comprehensive chaos hub (ChaosHub) with pre-built experiments for pod-level faults (kill, CPU stress, memory hog), node-level disruptions (drain, taint, restart), and network chaos (latency, loss, partition). LitmusChaos 3.x introduced a centralized control plane with a web dashboard, RBAC, team management, and detailed resilience scoring that helps teams quantify their system’s fault tolerance over time.

Installation

Install LitmusChaos 3.x via Helm

# Add Litmus Helm repository
helm repo add litmuschaos https://litmuschaos.github.io/litmus-helm/
helm repo update

# Create namespace
kubectl create namespace litmus

# Install Litmus ChaosCenter (control plane) as Helm release "chaos".
# Both the server and the frontend are exposed through LoadBalancer services.
helm install chaos litmuschaos/litmus \
  --namespace litmus \
  --set portal.server.service.type=LoadBalancer \
  --set portal.frontend.service.type=LoadBalancer

# Verify installation
kubectl get pods -n litmus
kubectl get svc -n litmus

# Get ChaosCenter address.
# A load balancer publishes either `hostname` (e.g. AWS ELB) or `ip`
# (e.g. GKE, AKS, MetalLB). kubectl prints nothing for the field that is
# absent, so querying both yields whichever one is set.
kubectl get svc -n litmus chaos-litmus-frontend-service \
  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}{.status.loadBalancer.ingress[0].ip}'

# Default login: admin / litmus

Install via kubectl (Manifest)

# Install Litmus 3.x from manifests
# NOTE(review): the URL pins release 3.0.0 — confirm it matches the version you intend to run.
kubectl apply -f https://litmuschaos.github.io/litmus/3.0.0/litmus-3.0.0.yaml

# Verify CRDs are installed (lists cluster CRDs whose name contains "litmus")
kubectl get crds | grep litmus
# Expected: chaosengines, chaosexperiments, chaosresults, etc.

Install litmusctl CLI

# Install litmusctl
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the "binary"; -S still reports that error despite -s.
# chmod only runs when the download succeeded.
# NOTE(review): writing to /usr/local/bin usually requires root, and the
# asset name should be checked against the litmusctl releases page.
curl -fsSL https://github.com/litmuschaos/litmusctl/releases/latest/download/litmusctl-linux-amd64 \
  -o /usr/local/bin/litmusctl \
  && chmod +x /usr/local/bin/litmusctl

# Configure litmusctl
# The endpoint and credentials below are placeholders — substitute your own
# ChaosCenter URL and account.
# NOTE(review): a password passed as a flag may be recorded in shell history — confirm this is acceptable.
litmusctl config set-account \
  --endpoint "https://chaos.example.com" \
  --username admin \
  --password litmus

# List available projects
litmusctl get projects

# Connect a chaos delegate (agent)
# "project-id" is a placeholder; --non-interactive suppresses prompts.
litmusctl connect chaos-delegate \
  --name "production-cluster" \
  --project-id "project-id" \
  --non-interactive

Core Commands — Chaos Experiments

Pod-Level Chaos

# pod-delete.yaml — ChaosEngine that kills pods of the target application
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: pod-delete-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=nginx in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=nginx
  experiments:
    - name: pod-delete
      spec:
        components:
          # Tunables are handed to the experiment as string-valued env entries
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '60'  # presumably seconds — confirm in the experiment docs
            - name: CHAOS_INTERVAL
              value: '10'
            - name: FORCE
              value: 'false'
            - name: PODS_AFFECTED_PERC
              value: '50'
# Apply pod delete experiment
kubectl apply -f pod-delete.yaml

# Monitor experiment
# The manifest pins namespace "default", so query that namespace explicitly;
# otherwise these lookups fail when the current context uses another one.
# The ChaosResult is named <engine-name>-<experiment-name>.
kubectl get chaosengine pod-delete-chaos -n default -o jsonpath='{.status.engineStatus}'
kubectl get chaosresult pod-delete-chaos-pod-delete -n default -o jsonpath='{.status.experimentStatus.verdict}'

CPU and Memory Stress

# pod-cpu-hog.yaml — ChaosEngine that stresses CPU inside the target pods
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: cpu-hog-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=api-server in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=api-server
  experiments:
    - name: pod-cpu-hog
      spec:
        components:
          # Tunables are handed to the experiment as string-valued env entries
          env:
            - name: CPU_CORES
              value: '2'
            - name: TOTAL_CHAOS_DURATION
              value: '120'  # presumably seconds — confirm in the experiment docs
            - name: CPU_LOAD
              value: '80'  # presumably a percentage — confirm
            - name: PODS_AFFECTED_PERC
              value: '100'

---
# pod-memory-hog.yaml — ChaosEngine that stresses memory inside the target pods
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: memory-hog-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=api-server in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=api-server
  experiments:
    - name: pod-memory-hog
      spec:
        components:
          # Tunables are handed to the experiment as string-valued env entries
          env:
            - name: MEMORY_CONSUMPTION
              value: '500'  # unit not stated here — presumably MB; confirm
            - name: TOTAL_CHAOS_DURATION
              value: '120'  # presumably seconds — confirm
            - name: NUMBER_OF_WORKERS
              value: '1'

Network Chaos

# pod-network-latency.yaml — ChaosEngine that injects network latency
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: network-latency-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=frontend in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=frontend
  experiments:
    - name: pod-network-latency
      spec:
        components:
          # Tunables are handed to the experiment as string-valued env entries
          env:
            - name: NETWORK_INTERFACE
              value: 'eth0'
            - name: NETWORK_LATENCY
              value: '200'  # presumably milliseconds — confirm
            - name: TOTAL_CHAOS_DURATION
              value: '60'  # presumably seconds — confirm
            # Left empty; only a destination host is given below
            - name: DESTINATION_IPS
              value: ''
            - name: DESTINATION_HOSTS
              value: 'database.internal'
            - name: JITTER
              value: '50'

---
# pod-network-loss.yaml — ChaosEngine that injects packet loss
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: network-loss-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=frontend in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=frontend
  experiments:
    - name: pod-network-loss
      spec:
        components:
          # Tunables are handed to the experiment as string-valued env entries
          env:
            - name: NETWORK_INTERFACE
              value: 'eth0'
            - name: NETWORK_PACKET_LOSS_PERCENTAGE
              value: '30'
            - name: TOTAL_CHAOS_DURATION
              value: '60'  # presumably seconds — confirm

Configuration

Service Account and RBAC

# litmus-rbac.yaml
# Service account referenced as `chaosServiceAccount` by the chaos manifests.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: litmus-admin

---
# Cluster-wide permissions for the litmus-admin chaos service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: litmus-admin
rules:
  # Core resources the experiments create, inspect and clean up
  - apiGroups: [""]
    resources: ["pods", "pods/log", "events", "configmaps", "secrets", "services"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  # Exec into pods and evict them (eviction is what a node drain performs)
  - apiGroups: [""]
    resources: ["pods/exec", "pods/eviction"]
    verbs: ["create", "get", "list"]
  # Node-level faults such as node-drain must read and cordon nodes;
  # without this rule they fail with a permission error
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "patch", "update"]
  # Workload controllers that own the target pods
  - apiGroups: ["apps"]
    resources: ["deployments", "replicasets", "statefulsets", "daemonsets"]
    verbs: ["get", "list", "patch", "update", "delete"]
  # Litmus custom resources
  - apiGroups: ["litmuschaos.io"]
    resources: ["chaosengines", "chaosexperiments", "chaosresults"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  # Jobs
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["create", "delete", "get", "list", "watch"]

---
# Grants the litmus-admin ClusterRole to the litmus-admin service account
# that lives in namespace "default".
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: litmus-admin
roleRef:
  kind: ClusterRole
  name: litmus-admin
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    namespace: default
    name: litmus-admin

Chaos Workflow (ChaosSchedule)

# chaos-schedule.yaml — Recurring chaos experiments
# Wraps a pod-delete engine template in a repeating schedule.
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosSchedule
metadata:
  namespace: default
  # NOTE(review): the name says "weekly" but includedDays below lists
  # Mon–Fri — confirm the intended cadence.
  name: weekly-resilience-test
spec:
  schedule:
    repeat:
      # Window during which the schedule is in effect (UTC timestamps)
      timeRange:
        startTime: '2026-05-18T10:00:00Z'
        endTime: '2026-12-31T10:00:00Z'
      properties:
        # NOTE(review): newer ChaosSchedule CRDs document minChaosInterval as
        # an object (hour.everyNthHour / minute.everyNthMinute) — confirm the
        # plain string form is accepted by the installed version.
        minChaosInterval: '24h'
      workDays:
        includedDays: 'Mon,Tue,Wed,Thu,Fri'
  # Engine spec used for each scheduled run
  engineTemplateSpec:
    chaosServiceAccount: litmus-admin
    engineState: active
    # Target: deployment(s) labelled app=api-server in namespace "default"
    appinfo:
      appkind: deployment
      appns: default
      applabel: app=api-server
    experiments:
      - name: pod-delete
        spec:
          components:
            env:
              - name: TOTAL_CHAOS_DURATION
                value: '30'  # presumably seconds — confirm
              - name: CHAOS_INTERVAL
                value: '10'
              - name: FORCE
                value: 'false'

Advanced Usage

Probes (Steady-State Hypothesis)

# Experiment with probes for validation
# A pod-delete run guarded by three probes: an HTTP health check, a
# Prometheus error-rate check and a command-based data check.
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: resilience-test
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # Target: deployment(s) labelled app=api-server in namespace "default"
  appinfo:
    appkind: deployment
    appns: default
    applabel: app=api-server
  experiments:
    - name: pod-delete
      spec:
        # NOTE(review): runProperties below use bare integers; Litmus 3.x docs
        # show duration strings such as 10s — confirm the installed CRD
        # accepts integers.
        probe:
          # Probe 1: expects HTTP 200 from the service health endpoint
          - name: 'check-api-health'
            type: 'httpProbe'
            mode: 'Continuous'
            runProperties:
              probeTimeout: 10
              retry: 3
              interval: 5
              probePollingInterval: 2
            httpProbe/inputs:
              url: 'http://api-server.default.svc:8080/health'
              method:
                get:
                  criteria: '=='
                  responseCode: '200'
          # Probe 2: ratio of 5xx requests to all requests must be <= 0.05
          - name: 'check-error-rate'
            type: 'promProbe'
            mode: 'Edge'
            runProperties:
              probeTimeout: 10
              retry: 2
              interval: 5
            promProbe/inputs:
              endpoint: 'http://prometheus.monitoring:9090'
              # Kept double-quoted because the query itself contains single quotes
              query: "sum(rate(http_requests_total{status=~'5..', app='api-server'}[5m])) / sum(rate(http_requests_total{app='api-server'}[5m]))"
              comparator:
                type: 'float'
                criteria: '<='
                value: '0.05'
          # Probe 3: command output must contain "integrity_ok"
          # NOTE(review): the command needs kubectl wherever the probe runs — confirm.
          - name: 'check-data-integrity'
            type: 'cmdProbe'
            mode: 'EOT'
            runProperties:
              probeTimeout: 30
              retry: 1
            cmdProbe/inputs:
              command: 'kubectl exec deploy/api-server -- curl -s localhost:8080/data-check'
              comparator:
                type: 'string'
                criteria: 'contains'
                value: 'integrity_ok'

CI/CD Integration

#!/bin/bash
# chaos-gate.sh — Run chaos as CI/CD quality gate
#
# Applies experiments/<EXPERIMENT>.yaml, then polls the matching ChaosResult
# every 10 seconds until it reports a final verdict or TIMEOUT seconds elapse.
# Exit status: 0 on verdict "Pass"; 1 on "Fail", "Stopped" or timeout.
set -euo pipefail

EXPERIMENT="pod-delete-chaos"
FAULT="pod-delete"
NAMESPACE="default"
TIMEOUT=300

# The ChaosResult is named <engine-name>-<experiment-name>; this assumes the
# engine in the manifest is named after EXPERIMENT.
RESULT="${EXPERIMENT}-${FAULT}"

# Remove leftovers from an earlier run. Otherwise `kubectl apply` on an
# unchanged, already-completed engine is a no-op and the loop below would
# read the previous run's verdict.
kubectl delete chaosengine "$EXPERIMENT" -n "$NAMESPACE" --ignore-not-found --timeout=60s
kubectl delete chaosresult "$RESULT" -n "$NAMESPACE" --ignore-not-found --timeout=60s

echo "Applying chaos experiment: $EXPERIMENT"
kubectl apply -f "experiments/${EXPERIMENT}.yaml"

echo "Waiting for experiment to complete..."
DEADLINE=$((SECONDS + TIMEOUT))
while [ "$SECONDS" -lt "$DEADLINE" ]; do
  # The result object may not exist yet; treat lookup errors as "Waiting"
  VERDICT=$(kubectl get chaosresult "$RESULT" -n "$NAMESPACE" \
    -o jsonpath='{.status.experimentStatus.verdict}' 2>/dev/null || echo "Waiting")

  case "$VERDICT" in
    Pass)
      echo "Chaos experiment PASSED — system is resilient"
      exit 0
      ;;
    Fail)
      echo "Chaos experiment FAILED — system is not resilient"
      kubectl get chaosresult "$RESULT" -n "$NAMESPACE" -o yaml
      exit 1
      ;;
    Stopped)
      # An aborted run never reaches Pass/Fail; fail fast instead of
      # polling until the timeout.
      echo "Chaos experiment STOPPED — run was aborted before a verdict"
      kubectl get chaosresult "$RESULT" -n "$NAMESPACE" -o yaml
      exit 1
      ;;
  esac

  sleep 10
done

echo "TIMEOUT: Experiment did not complete in ${TIMEOUT}s"
exit 1

Node-Level Chaos

# node-drain.yaml — ChaosEngine that drains a node
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  namespace: default
  name: node-drain-chaos
spec:
  chaosServiceAccount: litmus-admin
  engineState: active
  # No appinfo here: the fault is aimed at a node, named via TARGET_NODE
  auxiliaryAppInfo: ''
  experiments:
    - name: node-drain
      spec:
        components:
          # Selects control-plane-01, a different node from TARGET_NODE.
          # NOTE(review): presumably keeps the experiment pod off the node
          # being drained — confirm.
          nodeSelector:
            kubernetes.io/hostname: control-plane-01
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '120'  # presumably seconds — confirm
            - name: TARGET_NODE
              value: 'worker-node-01'

Troubleshooting

| Issue | Cause | Solution |
| --- | --- | --- |
| Experiment stuck in waiting | Service account lacks permissions | Apply RBAC manifests and verify ClusterRoleBinding |
| ChaosEngine not found | CRDs not installed | Run `kubectl get crds \| grep litmus` and reinstall |
| Probe failing incorrectly | Probe timeout too short | Increase probeTimeout and retry values |
| Chaos runner pod OOMKilled | Insufficient resource limits | Increase runner pod memory limits in ChaosEngine |
| Network chaos not working | Missing NET_ADMIN capability | Ensure container runtime allows required capabilities |
| Node chaos permission denied | litmusctl not admin | Use a service account with node-level cluster-admin |
| ChaosHub sync failing | Network/firewall blocking GitHub | Use a private ChaosHub or mirror experiments locally |
| Results showing Awaited | Experiment still running | Wait or check runner pod logs: `kubectl logs -l chaosUID=<engine-uid>` |

# Debug chaos experiment
# <engine-name> and <experiment-name> are placeholders; the ChaosResult is
# named <engine-name>-<experiment-name>.
kubectl describe chaosengine <engine-name> -n default
kubectl get chaosresult <engine-name>-<experiment-name> -n default -o yaml

# Check chaos runner logs
kubectl logs -l app.kubernetes.io/component=chaos-runner -n default --tail=50

# Check experiment job logs
kubectl logs -l app.kubernetes.io/component=experiment-job -n default --tail=50

# List all chaos experiments available
kubectl get chaosexperiments

# Clean up: removes EVERY ChaosEngine in the namespace, not only failed ones
kubectl delete chaosengine --all -n default