ClickHouse Cheat Sheet

Overview

ClickHouse is an open-source column-oriented database management system that allows generating analytical data reports in real-time using SQL queries. Developed by Yandex for their web analytics service (processing over 20 billion events per day), ClickHouse is designed to process analytical queries over billions of rows and terabytes of data in seconds. Its columnar storage format and vectorized query execution engine make it one of the fastest analytical databases available.

ClickHouse supports a rich SQL dialect with extensions for analytics, real-time data ingestion at millions of rows per second, data compression ratios of 3-10x, and distributed query processing across clusters. It is used by companies like Uber, eBay, Cloudflare, and Bloomberg for log analytics, time-series data, business intelligence, and real-time dashboards.

Installation

Docker (Recommended)

# Quick start
docker run -d --name clickhouse-server \
  -p 8123:8123 -p 9000:9000 \
  -v clickhouse-data:/var/lib/clickhouse \
  clickhouse/clickhouse-server

# Connect with client
docker exec -it clickhouse-server clickhouse-client

Linux (APT)

# Add repository
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list

# Install
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client

# Start service
sudo service clickhouse-server start

# Connect
clickhouse-client

macOS

brew install clickhouse
clickhouse server &
clickhouse client

Core SQL

Create Database and Tables

CREATE DATABASE IF NOT EXISTS analytics;

-- MergeTree engine (primary engine for analytics)
CREATE TABLE analytics.events (
    event_id UUID DEFAULT generateUUIDv4(),
    event_type String,
    user_id UInt64,
    timestamp DateTime64(3) DEFAULT now64(3),
    properties String,  -- JSON as string
    country LowCardinality(String),
    device LowCardinality(String),
    revenue Decimal64(2) DEFAULT 0,
    date Date DEFAULT toDate(timestamp)
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (event_type, user_id, timestamp)
TTL date + INTERVAL 90 DAY
SETTINGS index_granularity = 8192;

-- ReplacingMergeTree (deduplication)
CREATE TABLE analytics.users (
    user_id UInt64,
    name String,
    email String,
    created_at DateTime DEFAULT now(),
    updated_at DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(updated_at)
ORDER BY user_id;

-- AggregatingMergeTree (pre-aggregated)
CREATE TABLE analytics.daily_stats (
    date Date,
    event_type LowCardinality(String),
    count AggregateFunction(count),
    revenue AggregateFunction(sum, Decimal64(2)),
    users AggregateFunction(uniq, UInt64)
) ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, event_type);

Data Types

Type	Description	Example
`UInt8/16/32/64`	Unsigned integers	`UInt64`
`Int8/16/32/64`	Signed integers	`Int32`
`Float32/64`	Floating point	`Float64`
`Decimal64(S)`	Fixed-point decimal	`Decimal64(2)`
`String`	Variable-length string	`String`
`FixedString(N)`	Fixed-length string	`FixedString(16)`
`DateTime`	Unix timestamp (seconds)	`DateTime`
`DateTime64(P)`	High-precision timestamp	`DateTime64(3)`
`Date`	Date without time	`Date`
`UUID`	128-bit identifier	`UUID`
`Array(T)`	Array of type T	`Array(String)`
`Tuple(T1, T2)`	Tuple	`Tuple(String, UInt64)`
`Map(K, V)`	Key-value map	`Map(String, String)`
`LowCardinality(T)`	Dictionary-encoded	`LowCardinality(String)`
`Nullable(T)`	Nullable wrapper	`Nullable(UInt64)`
`Enum8/16`	Enumeration	`Enum8('a'=1, 'b'=2)`

Insert Data

-- Single insert
INSERT INTO analytics.events (event_type, user_id, country, device)
VALUES ('page_view', 12345, 'US', 'mobile');

-- Bulk insert
INSERT INTO analytics.events (event_type, user_id, country, device) VALUES
    ('click', 1001, 'US', 'desktop'),
    ('purchase', 1002, 'UK', 'mobile'),
    ('page_view', 1003, 'DE', 'tablet');

-- Insert from select
INSERT INTO analytics.daily_stats
SELECT
    toDate(timestamp) AS date,
    event_type,
    countState() AS count,
    sumState(revenue) AS revenue,
    uniqState(user_id) AS users
FROM analytics.events
GROUP BY date, event_type;

-- Insert from file
clickhouse-client --query="INSERT INTO events FORMAT CSV" < data.csv
clickhouse-client --query="INSERT INTO events FORMAT JSONEachRow" < data.json

Query Patterns

-- Basic analytics
SELECT
    event_type,
    count() AS event_count,
    uniq(user_id) AS unique_users,
    sum(revenue) AS total_revenue,
    avg(revenue) AS avg_revenue
FROM analytics.events
WHERE date >= today() - 30
GROUP BY event_type
ORDER BY event_count DESC;

-- Time series with fill
SELECT
    toStartOfHour(timestamp) AS hour,
    count() AS events
FROM analytics.events
WHERE timestamp >= now() - INTERVAL 24 HOUR
GROUP BY hour
ORDER BY hour
WITH FILL STEP toIntervalHour(1);

-- Window functions
SELECT
    user_id,
    event_type,
    timestamp,
    row_number() OVER (PARTITION BY user_id ORDER BY timestamp) AS event_num,
    lagInFrame(event_type) OVER (PARTITION BY user_id ORDER BY timestamp) AS prev_event
FROM analytics.events
WHERE date = today();

-- Funnel analysis
SELECT
    level,
    count() AS users
FROM (
    SELECT
        user_id,
        windowFunnel(86400)(timestamp, event_type = 'page_view', event_type = 'add_to_cart', event_type = 'purchase') AS level
    FROM analytics.events
    WHERE date >= today() - 7
    GROUP BY user_id
)
GROUP BY level
ORDER BY level;

-- Retention analysis
SELECT
    toDate(first_event) AS cohort_date,
    dateDiff('day', first_event, timestamp) AS day_number,
    uniq(user_id) AS users
FROM analytics.events
INNER JOIN (
    SELECT user_id, min(timestamp) AS first_event
    FROM analytics.events
    GROUP BY user_id
) AS cohorts USING user_id
WHERE date >= today() - 30
GROUP BY cohort_date, day_number
ORDER BY cohort_date, day_number;

Table Engines

Engine	Use Case	Key Feature
`MergeTree`	General analytics	Primary engine, sorted storage
`ReplacingMergeTree`	Deduplication	Keeps latest version by key
`SummingMergeTree`	Pre-aggregation	Auto-sums numeric columns
`AggregatingMergeTree`	Complex aggregations	Stores aggregate function states
`CollapsingMergeTree`	State tracking	Supports row deletion/update
`ReplicatedMergeTree`	High availability	Zookeeper-based replication
`Distributed`	Cluster queries	Sharding across nodes
`MaterializedView`	Auto-aggregation	Triggered on insert
`Buffer`	Write buffering	Batches inserts
`Kafka`	Stream ingestion	Read from Kafka topics

Configuration

Server Config

<!-- /etc/clickhouse-server/config.xml -->
<clickhouse>
    <listen_host>0.0.0.0</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <max_connections>1024</max_connections>
    <max_concurrent_queries>100</max_concurrent_queries>
    <max_memory_usage>10000000000</max_memory_usage>
    <mark_cache_size>5368709120</mark_cache_size>
</clickhouse>

Users Config

<!-- /etc/clickhouse-server/users.xml -->
<clickhouse>
    <users>
        <analyst>
            <password_sha256_hex>...</password_sha256_hex>
            <networks><ip>::/0</ip></networks>
            <profile>readonly</profile>
            <quota>default</quota>
        </analyst>
    </users>
</clickhouse>

Advanced Usage

Materialized Views

-- Auto-aggregate on insert
CREATE MATERIALIZED VIEW analytics.hourly_events_mv
TO analytics.hourly_events
AS SELECT
    toStartOfHour(timestamp) AS hour,
    event_type,
    count() AS event_count,
    uniq(user_id) AS unique_users,
    sum(revenue) AS total_revenue
FROM analytics.events
GROUP BY hour, event_type;

Distributed Queries

-- Create distributed table
CREATE TABLE analytics.events_distributed AS analytics.events
ENGINE = Distributed(my_cluster, analytics, events, rand());

-- Query spans all shards
SELECT count() FROM analytics.events_distributed;

Integration

# HTTP API
curl 'http://localhost:8123/' --data-binary "SELECT count() FROM events"

# With authentication
curl 'http://localhost:8123/?user=default&password=pass' \
  --data-binary "SELECT * FROM events LIMIT 10 FORMAT JSON"

# Insert via HTTP
curl 'http://localhost:8123/?query=INSERT INTO events FORMAT JSONEachRow' \
  --data-binary '{"event_type":"click","user_id":1,"country":"US"}'

Troubleshooting

Problem	Solution
Slow queries	Check `ORDER BY` matches query patterns; add skip indexes
High memory usage	Set `max_memory_usage`; use `max_bytes_before_external_sort`
Too many parts	Increase `parts_to_merge` or reduce insert frequency
Duplicate data	Use `ReplacingMergeTree` + `FINAL` or `OPTIMIZE FINAL`
Mutation stuck	Check `system.mutations`; kill with `KILL MUTATION`
Replication lag	Check Zookeeper health; verify network between replicas
Insert timeouts	Use `Buffer` engine or batch inserts; increase timeout
Disk space issues	Check TTL rules; run `OPTIMIZE TABLE ... FINAL`