Ir al contenido

ClickHouse Cheat Sheet

Overview

ClickHouse is an open-source column-oriented database management system that allows generating analytical data reports in real-time using SQL queries. Developed by Yandex for their web analytics service (processing over 20 billion events per day), ClickHouse is designed to process analytical queries over billions of rows and terabytes of data in seconds. Its columnar storage format and vectorized query execution engine make it one of the fastest analytical databases available.

ClickHouse supports a rich SQL dialect with extensions for analytics, real-time data ingestion at millions of rows per second, data compression ratios of 3-10x, and distributed query processing across clusters. It is used by companies like Uber, eBay, Cloudflare, and Bloomberg for log analytics, time-series data, business intelligence, and real-time dashboards.

Installation

# Quick start
docker run -d --name clickhouse-server \
  -p 8123:8123 -p 9000:9000 \
  -v clickhouse-data:/var/lib/clickhouse \
  clickhouse/clickhouse-server

# Connect with client
docker exec -it clickhouse-server clickhouse-client

Linux (APT)

# Add repository
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list

# Install
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client

# Start service
sudo service clickhouse-server start

# Connect
clickhouse-client

macOS

brew install clickhouse
clickhouse server &
clickhouse client

Core SQL

Create Database and Tables

CREATE DATABASE IF NOT EXISTS analytics;

-- MergeTree engine (primary engine for analytics)
CREATE TABLE analytics.events (
    event_id UUID DEFAULT generateUUIDv4(),
    event_type String,
    user_id UInt64,
    timestamp DateTime64(3) DEFAULT now64(3),
    properties String,  -- JSON as string
    country LowCardinality(String),
    device LowCardinality(String),
    revenue Decimal64(2) DEFAULT 0,
    date Date DEFAULT toDate(timestamp)
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (event_type, user_id, timestamp)
TTL date + INTERVAL 90 DAY
SETTINGS index_granularity = 8192;

-- ReplacingMergeTree (deduplication)
CREATE TABLE analytics.users (
    user_id UInt64,
    name String,
    email String,
    created_at DateTime DEFAULT now(),
    updated_at DateTime DEFAULT now()
) ENGINE = ReplacingMergeTree(updated_at)
ORDER BY user_id;

-- AggregatingMergeTree (pre-aggregated)
CREATE TABLE analytics.daily_stats (
    date Date,
    event_type LowCardinality(String),
    count AggregateFunction(count),
    revenue AggregateFunction(sum, Decimal64(2)),
    users AggregateFunction(uniq, UInt64)
) ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(date)
ORDER BY (date, event_type);

Data Types

TypeDescriptionExample
UInt8/16/32/64Unsigned integersUInt64
Int8/16/32/64Signed integersInt32
Float32/64Floating pointFloat64
Decimal64(S)Fixed-point decimalDecimal64(2)
StringVariable-length stringString
FixedString(N)Fixed-length stringFixedString(16)
DateTimeUnix timestamp (seconds)DateTime
DateTime64(P)High-precision timestampDateTime64(3)
DateDate without timeDate
UUID128-bit identifierUUID
Array(T)Array of type TArray(String)
Tuple(T1, T2)TupleTuple(String, UInt64)
Map(K, V)Key-value mapMap(String, String)
LowCardinality(T)Dictionary-encodedLowCardinality(String)
Nullable(T)Nullable wrapperNullable(UInt64)
Enum8/16EnumerationEnum8('a'=1, 'b'=2)

Insert Data

-- Single insert
INSERT INTO analytics.events (event_type, user_id, country, device)
VALUES ('page_view', 12345, 'US', 'mobile');

-- Bulk insert
INSERT INTO analytics.events (event_type, user_id, country, device) VALUES
    ('click', 1001, 'US', 'desktop'),
    ('purchase', 1002, 'UK', 'mobile'),
    ('page_view', 1003, 'DE', 'tablet');

-- Insert from select
INSERT INTO analytics.daily_stats
SELECT
    toDate(timestamp) AS date,
    event_type,
    countState() AS count,
    sumState(revenue) AS revenue,
    uniqState(user_id) AS users
FROM analytics.events
GROUP BY date, event_type;

-- Insert from file
clickhouse-client --query="INSERT INTO events FORMAT CSV" < data.csv
clickhouse-client --query="INSERT INTO events FORMAT JSONEachRow" < data.json

Query Patterns

-- Basic analytics
SELECT
    event_type,
    count() AS event_count,
    uniq(user_id) AS unique_users,
    sum(revenue) AS total_revenue,
    avg(revenue) AS avg_revenue
FROM analytics.events
WHERE date >= today() - 30
GROUP BY event_type
ORDER BY event_count DESC;

-- Time series with fill
SELECT
    toStartOfHour(timestamp) AS hour,
    count() AS events
FROM analytics.events
WHERE timestamp >= now() - INTERVAL 24 HOUR
GROUP BY hour
ORDER BY hour
WITH FILL STEP toIntervalHour(1);

-- Window functions
SELECT
    user_id,
    event_type,
    timestamp,
    row_number() OVER (PARTITION BY user_id ORDER BY timestamp) AS event_num,
    lagInFrame(event_type) OVER (PARTITION BY user_id ORDER BY timestamp) AS prev_event
FROM analytics.events
WHERE date = today();

-- Funnel analysis
SELECT
    level,
    count() AS users
FROM (
    SELECT
        user_id,
        windowFunnel(86400)(timestamp, event_type = 'page_view', event_type = 'add_to_cart', event_type = 'purchase') AS level
    FROM analytics.events
    WHERE date >= today() - 7
    GROUP BY user_id
)
GROUP BY level
ORDER BY level;

-- Retention analysis
SELECT
    toDate(first_event) AS cohort_date,
    dateDiff('day', first_event, timestamp) AS day_number,
    uniq(user_id) AS users
FROM analytics.events
INNER JOIN (
    SELECT user_id, min(timestamp) AS first_event
    FROM analytics.events
    GROUP BY user_id
) AS cohorts USING user_id
WHERE date >= today() - 30
GROUP BY cohort_date, day_number
ORDER BY cohort_date, day_number;

Table Engines

EngineUse CaseKey Feature
MergeTreeGeneral analyticsPrimary engine, sorted storage
ReplacingMergeTreeDeduplicationKeeps latest version by key
SummingMergeTreePre-aggregationAuto-sums numeric columns
AggregatingMergeTreeComplex aggregationsStores aggregate function states
CollapsingMergeTreeState trackingSupports row deletion/update
ReplicatedMergeTreeHigh availabilityZookeeper-based replication
DistributedCluster queriesSharding across nodes
MaterializedViewAuto-aggregationTriggered on insert
BufferWrite bufferingBatches inserts
KafkaStream ingestionRead from Kafka topics

Configuration

Server Config

<!-- /etc/clickhouse-server/config.xml -->
<clickhouse>
    <listen_host>0.0.0.0</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <max_connections>1024</max_connections>
    <max_concurrent_queries>100</max_concurrent_queries>
    <max_memory_usage>10000000000</max_memory_usage>
    <mark_cache_size>5368709120</mark_cache_size>
</clickhouse>

Users Config

<!-- /etc/clickhouse-server/users.xml -->
<clickhouse>
    <users>
        <analyst>
            <password_sha256_hex>...</password_sha256_hex>
            <networks><ip>::/0</ip></networks>
            <profile>readonly</profile>
            <quota>default</quota>
        </analyst>
    </users>
</clickhouse>

Advanced Usage

Materialized Views

-- Auto-aggregate on insert
CREATE MATERIALIZED VIEW analytics.hourly_events_mv
TO analytics.hourly_events
AS SELECT
    toStartOfHour(timestamp) AS hour,
    event_type,
    count() AS event_count,
    uniq(user_id) AS unique_users,
    sum(revenue) AS total_revenue
FROM analytics.events
GROUP BY hour, event_type;

Distributed Queries

-- Create distributed table
CREATE TABLE analytics.events_distributed AS analytics.events
ENGINE = Distributed(my_cluster, analytics, events, rand());

-- Query spans all shards
SELECT count() FROM analytics.events_distributed;

Integration

# HTTP API
curl 'http://localhost:8123/' --data-binary "SELECT count() FROM events"

# With authentication
curl 'http://localhost:8123/?user=default&password=pass' \
  --data-binary "SELECT * FROM events LIMIT 10 FORMAT JSON"

# Insert via HTTP
curl 'http://localhost:8123/?query=INSERT INTO events FORMAT JSONEachRow' \
  --data-binary '{"event_type":"click","user_id":1,"country":"US"}'

Troubleshooting

ProblemSolution
Slow queriesCheck ORDER BY matches query patterns; add skip indexes
High memory usageSet max_memory_usage; use max_bytes_before_external_sort
Too many partsIncrease parts_to_merge or reduce insert frequency
Duplicate dataUse ReplacingMergeTree + FINAL or OPTIMIZE FINAL
Mutation stuckCheck system.mutations; kill with KILL MUTATION
Replication lagCheck Zookeeper health; verify network between replicas
Insert timeoutsUse Buffer engine or batch inserts; increase timeout
Disk space issuesCheck TTL rules; run OPTIMIZE TABLE ... FINAL