DuckDB Cheat Sheet

Overview

DuckDB is an in-process SQL OLAP database management system designed for analytical workloads. Often described as the SQLite for analytics, DuckDB runs inside your application process with no separate server to install or manage. It provides a feature-rich SQL dialect, vectorized query execution, and can directly query Parquet, CSV, JSON, and Excel files without importing them first.

DuckDB is ideal for local data analysis, ETL scripting, data science workflows, and testing. It supports full ACID transactions, window functions, CTEs, lateral joins, and many PostgreSQL-compatible functions. With native bindings for Python, R, Java, Node.js, and other languages, DuckDB integrates seamlessly into existing data workflows. It can also query remote files on S3, GCS, and HTTP endpoints, making it a powerful tool for data exploration without infrastructure overhead.

Installation

# Python
pip install duckdb

# CLI (macOS)
brew install duckdb

# CLI (Linux)
wget https://github.com/duckdb/duckdb/releases/download/v1.0.0/duckdb_cli-linux-amd64.zip
unzip duckdb_cli-linux-amd64.zip

# Node.js
npm install duckdb

# R
install.packages("duckdb")

# Start CLI
duckdb                     # In-memory database
duckdb my_database.db      # Persistent database

Core SQL Operations

File Querying (No Import Required)

-- Query CSV directly
SELECT * FROM 'data/sales.csv' LIMIT 10;

-- Query Parquet
SELECT region, SUM(revenue) AS total
FROM 'data/sales/*.parquet'
GROUP BY region;

-- Query JSON
SELECT * FROM read_json_auto('data/events.json');

-- Query remote files
SELECT * FROM 'https://example.com/data.csv';

-- Query S3 (after installing httpfs extension)
INSTALL httpfs;
LOAD httpfs;
SET s3_region='us-east-1';
SET s3_access_key_id='AKIA...';
SET s3_secret_access_key='...';
SELECT * FROM 's3://bucket/data/*.parquet';

-- Query multiple files with glob patterns
SELECT * FROM 'data/**/*.csv';

-- Read Excel files
INSTALL spatial;
LOAD spatial;
SELECT * FROM st_read('report.xlsx');

Table Operations

-- Create table from query
CREATE TABLE sales AS
SELECT * FROM 'sales_2024.parquet';

-- Create table with schema
CREATE TABLE orders (
    order_id INTEGER PRIMARY KEY,
    customer_id INTEGER NOT NULL,
    amount DECIMAL(10,2),
    order_date DATE,
    status VARCHAR
);

-- Insert data
INSERT INTO orders VALUES
    (1, 100, 99.99, '2024-06-15', 'completed'),
    (2, 101, 49.50, '2024-06-16', 'pending');

-- CTAS with Parquet output
COPY (SELECT * FROM orders WHERE status = 'completed')
TO 'completed_orders.parquet' (FORMAT PARQUET, COMPRESSION ZSTD);

-- Export to CSV
COPY orders TO 'orders.csv' (HEADER, DELIMITER ',');

-- Import from CSV
COPY orders FROM 'new_orders.csv' (AUTO_DETECT TRUE);

-- Create or replace view
CREATE OR REPLACE VIEW active_orders AS
SELECT * FROM orders WHERE status != 'cancelled';

Analytical Queries

-- Window functions
SELECT
    order_date,
    amount,
    SUM(amount) OVER (ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total,
    AVG(amount) OVER (ORDER BY order_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg_7d,
    RANK() OVER (ORDER BY amount DESC) AS amount_rank
FROM orders;

-- Pivot / Unpivot
PIVOT orders ON status USING SUM(amount) GROUP BY order_date;

UNPIVOT monthly_sales ON jan, feb, mar, apr INTO NAME month VALUE sales;

-- List and struct operations
SELECT
    [1, 2, 3] AS my_list,
    {'name': 'Alice', 'age': 30} AS my_struct,
    list_transform([1,2,3], x -> x * 2) AS doubled,
    list_filter([1,2,3,4,5], x -> x > 3) AS filtered;

-- String aggregation
SELECT
    customer_id,
    string_agg(DISTINCT status, ', ' ORDER BY status) AS all_statuses,
    list(amount ORDER BY order_date) AS amount_history
FROM orders
GROUP BY customer_id;

-- Recursive CTEs
WITH RECURSIVE hierarchy AS (
    SELECT id, name, parent_id, 0 AS depth
    FROM categories WHERE parent_id IS NULL
    UNION ALL
    SELECT c.id, c.name, c.parent_id, h.depth + 1
    FROM categories c JOIN hierarchy h ON c.parent_id = h.id
)
SELECT * FROM hierarchy;

-- QUALIFY (filter on window functions)
SELECT customer_id, order_date, amount
FROM orders
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) = 1;

Python Integration

import duckdb

# In-memory connection
con = duckdb.connect()

# Query files directly
df = con.sql("SELECT * FROM 'data.parquet' WHERE amount > 100").df()

# Query pandas DataFrames
import pandas as pd
pandas_df = pd.DataFrame({"id": [1,2,3], "name": ["a","b","c"]})
result = con.sql("SELECT * FROM pandas_df WHERE id > 1").df()

# Query Polars DataFrames
import polars as pl
polars_df = pl.DataFrame({"x": [1,2,3], "y": [4,5,6]})
result = con.sql("SELECT x, y, x+y AS total FROM polars_df").pl()

# Relation API (lazy evaluation)
rel = con.sql("SELECT * FROM 'large_file.parquet'")
result = rel.filter("amount > 100").aggregate("region, SUM(amount)").df()

# Persistent database
con = duckdb.connect("analytics.db")
con.sql("CREATE TABLE IF NOT EXISTS events AS SELECT * FROM 'events.parquet'")
con.close()

# Use as replacement for pandas operations
duckdb.sql("""
    SELECT
        date_trunc('month', order_date) AS month,
        COUNT(*) AS orders,
        SUM(amount) AS revenue
    FROM 'orders.csv'
    GROUP BY 1
    ORDER BY 1
""").show()

Configuration and Extensions

-- List installed extensions
SELECT * FROM duckdb_extensions();

-- Install and load extensions
INSTALL httpfs;    LOAD httpfs;      -- S3/HTTP file access
INSTALL postgres;  LOAD postgres;    -- PostgreSQL scanner
INSTALL mysql;     LOAD mysql;       -- MySQL scanner
INSTALL sqlite;    LOAD sqlite;      -- SQLite scanner
INSTALL json;      LOAD json;        -- JSON functions
INSTALL parquet;   LOAD parquet;     -- Parquet support (built-in)
INSTALL icu;       LOAD icu;         -- Unicode collation
INSTALL fts;       LOAD fts;         -- Full-text search
INSTALL spatial;   LOAD spatial;     -- Geospatial functions

-- Memory and performance settings
SET memory_limit = '8GB';
SET threads = 8;
SET temp_directory = '/tmp/duckdb';

-- PostgreSQL integration
ATTACH 'dbname=mydb user=postgres host=localhost' AS pg (TYPE POSTGRES);
SELECT * FROM pg.public.customers LIMIT 10;

-- SQLite integration
ATTACH 'legacy.db' AS sqlite_db (TYPE SQLITE);
SELECT * FROM sqlite_db.main.users;

Advanced Usage

Macros and User-Defined Functions

-- Create SQL macro
CREATE MACRO dollars_to_euros(amount) AS amount * 0.92;
SELECT dollars_to_euros(100);

-- Table macro
CREATE MACRO recent_orders(days) AS TABLE
    SELECT * FROM orders WHERE order_date >= current_date - days;
SELECT * FROM recent_orders(30);

Full-Text Search

INSTALL fts; LOAD fts;

-- Create FTS index
PRAGMA create_fts_index('articles', 'id', 'title', 'content', stemmer='english');

-- Search
SELECT id, title, fts_main_articles.match_bm25(id, 'machine learning') AS score
FROM articles
WHERE score IS NOT NULL
ORDER BY score DESC
LIMIT 10;

Data Export Formats

-- Export to Parquet with partitioning
COPY (SELECT * FROM events)
TO 'output' (FORMAT PARQUET, PARTITION_BY (year, month), COMPRESSION ZSTD);

-- Export to JSON
COPY orders TO 'orders.json' (FORMAT JSON);

-- Export to Excel (requires spatial extension)
COPY orders TO 'report.xlsx' (FORMAT GDAL, DRIVER 'xlsx');

Troubleshooting

Issue	Solution
Out of memory	Set `SET memory_limit='4GB'`. Use `temp_directory` for spilling to disk
Slow CSV parsing	Use `read_csv_auto` with `sample_size=-1` for better type detection. Pre-convert to Parquet
S3 access denied	Check credentials with `SET s3_access_key_id`. Ensure `httpfs` extension is loaded
Type mismatch errors	Use explicit CAST(). Check `read_csv_auto` type detection with `DESCRIBE`
Concurrent write conflicts	DuckDB supports single-writer. Use separate connections for reads
Extension not found	Run `INSTALL extension_name` first, then `LOAD`. Check internet connectivity
Large file performance	Convert CSV to Parquet first. Use column projection (`SELECT col1, col2` not `*`)
PostgreSQL attach fails	Install `postgres` extension. Check connection string format and access
Date parsing issues	Specify format explicitly: `strptime(date_col, '%Y-%m-%d')`
Database file locked	Close other connections to the same `.db` file. Only one write connection allowed