Skip to content

LanceDB Cheat Sheet

Overview

LanceDB is an open-source, serverless vector database built on the Lance columnar data format. It requires no server management and stores data on local disk or cloud object storage (S3, GCS, Azure Blob). LanceDB supports multi-modal data (vectors, text, images, video), automatic versioning, zero-copy access from analytical tools like Pandas and DuckDB, and efficient disk-based indexing that keeps memory usage low.

The database is designed for AI applications where simplicity and cost-efficiency matter. It embeds directly into your application as a library (no separate server process), supports full-text search alongside vector search, and provides hybrid search with fusion ranking. LanceDB scales from prototyping on a laptop to production workloads with billions of vectors using its cloud-hosted option.

Installation

# Core Python package
pip install lancedb

# With embedding model support
pip install "lancedb[embeddings]"

# With full-text search
pip install "lancedb[full-text-search]"

# JavaScript/TypeScript
npm install @lancedb/lancedb

Core Operations

Create Database and Table

import lancedb
import numpy as np

# Open/create database (local)
db = lancedb.connect("./my_lancedb")

# Open cloud database
# db = lancedb.connect("s3://bucket/path/to/db")

# Create table from list of dicts
data = [
    {"vector": np.random.rand(1536).tolist(), "text": "Document about RAG", "source": "wiki", "id": 1},
    {"vector": np.random.rand(1536).tolist(), "text": "Vector databases guide", "source": "docs", "id": 2},
    {"vector": np.random.rand(1536).tolist(), "text": "LLM fine-tuning tutorial", "source": "blog", "id": 3},
]
table = db.create_table("documents", data)

# Create table from Pandas DataFrame
import pandas as pd
df = pd.DataFrame(data)
table = db.create_table("documents_df", df)

# Create an empty table from an explicit PyArrow schema.
# The schema declares the same four columns as `data` above ("source"
# included), so rows shaped like `data` can be added to it and filters on
# "source" resolve against a real column.
import pyarrow as pa
schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 1536)),
    pa.field("text", pa.string()),
    pa.field("source", pa.string()),
    pa.field("id", pa.int64()),
])
table = db.create_table("documents_arrow", schema=schema)

Insert Data

# Append a single row to the existing table
new_row = {"vector": np.random.rand(1536).tolist(), "text": "New document", "source": "api", "id": 4}
table.add([new_row])

# Bulk insert: build rows with ids 5..10004, then add them in one call
batch = []
for i in range(5, 10005):
    batch.append(
        {"vector": np.random.rand(1536).tolist(), "text": f"Doc {i}", "source": "batch", "id": i}
    )
table.add(batch)

Vector Search

# Basic search
query = np.random.rand(1536).tolist()
results = table.search(query).limit(10).to_pandas()
print(results[["text", "source", "_distance"]])

# Search with filter
results = (
    table.search(query)
    .where("source = 'wiki'")
    .limit(5)
    .to_pandas()
)

# Search with selected columns
results = (
    table.search(query)
    .select(["text", "source"])
    .limit(10)
    .to_pandas()
)

# Search with distance threshold
# `_distance` is produced by the search rather than stored in the table, so
# filtering on it inside `where()` is unreliable; `distance_range()` is the
# dedicated API for bounding the result distance.
results = (
    table.search(query)
    .distance_range(upper_bound=0.5)
    .limit(20)
    .to_pandas()
)

Full-Text Search

# Create full-text index
table.create_fts_index("text")

# Full-text search
results = table.search("vector database tutorial", query_type="fts").limit(10).to_pandas()

# Hybrid search (vector + full-text)
# A hybrid query combines a vector component and a text component, so both
# are supplied explicitly instead of passing only the raw vector.
results = (
    table.search(query_type="hybrid")
    .vector(query)
    .text("vector database tutorial")
    .where("source = 'docs'")
    .limit(10)
    .to_pandas()
)

Indexing

Index Types

| Index | Description | Best For |
|---|---|---|
| IVF_PQ | Inverted file + product quantization | Large datasets, low memory |
| IVF_HNSW_SQ | IVF + HNSW + scalar quantization | Balanced speed/accuracy |
| IVF_HNSW_PQ | IVF + HNSW + product quantization | Large-scale, fast search |

# Build an IVF_PQ index (cosine metric, 256 partitions, 96 sub-vectors)
ivf_pq_options = {
    "metric": "cosine",
    "num_partitions": 256,
    "num_sub_vectors": 96,
    "index_type": "IVF_PQ",
}
table.create_index(**ivf_pq_options)

# Build an IVF_HNSW_SQ index (L2 metric, 128 partitions)
ivf_hnsw_sq_options = {
    "metric": "L2",
    "index_type": "IVF_HNSW_SQ",
    "num_partitions": 128,
}
table.create_index(**ivf_hnsw_sq_options)

# Search with index parameters:
#   nprobes       - number of partitions to search
#   refine_factor - refine top results with exact distance
tuned_search = table.search(query).nprobes(20).refine_factor(10)
results = tuned_search.limit(10).to_pandas()

Embedding Functions

from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Use built-in OpenAI embeddings
openai_embed = get_registry().get("openai").create(name="text-embedding-3-small")

class Document(LanceModel):
    # `text` is the source column for the embedding; `vector` receives it.
    text: str = openai_embed.SourceField()
    # Take the vector width from the embedding function instead of
    # hard-coding it, so the field always matches the model's output size.
    vector: Vector(openai_embed.ndims()) = openai_embed.VectorField()
    source: str

# Create table with auto-embedding
table = db.create_table("auto_embed", schema=Document)

# Insert - embeddings generated automatically
table.add([
    {"text": "What is retrieval augmented generation?", "source": "wiki"},
    {"text": "How vector databases work", "source": "docs"},
])

# Search with text - auto-embedded
results = table.search("RAG techniques").limit(5).to_pandas()

# Sentence Transformers
st_embed = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5")

class STDocument(LanceModel):
    text: str = st_embed.SourceField()
    # Same idea as above: let the model report its own dimensionality.
    vector: Vector(st_embed.ndims()) = st_embed.VectorField()

Configuration

Table Management

# List tables
print(db.table_names())

# Open existing table
table = db.open_table("documents")

# Get table stats
print(f"Rows: {table.count_rows()}")
print(f"Schema: {table.schema}")

# Update data
table.update(where="source = 'old'", values={"source": "archived"})

# Delete data
table.delete("source = 'archived'")

# Drop table
# Done last: `table` refers to "documents", so dropping it any earlier would
# leave the update/delete calls operating on a table that no longer exists.
db.drop_table("documents")

Versioning

# LanceDB automatically versions data
from datetime import timedelta

# List versions
versions = table.list_versions()
for v in versions:
    print(f"Version {v['version']}: {v['timestamp']}")

# Checkout specific version
table.checkout(version=2)
results = table.search(query).limit(5).to_pandas()

# Restore to latest
table.checkout_latest()

# Compact old versions
table.compact_files()
# `older_than` is a time span: pass a timedelta, not a string such as "7d".
table.cleanup_old_versions(older_than=timedelta(days=7))

Advanced Usage

DuckDB Integration

import duckdb

# Query LanceDB table with DuckDB SQL.
# The SQL text refers to the Arrow table by its Python variable name, so the
# variable must keep the name `arrow_table`.
arrow_table = table.to_arrow()
per_source_sql = """
    SELECT source, COUNT(*) as count, AVG(length(text)) as avg_len
    FROM arrow_table
    GROUP BY source
    ORDER BY count DESC
"""
result = duckdb.query(per_source_sql).to_df()
print(result)

Multi-Modal Data

from lancedb.pydantic import LanceModel, Vector
import PIL.Image

# Row schema for the image table: a URI string, a 512-wide vector,
# a caption and a list of string tags.
class ImageDoc(LanceModel):
    image_uri: str
    vector: Vector(512)
    caption: str
    tags: list[str]

table = db.create_table("images", schema=ImageDoc)

# NOTE: `clip_embedding` is not defined in this snippet; it must be provided
# by the caller and support `.tolist()`.
sunset_row = {
    "image_uri": "s3://bucket/img001.jpg",
    "vector": clip_embedding.tolist(),
    "caption": "A sunset over mountains",
    "tags": ["nature", "sunset"],
}
table.add([sunset_row])

# Filter by tags
# NOTE: `query_vector` is likewise a placeholder supplied by the caller.
tag_filter = "array_contains(tags, 'nature')"
results = table.search(query_vector).where(tag_filter).limit(10).to_pandas()

JavaScript/TypeScript

import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("./my_lancedb");

// Build a 1536-element vector of random numbers.
const randomVector = () => Array.from({ length: 1536 }, () => Math.random());

const data = [
  { vector: randomVector(), text: "Hello world", id: 1 },
  { vector: randomVector(), text: "LanceDB rocks", id: 2 },
];

const table = await db.createTable("docs", data);

// NOTE: `queryVector` is not defined in this snippet; the caller supplies it.
const results = await table.search(queryVector).limit(5).toArray();
console.log(results);

Troubleshooting

| Issue | Solution |
|---|---|
| Slow first search | Build an index for tables >50K rows |
| Out of memory | Use IVF_PQ index, increase nprobes instead of loading all data |
| S3 permission denied | Check AWS credentials and bucket policy |
| Schema mismatch on insert | Ensure new data matches table schema exactly |
| Full-text search not working | Create FTS index first: table.create_fts_index("text") |
| Version cleanup fails | Ensure no active readers on old versions |
| Embedding dimension error | Verify model output dim matches Vector field dim |
| Concurrent write conflicts | Use write locks or sequential writes |

# Check database size (total disk usage of the local database directory)
du -sh ./my_lancedb/

# List table files (contents of the "documents" table directory)
ls -la ./my_lancedb/documents.lance/

# Verify installation (prints the installed lancedb version)
python -c "import lancedb; print(lancedb.__version__)"