LanceDB Cheat Sheet
Overview
LanceDB is an open-source, serverless vector database built on the Lance columnar data format. It requires no server management and stores data on local disk or cloud object storage (S3, GCS, Azure Blob). LanceDB supports multi-modal data (vectors, text, images, video), automatic versioning, zero-copy access from analytical tools like Pandas and DuckDB, and efficient disk-based indexing that keeps memory usage low.
The database is designed for AI applications where simplicity and cost-efficiency matter. It embeds directly into your application as a library (no separate server process), supports full-text search alongside vector search, and provides hybrid search with fusion ranking. LanceDB scales from prototyping on a laptop to production workloads with billions of vectors using its cloud-hosted option.
Installation
# Core Python package
pip install lancedb
# With embedding model support
pip install "lancedb[embeddings]"
# With full-text search
# NOTE(review): confirm this extra is defined by the lancedb release you install — TODO verify
pip install "lancedb[full-text-search]"
# JavaScript/TypeScript
npm install @lancedb/lancedb
Core Operations
Create Database and Table
import lancedb
import numpy as np
# Open/create database (local)
db = lancedb.connect("./my_lancedb")
# Open cloud database
# db = lancedb.connect("s3://bucket/path/to/db")
# Create table from list of dicts
data = [
    {"vector": np.random.rand(1536).tolist(), "text": "Document about RAG", "source": "wiki", "id": 1},
    {"vector": np.random.rand(1536).tolist(), "text": "Vector databases guide", "source": "docs", "id": 2},
    {"vector": np.random.rand(1536).tolist(), "text": "LLM fine-tuning tutorial", "source": "blog", "id": 3},
]
# `table` stays bound to "documents": the insert and search snippets that
# follow rely on its `source` column, so the tables created below get
# their own names instead of rebinding `table`.
table = db.create_table("documents", data)
# Create table from Pandas DataFrame
import pandas as pd
df = pd.DataFrame(data)
table_from_df = db.create_table("documents_df", df)
# Create an empty table from a PyArrow schema (same columns as `data`)
import pyarrow as pa
schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 1536)),
    pa.field("text", pa.string()),
    pa.field("source", pa.string()),
    pa.field("id", pa.int64()),
])
table_from_arrow = db.create_table("documents_arrow", schema=schema)
Insert Data
# Append a single row
new_row = {"vector": np.random.rand(1536).tolist(), "text": "New document", "source": "api", "id": 4}
table.add([new_row])
# Bulk insert: build rows with ids 5 through 10004 and append them in one call
bulk_rows = [
    {"vector": np.random.rand(1536).tolist(), "text": f"Doc {i}", "source": "batch", "id": i}
    for i in range(5, 10005)
]
table.add(bulk_rows)
Vector Search
# Basic search: the 10 rows nearest to a query vector
query = np.random.rand(1536).tolist()
results = table.search(query).limit(10).to_pandas()
print(results[["text", "source", "_distance"]])
# Search with filter
results = (
    table.search(query)
    .where("source = 'wiki'")
    .limit(5)
    .to_pandas()
)
# Search with selected columns
results = (
    table.search(query)
    .select(["text", "source"])
    .limit(10)
    .to_pandas()
)
# Search with distance threshold
# `_distance` is produced by the search rather than stored in the table, so
# the threshold is expressed with distance_range() instead of a where() filter.
results = (
    table.search(query)
    .distance_range(upper_bound=0.5)
    .limit(20)
    .to_pandas()
)
Full-Text Search
# Create full-text index
table.create_fts_index("text")
# Full-text search
results = table.search("vector database tutorial", query_type="fts").limit(10).to_pandas()
# Hybrid search (vector + full-text)
# A hybrid query needs both halves: a vector for the similarity search and a
# text string for the full-text index, so each one is supplied explicitly.
results = (
    table.search(query_type="hybrid")
    .vector(query)
    .text("vector database tutorial")
    .where("source = 'docs'")
    .limit(10)
    .to_pandas()
)
Indexing
Index Types
| Index | Description | Best For |
|---|---|---|
| IVF_PQ | Inverted file + product quantization | Large datasets, low memory |
| IVF_HNSW_SQ | IVF + HNSW + scalar quantization | Balanced speed/accuracy |
| IVF_HNSW_PQ | IVF + HNSW + product quantization | Large-scale, fast search |
# Build an IVF_PQ index
table.create_index(
    index_type="IVF_PQ",
    metric="cosine",
    num_partitions=256,
    num_sub_vectors=96,
)
# Build an IVF_HNSW_SQ index
table.create_index(
    index_type="IVF_HNSW_SQ",
    metric="L2",
    num_partitions=128,
)
# Search with index parameters, applied one step at a time
tuned_search = table.search(query)
tuned_search = tuned_search.nprobes(20)  # Number of partitions to search
tuned_search = tuned_search.refine_factor(10)  # Refine top results with exact distance
results = tuned_search.limit(10).to_pandas()
Embedding Functions
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
# Use built-in OpenAI embeddings
openai_embed = get_registry().get("openai").create(name="text-embedding-3-small")
class Document(LanceModel):
    # `text` is the column that gets embedded; `vector` receives the result
    text: str = openai_embed.SourceField()
    # Take the dimension from the embedding function so the field always
    # matches the model's output size
    vector: Vector(openai_embed.ndims()) = openai_embed.VectorField()
    source: str
# Create table with auto-embedding
table = db.create_table("auto_embed", schema=Document)
# Insert - embeddings generated automatically
table.add([
    {"text": "What is retrieval augmented generation?", "source": "wiki"},
    {"text": "How vector databases work", "source": "docs"},
])
# Search with text - auto-embedded
results = table.search("RAG techniques").limit(5).to_pandas()
# Sentence Transformers
st_embed = get_registry().get("sentence-transformers").create(name="BAAI/bge-small-en-v1.5")
class STDocument(LanceModel):
    # Same pattern as Document, using a local sentence-transformers model
    text: str = st_embed.SourceField()
    vector: Vector(st_embed.ndims()) = st_embed.VectorField()
Configuration
Table Management
# List tables
print(db.table_names())
# Open existing table
table = db.open_table("documents")
# Get table stats
print(f"Rows: {table.count_rows()}")
print(f"Schema: {table.schema}")
# Update data
table.update(where="source = 'old'", values={"source": "archived"})
# Delete data
table.delete("source = 'archived'")
# Drop table - done last, because `table` above is a handle to the table
# being dropped; skip this step if you want to keep using `table`
db.drop_table("documents")
Versioning
from datetime import timedelta
# LanceDB automatically versions data
# List versions
versions = table.list_versions()
for v in versions:
    print(f"Version {v['version']}: {v['timestamp']}")
# Checkout specific version
table.checkout(version=2)
results = table.search(query).limit(5).to_pandas()
# Restore to latest
table.checkout_latest()
# Compact data files, then remove versions older than 7 days
# (`older_than` takes a timedelta, not a string such as "7d")
table.compact_files()
table.cleanup_old_versions(older_than=timedelta(days=7))
Advanced Usage
DuckDB Integration
import duckdb
# Export the LanceDB table to Arrow; the SQL below refers to this variable by name
arrow_table = table.to_arrow()
# Per-source row count and average text length, largest group first
sql = """
SELECT source, COUNT(*) as count, AVG(length(text)) as avg_len
FROM arrow_table
GROUP BY source
ORDER BY count DESC
"""
result = duckdb.query(sql).to_df()
print(result)
Multi-Modal Data
from lancedb.pydantic import LanceModel, Vector
import numpy as np
import PIL.Image
class ImageDoc(LanceModel):
    # The row stores a URI string pointing at the image plus its embedding
    image_uri: str
    vector: Vector(512)
    caption: str
    tags: list[str]
table = db.create_table("images", schema=ImageDoc)
# Placeholder: replace with the real 512-dimensional embedding of the image
clip_embedding = np.random.rand(512)
table.add([{
    "image_uri": "s3://bucket/img001.jpg",
    "vector": clip_embedding.tolist(),
    "caption": "A sunset over mountains",
    "tags": ["nature", "sunset"]
}])
# Filter by tags
# Placeholder: replace with the embedding of the query image or text
query_vector = np.random.rand(512).tolist()
results = (
    table.search(query_vector)
    .where("array_contains(tags, 'nature')")
    .limit(10)
    .to_pandas()
)
JavaScript/TypeScript
import * as lancedb from "@lancedb/lancedb";
// Open (or create) a local database directory
const db = await lancedb.connect("./my_lancedb");
const data = [
  { vector: Array.from({ length: 1536 }, () => Math.random()), text: "Hello world", id: 1 },
  { vector: Array.from({ length: 1536 }, () => Math.random()), text: "LanceDB rocks", id: 2 },
];
const table = await db.createTable("docs", data);
// Placeholder query: replace with a real embedding of the same length as the stored vectors
const queryVector = Array.from({ length: 1536 }, () => Math.random());
const results = await table.search(queryVector).limit(5).toArray();
console.log(results);
Troubleshooting
| Issue | Solution |
|---|---|
| Slow first search | Build an index for tables >50K rows |
| Out of memory | Use an IVF_PQ index and query it (tuning nprobes for recall) instead of loading all data into memory |
| S3 permission denied | Check AWS credentials and bucket policy |
| Schema mismatch on insert | Ensure new data matches table schema exactly |
| Full-text search not working | Create FTS index first: table.create_fts_index("text") |
| Version cleanup fails | Ensure no active readers on old versions |
| Embedding dimension error | Verify model output dim matches Vector field dim |
| Concurrent write conflicts | Use write locks or sequential writes |
# Check database size (total on-disk size of the database directory)
du -sh ./my_lancedb/
# List table files (contents of the "documents" table directory)
ls -la ./my_lancedb/documents.lance/
# Verify installation (imports lancedb and prints its version)
python -c "import lancedb; print(lancedb.__version__)"