Embedchain Cheat Sheet

Overview

Embedchain (now part of mem0) is a framework for building AI applications that can learn from any data source. It abstracts away the complexity of chunking, embedding, storing, and retrieving data, allowing developers to create RAG-powered chatbots and assistants with just a few lines of code. Embedchain supports 30+ data sources including web pages, PDFs, YouTube videos, databases, and APIs.

The framework automatically handles document loading, text splitting, embedding generation, vector storage, and retrieval when answering questions. It supports multiple LLM providers (OpenAI, Anthropic, Ollama, HuggingFace), vector databases (Chroma, Pinecone, Qdrant, Weaviate), and embedding models, making it one of the simplest ways to build a RAG application.

Installation

pip install embedchain

# With specific vector store
pip install "embedchain[pinecone]"
pip install "embedchain[qdrant]"
pip install "embedchain[weaviate]"

# With all dependencies
pip install "embedchain[all]"

Core Usage

Quick Start

from embedchain import App

# Create app (uses OpenAI by default)
app = App()

# Add data from various sources
app.add("https://en.wikipedia.org/wiki/Machine_learning")
# pdf_file needs a link to the PDF itself; the arXiv /abs/ URL is an HTML landing page
app.add("https://arxiv.org/pdf/2005.11401.pdf", data_type="pdf_file")
app.add("Embedchain is a RAG framework", data_type="text")
app.add("https://www.youtube.com/watch?v=dQw4w9WgXcQ", data_type="youtube_video")

# Query
answer = app.query("What is machine learning?")
print(answer)

# Chat (maintains conversation history)
response = app.chat("Tell me about RAG")
print(response)
response = app.chat("How does it compare to fine-tuning?")
print(response)

Supported Data Sources

Data Type	Identifier	Input
Web page	`web_page`	URL
PDF	`pdf_file`	URL or file path
YouTube	`youtube_video`	YouTube URL
Text	`text`	Raw text string
CSV	`csv`	File path
JSON	`json`	File path
DOCX	`docx`	File path
MDX (Markdown)	`mdx`	File path
Sitemap	`sitemap`	Sitemap URL
XML	`xml`	File path
Notion	`notion`	Notion page URL
Substack	`substack`	Substack URL
Discourse	`discourse`	Discourse URL
QnA pairs	`qna_pair`	Tuple of (question, answer)
Code docs	`docs_site`	Docs URL
Image	`image`	Image URL/path
Custom	`custom`	Custom loader

Adding Data

from embedchain import App

app = App()

# Web page
app.add("https://docs.python.org/3/tutorial/", data_type="web_page")

# PDF from URL
app.add("https://arxiv.org/pdf/2005.11401.pdf", data_type="pdf_file")

# Local file
app.add("/path/to/document.pdf", data_type="pdf_file")

# Plain text
app.add("The Earth orbits the Sun at about 67,000 mph.", data_type="text")

# YouTube video (extracts transcript)
app.add("https://www.youtube.com/watch?v=VIDEO_ID", data_type="youtube_video")

# CSV data
app.add("data.csv", data_type="csv")

# QnA pair
app.add(("What is Python?", "Python is a programming language."), data_type="qna_pair")

# Sitemap (bulk pages)
app.add("https://example.com/sitemap.xml", data_type="sitemap")

# With metadata
app.add(
    "https://example.com/article",
    data_type="web_page",
    metadata={"source": "blog", "topic": "AI", "date": "2025-01-15"}
)

Configuration

YAML Config

# config.yaml
app:
  config:
    name: my-assistant
    collect_metrics: false

llm:
  provider: openai
  config:
    model: gpt-4o
    temperature: 0.5
    max_tokens: 2000
    stream: true
    system_prompt: |
      You are a helpful AI assistant. Answer questions based on the provided context.
      If you don't know the answer, say so.

embedder:
  provider: openai
  config:
    model: text-embedding-3-small

vectordb:
  provider: chroma
  config:
    collection_name: my-collection
    dir: ./db
    allow_reset: true

chunker:
  chunk_size: 1000
  chunk_overlap: 200
  min_chunk_size: 100

from embedchain import App

app = App.from_config(config_path="config.yaml")

Python Config

from embedchain import App
from embedchain.config import AppConfig

app = App(config=AppConfig(
    collect_metrics=False,
    log_level="INFO"
))

# Or with detailed config: pass a dict that mirrors the YAML layout.
# AppConfig only carries app-level settings (as used above); the LLM, embedder,
# vector store and chunker are configured through App.from_config.
app = App.from_config(config={
    "llm": {
        "provider": "openai",
        "config": {
            "model": "gpt-4o",
            "temperature": 0.5,
            "max_tokens": 2000,
        },
    },
    "embedder": {
        "provider": "openai",
        "config": {"model": "text-embedding-3-small"},
    },
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "my-docs", "dir": "./db"},
    },
    "chunker": {
        "chunk_size": 1000,
        "chunk_overlap": 200,
    },
})

Provider Options

# OpenAI
llm:
  provider: openai
  config:
    model: gpt-4o

# Anthropic
llm:
  provider: anthropic
  config:
    model: claude-3-5-sonnet-20241022

# Ollama (local)
llm:
  provider: ollama
  config:
    model: llama3.1
    base_url: http://localhost:11434

# HuggingFace
llm:
  provider: huggingface
  config:
    model: mistralai/Mistral-7B-Instruct-v0.2

Advanced Usage

Search Without Generation

# Retrieve relevant chunks without LLM generation
contexts = app.search("What is machine learning?", num_documents=5)
for context in contexts:
    print(f"Source: {context['metadata']['url']}")
    print(f"Text: {context['context'][:200]}")
    print("---")

Streaming Responses

# Stream responses
# NOTE(review): streaming is normally enabled through the LLM config
# (`stream: true`, see the YAML Config section and the Troubleshooting table);
# confirm that chat() accepts a `stream` keyword in the installed version.
for chunk in app.chat("Explain RAG in detail", stream=True):
    print(chunk, end="", flush=True)

Deploy as API

from embedchain import App
from fastapi import FastAPI

fast_app = FastAPI()
ec_app = App()

# Pre-load data
ec_app.add("https://docs.example.com", data_type="web_page")

@fast_app.post("/query")
async def query(question: str):
    """Answer a one-off question from the pre-loaded knowledge base."""
    return {"answer": ec_app.query(question)}

@fast_app.post("/chat")
async def chat(message: str, session_id: str = "default"):
    """Send a chat message and return the assistant's reply.

    Args:
        message: The user's message.
        session_id: Conversation identifier, forwarded to Embedchain so each
            session keeps its own chat history instead of all callers
            sharing one.
    """
    response = ec_app.chat(message, session_id=session_id)
    return {"response": response}

@fast_app.post("/add")
async def add_source(url: str, data_type: str = "web_page"):
    """Ingest ``url`` as ``data_type`` and report that it was added."""
    ec_app.add(url, data_type=data_type)
    outcome = {"status": "added"}
    return outcome

Custom Data Loader

from embedchain.loaders.base import BaseLoader

class CustomAPILoader(BaseLoader):
    """Loader that turns a JSON API response into Embedchain documents.

    Expects the endpoint to return a JSON object with a "results" list whose
    items each carry "text" and "id" fields.
    """

    def load_data(self, url):
        """Fetch ``url`` and convert each result into a document.

        Args:
            url: Endpoint returning JSON shaped like
                ``{"results": [{"id": ..., "text": ...}, ...]}``.

        Returns:
            A dict with ``doc_id`` (the source URL) and ``data``, a list of
            ``{"content", "meta_data"}`` documents.

        Raises:
            requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
            requests.Timeout: If the endpoint does not respond in time.
        """
        import requests

        # Bound the wait and fail loudly on HTTP errors instead of trying to
        # parse an error page as results.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        payload = response.json()

        documents = [
            {
                "content": item["text"],
                "meta_data": {
                    "url": url,
                    "id": item["id"],
                    "source": "custom_api",
                },
            }
            for item in payload["results"]
        ]
        return {"doc_id": url, "data": documents}

# Register custom loader
app.add("https://api.example.com/data", data_type="custom", loader=CustomAPILoader())

Multiple Apps (Multi-Brain)

# Create separate knowledge bases (one Chroma collection per app)
legal_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "legal"},
    },
})

tech_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "tech"},
    },
})

legal_bot.add("legal_docs.pdf", data_type="pdf_file")
tech_bot.add("tech_docs.pdf", data_type="pdf_file")

legal_answer = legal_bot.query("What are the compliance requirements?")
tech_answer = tech_bot.query("How do I deploy the application?")

Filtering and Metadata

# Query with metadata filters
# With citations=True, query() returns an (answer, sources) tuple, where each
# source is a (chunk_text, metadata) pair.
# NOTE(review): operator filters such as $gte depend on the vector store
# backend; confirm it supports range comparison on string dates.
answer, sources = app.query(
    "What happened recently?",
    where={"source": "news", "date": {"$gte": "2025-01-01"}},
    citations=True
)

print(f"Answer: {answer}")
for _chunk_text, metadata in sources:
    print(f"Source: {metadata['url']}")

Troubleshooting

Issue	Solution
OpenAI key error	Set `OPENAI_API_KEY` environment variable
Empty answers	Verify data was added, check with `app.search()`
Slow ingestion	Reduce chunk_size, use batch adding
YouTube transcript fails	Video may not have captions available
PDF parsing error	Install `pypdf`: `pip install pypdf`
Vector store full	Increase storage limits or use cloud provider
Duplicate data	Embedchain deduplicates by default on source URL
Streaming not working	Set `stream: true` in LLM config

# Verify setup
python -c "from embedchain import App; app = App(); print('Embedchain ready')"

# Check vector store
python -c "
from embedchain import App
app = App()
print(f'Documents: {app.count()}')
"

# Reset database
python -c "
from embedchain import App
app = App()
app.reset()
print('Database reset')
"