Embedchain Cheat Sheet
Overview
Embedchain (now part of mem0) is a framework for building AI applications that can learn from any data source. It abstracts away the complexity of chunking, embedding, storing, and retrieving data, allowing developers to create RAG-powered chatbots and assistants with just a few lines of code. Embedchain supports 30+ data sources including web pages, PDFs, YouTube videos, databases, and APIs.
The framework automatically handles document loading, text splitting, embedding generation, vector storage, and retrieval when answering questions. It supports multiple LLM providers (OpenAI, Anthropic, Ollama, HuggingFace), vector databases (Chroma, Pinecone, Qdrant, Weaviate), and embedding models, making it one of the simplest ways to build a RAG application.
Installation
pip install embedchain
# With specific vector store
pip install "embedchain[pinecone]"
pip install "embedchain[qdrant]"
pip install "embedchain[weaviate]"
# With all dependencies
pip install "embedchain[all]"
Core Usage
Quick Start
from embedchain import App

# Create app (uses OpenAI by default)
app = App()

# Add data from various sources
app.add("https://en.wikipedia.org/wiki/Machine_learning")
# Use the direct PDF link: the arXiv /abs/ URL is an HTML landing page, not a PDF
app.add("https://arxiv.org/pdf/2005.11401.pdf", data_type="pdf_file")
app.add("Embedchain is a RAG framework", data_type="text")
app.add("https://www.youtube.com/watch?v=dQw4w9WgXcQ", data_type="youtube_video")

# Query
answer = app.query("What is machine learning?")
print(answer)

# Chat (maintains conversation history)
response = app.chat("Tell me about RAG")
print(response)
response = app.chat("How does it compare to fine-tuning?")
print(response)
Supported Data Sources
| Data Type | Identifier | Input |
|---|---|---|
| Web page | web_page | URL |
| PDF | pdf_file | URL or file path |
| YouTube | youtube_video | YouTube URL |
| Text | text | Raw text string |
| CSV | csv | File path |
| JSON | json | File path |
| DOCX | docx | File path |
| Markdown | mdx | File path |
| Sitemap | sitemap | Sitemap URL |
| XML | xml | File path |
| Notion | notion | Notion page URL |
| Substack | substack | Substack URL |
| Discourse | discourse | Discourse URL |
| QnA pairs | qna_pair | Tuple of (question, answer) |
| Code docs | docs_site | Docs URL |
| Image | image | Image URL/path |
| Custom | custom | Custom loader |
Adding Data
from embedchain import App
app = App()
# Each app.add() call below ingests one source; data_type is one of the
# identifiers listed in the "Supported Data Sources" table.

# Web page
app.add("https://docs.python.org/3/tutorial/", data_type="web_page")
# PDF from URL (direct link to the .pdf file)
app.add("https://arxiv.org/pdf/2005.11401.pdf", data_type="pdf_file")
# Local PDF file, addressed by filesystem path
app.add("/path/to/document.pdf", data_type="pdf_file")
# Plain text passed inline as a string
app.add("The Earth orbits the Sun at about 67,000 mph.", data_type="text")
# YouTube video (extracts transcript)
app.add("https://www.youtube.com/watch?v=VIDEO_ID", data_type="youtube_video")
# CSV data, addressed by file path
app.add("data.csv", data_type="csv")
# QnA pair, passed as a (question, answer) tuple
app.add(("What is Python?", "Python is a programming language."), data_type="qna_pair")
# Sitemap (bulk pages)
app.add("https://example.com/sitemap.xml", data_type="sitemap")
# With metadata: the dict is attached to the added source
app.add(
    "https://example.com/article",
    data_type="web_page",
    metadata={"source": "blog", "topic": "AI", "date": "2025-01-15"}
)
Configuration
YAML Config
# config.yaml
# One top-level section per component; provider-specific settings nest under
# that section's `config` key.
app:
  config:
    name: my-assistant
    collect_metrics: false

llm:
  provider: openai
  config:
    model: gpt-4o
    temperature: 0.5
    max_tokens: 2000
    stream: true
    system_prompt: |
      You are a helpful AI assistant. Answer questions based on the provided context.
      If you don't know the answer, say so.

embedder:
  provider: openai
  config:
    model: text-embedding-3-small

vectordb:
  provider: chroma
  config:
    collection_name: my-collection
    dir: ./db
    allow_reset: true

chunker:
  chunk_size: 1000
  chunk_overlap: 200
  min_chunk_size: 100
from embedchain import App
app = App.from_config(config_path="config.yaml")
Python Config
from embedchain import App
from embedchain.config import AppConfig
app = App(config=AppConfig(
collect_metrics=False,
log_level="INFO"
))
# Or with detailed config: pass a dict with the same layout as the YAML file.
# Per-component settings (llm / embedder / vectordb / chunker) are loaded
# through App.from_config; AppConfig itself only carries app-level options.
app = App.from_config(config={
    "llm": {
        "provider": "openai",
        "config": {
            "model": "gpt-4o",
            "temperature": 0.5,
            "max_tokens": 2000,
        },
    },
    "embedder": {
        "provider": "openai",
        "config": {"model": "text-embedding-3-small"},
    },
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "my-docs", "dir": "./db"},
    },
    "chunker": {
        "chunk_size": 1000,
        "chunk_overlap": 200,
    },
})
Provider Options
# The four `llm:` blocks below are alternatives: keep exactly one per config file.

# OpenAI
llm:
  provider: openai
  config:
    model: gpt-4o

# Anthropic
llm:
  provider: anthropic
  config:
    model: claude-3-5-sonnet-20241022

# Ollama (local)
llm:
  provider: ollama
  config:
    model: llama3.1
    base_url: http://localhost:11434

# HuggingFace
llm:
  provider: huggingface
  config:
    model: mistralai/Mistral-7B-Instruct-v0.2
Advanced Usage
Search Without Generation
# Retrieve relevant chunks without LLM generation
contexts = app.search("What is machine learning?", num_documents=5)
# Each result is indexed as a dict with a 'context' string and a 'metadata' dict
for context in contexts:
    print(f"Source: {context['metadata']['url']}")
    print(f"Text: {context['context'][:200]}")  # first 200 characters only
    print("---")
Streaming Responses
# Stream responses: iterate over the chunks and print them as they arrive
# NOTE(review): the Troubleshooting table says streaming is enabled with
# `stream: true` in the LLM config; confirm the per-call `stream` keyword is
# honored by your embedchain version.
for chunk in app.chat("Explain RAG in detail", stream=True):
    print(chunk, end="", flush=True)
Deploy as API
from embedchain import App
from fastapi import FastAPI
fast_app = FastAPI()
ec_app = App()
# Pre-load data
ec_app.add("https://docs.example.com", data_type="web_page")
@fast_app.post("/query")
async def query(question: str):
    """Answer one question with ``ec_app.query`` and return it under "answer"."""
    # NOTE(review): ec_app.query is called synchronously inside an async
    # handler; if it blocks, it holds the event loop for the whole call.
    answer = ec_app.query(question)
    return {"answer": answer}
@fast_app.post("/chat")
async def chat(message: str, session_id: str = "default"):
    """Send one chat message for the given session and return the reply.

    ``session_id`` is forwarded to ``ec_app.chat`` so each session keeps its
    own conversation history instead of all callers sharing one.
    """
    response = ec_app.chat(message, session_id=session_id)
    return {"response": response}
@fast_app.post("/add")
async def add_source(url: str, data_type: str = "web_page"):
    """Add ``url`` to the shared ``ec_app`` as a source of type ``data_type``."""
    ec_app.add(url, data_type=data_type)
    return {"status": "added"}
Custom Data Loader
from embedchain.loaders.base import BaseLoader
class CustomAPILoader(BaseLoader):
    """Loader that fetches JSON from an HTTP endpoint.

    Each entry of the response's "results" list becomes one document.
    """

    def load_data(self, url):
        """Fetch ``url`` and return its results as loader documents.

        Returns a dict with "doc_id" (the URL itself) and "data", a list of
        ``{"content", "meta_data"}`` dicts, one per entry in "results".

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
            KeyError: if the payload, or an entry in "results", lacks an
                expected key.
        """
        import requests

        # Bound the wait and fail on HTTP errors before parsing the body.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        payload = response.json()

        documents = [
            {
                "content": item["text"],
                "meta_data": {
                    "url": url,
                    "id": item["id"],
                    "source": "custom_api",
                },
            }
            for item in payload["results"]
        ]
        return {"doc_id": url, "data": documents}
# Register custom loader
app.add("https://api.example.com/data", data_type="custom", loader=CustomAPILoader())
Multiple Apps (Multi-Brain)
# Create separate knowledge bases: one app per Chroma collection.
# The vector store is configured through App.from_config; AppConfig does not
# take a vectordb argument.
legal_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "legal"},
    },
})
tech_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "tech"},
    },
})

# Each bot ingests and answers from its own collection only
legal_bot.add("legal_docs.pdf", data_type="pdf_file")
tech_bot.add("tech_docs.pdf", data_type="pdf_file")

legal_answer = legal_bot.query("What are the compliance requirements?")
tech_answer = tech_bot.query("How do I deploy the application?")
Filtering and Metadata
# Query with metadata filters
# With citations=True the call returns a pair: the answer text and a list of
# (chunk_text, metadata) tuples for the chunks used.
# NOTE(review): operator filters such as "$gte" depend on the vector store;
# confirm your backend supports them for this field type.
answer, sources = app.query(
    "What happened recently?",
    where={"source": "news", "date": {"$gte": "2025-01-01"}},
    citations=True,
)
print(f"Answer: {answer}")
for _chunk, metadata in sources:
    print(f"Source: {metadata['url']}")
Troubleshooting
| Issue | Solution |
|---|---|
| OpenAI key error | Set OPENAI_API_KEY environment variable |
| Empty answers | Verify data was added, check with app.search() |
| Slow ingestion | Reduce chunk_size, use batch adding |
| YouTube transcript fails | Video may not have captions available |
| PDF parsing error | Install pypdf: pip install pypdf |
| Vector store full | Increase storage limits or use cloud provider |
| Duplicate data | Embedchain deduplicates by default on source URL |
| Streaming not working | Set stream: true in LLM config |
# Verify setup
python -c "from embedchain import App; app = App(); print('Embedchain ready')"
# Check vector store
python -c "
from embedchain import App
app = App()
print(f'Documents: {app.count()}')
"
# Reset database
python -c "
from embedchain import App
app = App()
app.reset()
print('Database reset')
"