Skip to content

Embedchain Cheat Sheet

Overview

Embedchain (now part of mem0) is a framework for building AI applications that can learn from any data source. It abstracts away the complexity of chunking, embedding, storing, and retrieving data, allowing developers to create RAG-powered chatbots and assistants with just a few lines of code. Embedchain supports 30+ data sources including web pages, PDFs, YouTube videos, databases, and APIs.

The framework automatically handles document loading, text splitting, embedding generation, vector storage, and retrieval when answering questions. It supports multiple LLM providers (OpenAI, Anthropic, Ollama, HuggingFace), vector databases (Chroma, Pinecone, Qdrant, Weaviate), and embedding models, making it one of the simplest ways to build a RAG application.

Installation

pip install embedchain

# With specific vector store
pip install "embedchain[pinecone]"
pip install "embedchain[qdrant]"
pip install "embedchain[weaviate]"

# With all dependencies
pip install "embedchain[all]"

Core Usage

Quick Start

from embedchain import App

# Create app (uses OpenAI by default)
app = App()

# Add data from various sources
app.add("https://en.wikipedia.org/wiki/Machine_learning")
# pdf_file needs a direct link to the PDF; the arXiv /abs/ URL is an HTML page
app.add("https://arxiv.org/pdf/2005.11401.pdf", data_type="pdf_file")
app.add("Embedchain is a RAG framework", data_type="text")
app.add("https://www.youtube.com/watch?v=dQw4w9WgXcQ", data_type="youtube_video")

# Query (single question, no conversation history)
answer = app.query("What is machine learning?")
print(answer)

# Chat (maintains conversation history)
response = app.chat("Tell me about RAG")
print(response)
response = app.chat("How does it compare to fine-tuning?")
print(response)

Supported Data Sources

| Data Type | Identifier | Input |
| --- | --- | --- |
| Web page | web_page | URL |
| PDF | pdf_file | URL or file path |
| YouTube | youtube_video | YouTube URL |
| Text | text | Raw text string |
| CSV | csv | File path |
| JSON | json | File path |
| DOCX | docx | File path |
| Markdown | mdx | File path |
| Sitemap | sitemap | Sitemap URL |
| XML | xml | File path |
| Notion | notion | Notion page URL |
| Substack | substack | Substack URL |
| Discourse | discourse | Discourse URL |
| QnA pairs | qna_pair | Tuple of (question, answer) |
| Code docs | docs_site | Docs URL |
| Image | image | Image URL/path |
| Custom | custom | Custom loader |

Adding Data

from embedchain import App

app = App()

# (source, data_type) pairs, ingested in the order listed
sources = [
    # Web page
    ("https://docs.python.org/3/tutorial/", "web_page"),
    # PDF from URL
    ("https://arxiv.org/pdf/2005.11401.pdf", "pdf_file"),
    # Local file
    ("/path/to/document.pdf", "pdf_file"),
    # Plain text
    ("The Earth orbits the Sun at about 67,000 mph.", "text"),
    # YouTube video (extracts transcript)
    ("https://www.youtube.com/watch?v=VIDEO_ID", "youtube_video"),
    # CSV data
    ("data.csv", "csv"),
    # QnA pair: the source itself is a (question, answer) tuple
    (("What is Python?", "Python is a programming language."), "qna_pair"),
    # Sitemap (bulk pages)
    ("https://example.com/sitemap.xml", "sitemap"),
]
for source, kind in sources:
    app.add(source, data_type=kind)

# With metadata: extra key/value pairs are passed through the metadata keyword
article_metadata = {"source": "blog", "topic": "AI", "date": "2025-01-15"}
app.add(
    "https://example.com/article",
    data_type="web_page",
    metadata=article_metadata,
)

Configuration

YAML Config

# config.yaml
app:
  config:
    name: my-assistant
    collect_metrics: false

llm:
  provider: openai
  config:
    model: gpt-4o
    temperature: 0.5
    max_tokens: 2000
    stream: true
    system_prompt: |
      You are a helpful AI assistant. Answer questions based on the provided context.
      If you don't know the answer, say so.

embedder:
  provider: openai
  config:
    model: text-embedding-3-small

vectordb:
  provider: chroma
  config:
    collection_name: my-collection
    dir: ./db
    allow_reset: true

chunker:
  chunk_size: 1000
  chunk_overlap: 200
  min_chunk_size: 100
# Python: build the app from the config.yaml file
from embedchain import App

yaml_path = "config.yaml"
app = App.from_config(config_path=yaml_path)

Python Config

from embedchain import App
from embedchain.config import AppConfig

# App-level settings only: metrics collection off, INFO-level logging
app_config = AppConfig(log_level="INFO", collect_metrics=False)
app = App(config=app_config)

# Or with detailed config
# Per-component settings (LLM, embedder, vector DB, chunker) are passed to
# App.from_config as a plain dict that mirrors the layout of the YAML file.
app = App.from_config(config={
    "llm": {
        "provider": "openai",
        "config": {
            "model": "gpt-4o",
            "temperature": 0.5,
            "max_tokens": 2000,
        },
    },
    "embedder": {
        "provider": "openai",
        "config": {"model": "text-embedding-3-small"},
    },
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "my-docs", "dir": "./db"},
    },
    "chunker": {
        "chunk_size": 1000,
        "chunk_overlap": 200,
    },
})

Provider Options

# OpenAI
llm:
  provider: openai
  config:
    model: gpt-4o

# Anthropic
llm:
  provider: anthropic
  config:
    model: claude-3-5-sonnet-20241022

# Ollama (local)
llm:
  provider: ollama
  config:
    model: llama3.1
    base_url: http://localhost:11434

# HuggingFace
llm:
  provider: huggingface
  config:
    model: mistralai/Mistral-7B-Instruct-v0.2

Advanced Usage

Search Without Generation

# Fetch the best-matching chunks only; no LLM answer is generated
contexts = app.search("What is machine learning?", num_documents=5)
for hit in contexts:
    source_url = hit['metadata']['url']
    print(f"Source: {source_url}")
    preview = hit['context'][:200]  # first 200 characters of the chunk
    print(f"Text: {preview}")
    print("---")

Streaming Responses

# Stream responses: print each chunk without a trailing newline, flushing
# immediately so output appears incrementally.
# NOTE(review): the troubleshooting table says streaming requires
# `stream: true` in the LLM config — confirm that chat() also accepts the
# `stream` keyword in the installed version.
for chunk in app.chat("Explain RAG in detail", stream=True):
    print(chunk, end="", flush=True)

Deploy as API

from embedchain import App
from fastapi import FastAPI

# FastAPI application object; the route decorators below attach to it
fast_app = FastAPI()
# Embedchain app created with default settings, shared by all endpoints
ec_app = App()

# Pre-load data
ec_app.add("https://docs.example.com", data_type="web_page")

@fast_app.post("/query")
def query(question: str):
    """Answer a single question and return it as ``{"answer": ...}``.

    Declared with plain ``def`` because ``ec_app.query`` is a blocking call:
    FastAPI runs non-async handlers in a worker thread, so the event loop is
    not stalled while the answer is being generated.
    """
    return {"answer": ec_app.query(question)}

@fast_app.post("/chat")
def chat(message: str, session_id: str = "default"):
    """Send one chat message and return it as ``{"response": ...}``.

    ``session_id`` is forwarded to ``ec_app.chat`` so that each caller gets
    its own conversation history instead of all callers sharing one.

    Declared with plain ``def`` because ``ec_app.chat`` is a blocking call;
    FastAPI runs non-async handlers in a worker thread.
    """
    response = ec_app.chat(message, session_id=session_id)
    return {"response": response}

@fast_app.post("/add")
def add_source(url: str, data_type: str = "web_page"):
    """Ingest a new source and return ``{"status": "added"}``.

    Declared with plain ``def`` because ``ec_app.add`` is a blocking call;
    FastAPI runs non-async handlers in a worker thread.

    NOTE(review): ``url`` comes straight from the request; restrict or
    validate it before exposing this endpoint publicly.
    """
    ec_app.add(url, data_type=data_type)
    return {"status": "added"}

Custom Data Loader

import requests

from embedchain.loaders.base import BaseLoader


class CustomAPILoader(BaseLoader):
    """Loader for a JSON API whose response has the shape ``{"results": [...]}``.

    Every item in ``results`` is expected to carry ``text`` and ``id`` keys.
    """

    def load_data(self, url):
        """Fetch *url* and convert each result item into a document.

        Args:
            url: Endpoint to request; also used as the returned ``doc_id``.

        Returns:
            A dict with ``doc_id`` and ``data``, where ``data`` is a list of
            ``{"content": ..., "meta_data": {...}}`` entries.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the payload lacks ``results``, ``text`` or ``id``.
        """
        # A timeout keeps ingestion from hanging on an unresponsive server
        response = requests.get(url, timeout=30)
        # Fail loudly on HTTP errors instead of trying to index an error body
        response.raise_for_status()
        payload = response.json()

        documents = [
            {
                "content": item["text"],
                "meta_data": {
                    "url": url,
                    "id": item["id"],
                    "source": "custom_api",
                },
            }
            for item in payload["results"]
        ]
        return {"doc_id": url, "data": documents}


# Register custom loader
app.add("https://api.example.com/data", data_type="custom", loader=CustomAPILoader())

Multiple Apps (Multi-Brain)

# Create separate knowledge bases: each app gets its own Chroma collection,
# configured through the dict form of App.from_config.
legal_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "legal"},
    },
})

tech_bot = App.from_config(config={
    "vectordb": {
        "provider": "chroma",
        "config": {"collection_name": "tech"},
    },
})

legal_bot.add("legal_docs.pdf", data_type="pdf_file")
tech_bot.add("tech_docs.pdf", data_type="pdf_file")

# Each bot is queried independently
legal_answer = legal_bot.query("What are the compliance requirements?")
tech_answer = tech_bot.query("How do I deploy the application?")

Filtering and Metadata

# Query with metadata filters
# With citations=True, query() returns a tuple (answer_text, sources) rather
# than a dict; each source is a (chunk_text, metadata) pair.
# NOTE(review): range operators such as $gte may only accept numeric values
# in some vector stores — confirm string dates work with your backend.
answer, sources = app.query(
    "What happened recently?",
    where={"source": "news", "date": {"$gte": "2025-01-01"}},
    citations=True
)

print(f"Answer: {answer}")
for _chunk_text, metadata in sources:
    print(f"Source: {metadata['url']}")

Troubleshooting

| Issue | Solution |
| --- | --- |
| OpenAI key error | Set the OPENAI_API_KEY environment variable |
| Empty answers | Verify data was added; check with app.search() |
| Slow ingestion | Reduce chunk_size; use batch adding |
| YouTube transcript fails | Video may not have captions available |
| PDF parsing error | Install pypdf: pip install pypdf |
| Vector store full | Increase storage limits or use a cloud provider |
| Duplicate data | Embedchain deduplicates by default on source URL |
| Streaming not working | Set stream: true in the LLM config |
# Verify setup
python -c "from embedchain import App; app = App(); print('Embedchain ready')"

# Check vector store
python -c "
from embedchain import App
app = App()
print(f'Documents: {app.count()}')
"

# Reset database
python -c "
from embedchain import App
app = App()
app.reset()
print('Database reset')
"