Skip to content

TruLens Cheat Sheet

Overview

TruLens is an open-source framework for evaluating, tracking, and monitoring LLM applications. It provides feedback functions that score LLM outputs on dimensions like groundedness, relevance, and harmfulness, with a dashboard for visualizing evaluation results over time. TruLens wraps existing LLM apps (LangChain, LlamaIndex, or custom) to capture inputs, outputs, and intermediate steps for comprehensive evaluation.

The framework implements the RAG Triad evaluation methodology: context relevance (are retrieved documents relevant?), groundedness (is the answer supported by context?), and answer relevance (does the answer address the question?). TruLens supports both LLM-based evaluators and traditional NLP metrics, with a Streamlit dashboard for interactive exploration.

Installation

pip install trulens

# With specific providers / app integrations (published as separate packages)
pip install trulens trulens-providers-openai
pip install trulens trulens-apps-langchain
pip install trulens trulens-apps-llamaindex

# With all providers
pip install "trulens[all]"

Core Concepts

Basic Setup

from trulens.core import TruSession
from trulens.providers.openai import OpenAI as TruOpenAI

# Initialize session (stores eval data)
session = TruSession()
session.reset_database()  # Start fresh (optional)

# Create feedback provider
provider = TruOpenAI(model_engine="gpt-4o-mini")

Feedback Functions

from trulens.core import Feedback, Select

# Answer relevance: Is the answer relevant to the question?
f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input()
    .on_output()
)

# Context relevance: Are retrieved documents relevant?
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(Select.RecordCalls.retrieve.rets[:])
    .aggregate(lambda scores: sum(scores) / len(scores) if scores else 0)
)

# Groundedness: Is the answer grounded in retrieved context?
# The groundedness measure is called directly on the feedback provider; the
# separate `Groundedness(groundedness_provider=...)` helper belongs to the
# older trulens_eval API.
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    # collect() passes all retrieved chunks to the feedback as one argument
    .on(Select.RecordCalls.retrieve.rets[:].collect())
    .on_output()
)

# Harmfulness check
f_harmfulness = (
    Feedback(provider.harmfulness_with_cot_reasons, name="Harmfulness")
    .on_output()
)

Wrapping a Custom App

from trulens.apps.basic import TruBasicApp

def my_rag_app(question):
    """Answer *question* by retrieving context and generating from it.

    `retrieve` and `generate` stand in for your own RAG logic.
    """
    retrieved_context = retrieve(question)
    return generate(question, retrieved_context)

# Pass the plain callable as `text_to_text`; TruBasicApp wraps it so calls
# made through `tru_app.app(...)` are instrumented. The `app=` parameter
# expects an already-wrapped app object, not a raw function.
# NOTE(review): the context-based feedbacks select a `retrieve` call; confirm
# that call is actually recorded for this app, otherwise they cannot resolve.
tru_app = TruBasicApp(
    text_to_text=my_rag_app,
    app_name="my-rag",
    app_version="v1",
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance]
)

# Run with recording
with tru_app as recording:
    result = tru_app.app("What is retrieval-augmented generation?")
    print(result)

# View results
session.get_leaderboard()

Wrapping LangChain

from trulens.apps.langchain import TruChain
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Build LangChain RAG
llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Wrap with TruLens
tru_chain = TruChain(
    chain,
    app_name="langchain-rag",
    app_version="v1",
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance]
)

# Run and evaluate
# Call the wrapped chain itself inside the recording context: TruChain is the
# recorder and does not pass app methods such as invoke() through.
with tru_chain as recording:
    response = chain.invoke({"query": "How does vector search work?"})

Wrapping LlamaIndex

from trulens.apps.llamaindex import TruLlama
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Build LlamaIndex app
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# Wrap with TruLens
tru_llama = TruLlama(
    query_engine,
    app_name="llamaindex-rag",
    app_version="v1",
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance]
)

# Run and evaluate
# Call the wrapped query engine itself inside the recording context: TruLlama
# is the recorder and does not pass app methods such as query() through.
with tru_llama as recording:
    response = query_engine.query("Explain the architecture of the system")

Dashboard

# Launch Streamlit dashboard
from trulens.dashboard import run_dashboard

run_dashboard(session, port=8501)
# Opens at http://localhost:8501
# CLI launch
trulens-eval --port 8501

Dashboard Features

View | Description
Leaderboard | Compare app versions by aggregate scores
Records | Browse individual query/response pairs
Feedback | Detailed feedback scores with reasons
App | Trace internal component calls
Compare | Side-by-side version comparison

Available Feedback Functions

Quality Metrics

# Coherence
f_coherence = Feedback(provider.coherence_with_cot_reasons).on_output()

# Conciseness
f_conciseness = Feedback(provider.conciseness_with_cot_reasons).on_output()

# Correctness (requires ground truth)
f_correctness = (
    Feedback(provider.correctness_with_cot_reasons)
    .on_input()
    .on_output()
)

# Comprehensiveness
f_comprehensive = (
    Feedback(provider.comprehensiveness_with_cot_reasons)
    .on_input()
    .on_output()
)

Safety Metrics

# Toxicity
f_toxicity = Feedback(provider.toxicity_with_cot_reasons).on_output()

# Controversiality
f_controversial = Feedback(provider.controversiality_with_cot_reasons).on_output()

# Criminality
f_criminal = Feedback(provider.criminality_with_cot_reasons).on_output()

# Insensitivity
f_insensitivity = Feedback(provider.insensitivity_with_cot_reasons).on_output()

Configuration

Using Different Providers

# OpenAI
from trulens.providers.openai import OpenAI
provider = OpenAI(model_engine="gpt-4o")

# Azure OpenAI
from trulens.providers.openai import AzureOpenAI
provider = AzureOpenAI(
    deployment_name="gpt-4o",
    azure_endpoint="https://your-resource.openai.azure.com/"
)

# HuggingFace (local)
from trulens.providers.huggingface import HuggingfaceLocal
provider = HuggingfaceLocal()

# Bedrock
from trulens.providers.bedrock import Bedrock
# Bedrock expects the full, versioned model identifier
provider = Bedrock(model_id="anthropic.claude-3-sonnet-20240229-v1:0")

Database Backends

# SQLite (default)
session = TruSession(database_url="sqlite:///trulens.db")

# PostgreSQL
session = TruSession(database_url="postgresql://user:pass@localhost:5432/trulens")

# MySQL
session = TruSession(database_url="mysql://user:pass@localhost:3306/trulens")

Advanced Usage

Custom Feedback Functions

from trulens.core import Feedback

def custom_technical_score(question: str, answer: str) -> float:
    """Score the technical accuracy of an answer on a 0-1 scale.

    Builds a rating prompt from the question and answer, sends it to
    ``call_llm``, and converts the reply to a float.

    Args:
        question: The question that was asked.
        answer: The answer whose technical accuracy is being rated.

    Returns:
        The score parsed from the reply, clamped to the range [0.0, 1.0].

    Raises:
        ValueError: If the reply is text that contains no number.
    """
    import re  # local import keeps this snippet self-contained

    prompt = f"""Rate the technical accuracy of this answer on a scale of 0-1.
    Question: {question}
    Answer: {answer}
    Score (0-1):"""

    # Use your preferred LLM
    reply = call_llm(prompt)

    try:
        score = float(reply)
    except ValueError:
        # Replies often wrap the number in extra text ("Score: 0.8"), which a
        # bare float() rejects, so fall back to the first numeric token.
        found = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", str(reply))
        if found is None:
            raise ValueError(
                f"No numeric score found in LLM reply: {reply!r}"
            ) from None
        score = float(found.group())

    # Clamp so an out-of-range reply stays within the documented 0-1 scale.
    return min(1.0, max(0.0, score))

f_technical = (
    Feedback(custom_technical_score, name="Technical Accuracy")
    .on_input()
    .on_output()
)

Batch Evaluation

questions = [
    "What is RAG?",
    "How do vector databases work?",
    "Explain transformer architecture",
]

with tru_app as recording:
    for q in questions:
        result = tru_app.app(q)
        print(f"Q: {q}\nA: {result}\n")

# Get aggregated results
leaderboard = session.get_leaderboard()
print(leaderboard)

# Export records
# get_records_and_feedback() returns a (records DataFrame, feedback column
# names) tuple, so unpack it before calling DataFrame methods.
records, feedback_cols = session.get_records_and_feedback()
records.to_csv("evaluation_results.csv")

A/B Testing

# Pass each callable as `text_to_text`; TruBasicApp wraps it for recording.
# Version A
tru_app_v1 = TruBasicApp(text_to_text=rag_v1, app_name="rag", app_version="v1", feedbacks=feedbacks)
# Version B
tru_app_v2 = TruBasicApp(text_to_text=rag_v2, app_name="rag", app_version="v2", feedbacks=feedbacks)

for q in test_questions:
    with tru_app_v1:
        tru_app_v1.app(q)
    with tru_app_v2:
        tru_app_v2.app(q)

# Compare in dashboard
leaderboard = session.get_leaderboard()
# The leaderboard is indexed by app name/version; reset_index() turns those
# into regular columns so app_version can be selected next to the scores.
print(leaderboard.reset_index()[["app_version", "Answer Relevance", "Groundedness"]])

Troubleshooting

Issue | Solution
Dashboard not loading | Check port 8501 is free, run trulens-eval CLI
Feedback scores all None | Verify API keys, check provider connection
Recording not capturing context | Ensure Select paths match your app structure
Database locked (SQLite) | Switch to PostgreSQL for concurrent access
Slow evaluation | Use gpt-4o-mini provider, reduce feedback count
Import errors | Update: pip install -U trulens
LangChain wrapper fails | Check LangChain version compatibility
Memory issues with large datasets | Use PostgreSQL backend, paginate queries
# Reset database
python -c "from trulens.core import TruSession; TruSession().reset_database()"

# Check installed version
pip show trulens

# Export all data
# get_records_and_feedback() returns (records DataFrame, feedback column names)
python -c "
from trulens.core import TruSession
s = TruSession()
records, feedback_cols = s.get_records_and_feedback()
records.to_parquet('trulens_export.parquet')
"