TruLens Cheat Sheet
Overview
TruLens is an open-source framework for evaluating, tracking, and monitoring LLM applications. It provides feedback functions that score LLM outputs on dimensions like groundedness, relevance, and harmfulness, with a dashboard for visualizing evaluation results over time. TruLens wraps existing LLM apps (LangChain, LlamaIndex, or custom) to capture inputs, outputs, and intermediate steps for comprehensive evaluation.
The framework implements the RAG Triad evaluation methodology: context relevance (are retrieved documents relevant?), groundedness (is the answer supported by context?), and answer relevance (does the answer address the question?). TruLens supports both LLM-based evaluators and traditional NLP metrics, with a Streamlit dashboard for interactive exploration.
Installation
pip install trulens
# With specific providers / app integrations (published as separate packages)
pip install trulens-providers-openai
pip install trulens-apps-langchain
pip install trulens-apps-llamaindex
# Everything used in this cheat sheet
pip install trulens trulens-providers-openai trulens-providers-huggingface trulens-providers-bedrock trulens-apps-langchain trulens-apps-llamaindex
Core Concepts
Basic Setup
# The provider class is imported under the alias TruOpenAI
from trulens.core import TruSession
from trulens.providers.openai import OpenAI as TruOpenAI
# Initialize session (stores eval data)
session = TruSession()
# NOTE(review): presumably wipes previously stored evaluation data — confirm before running against a shared database
session.reset_database() # Start fresh (optional)
# Create feedback provider; its methods are what the Feedback(...) definitions call
provider = TruOpenAI(model_engine="gpt-4o-mini")
Feedback Functions
from trulens.core import Feedback, Select
# Answer relevance: Is the answer relevant to the question?
# Scored from the app's input together with its output.
f_answer_relevance = (
Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
.on_input()
.on_output()
)
# Context relevance: Are retrieved documents relevant?
# The selector points at the return values of a recorded `retrieve` call, so the
# wrapped app is assumed to expose an instrumented `retrieve` method — verify
# against your app. The aggregate step averages the individual scores and
# returns 0 when there are none.
f_context_relevance = (
Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
.on_input()
.on(Select.RecordCalls.retrieve.rets[:])
.aggregate(lambda scores: sum(scores) / len(scores) if scores else 0)
)
# Groundedness: Is the answer grounded in retrieved context?
# The groundedness measure is a method of the feedback provider itself; the
# separate `Groundedness(groundedness_provider=...)` wrapper class belongs to
# the legacy API and is not needed.
# The selector collects everything returned by the recorded `retrieve` call
# into a single context argument, then pairs it with the app's output.
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(Select.RecordCalls.retrieve.rets[:].collect())
    .on_output()
)
# Harmfulness check
# Scored from the app's output only (no input or context selector).
f_harmfulness = (
Feedback(provider.harmfulness_with_cot_reasons, name="Harmfulness")
.on_output()
)
Wrapping a Custom App
from trulens.apps.basic import TruBasicApp
def my_rag_app(question):
    """Answer `question` with a retrieve-then-generate pipeline.

    `retrieve` and `generate` are placeholders for your own retrieval and
    generation functions; the generated answer is returned.
    """
    # Your RAG logic here
    context = retrieve(question)
    answer = generate(question, context)
    return answer
# TruBasicApp takes the text-to-text callable as its first argument
# (`text_to_text`); the `app=` keyword expects an already-wrapped object,
# not a plain function.
tru_app = TruBasicApp(
    my_rag_app,
    app_name="my-rag",
    app_version="v1",
    # NOTE(review): f_groundedness and f_context_relevance select
    # `Select.RecordCalls.retrieve`. TruBasicApp wraps a single callable, so the
    # inner retrieve() call is presumably not recorded — confirm, or keep only
    # f_answer_relevance here and use an app with an instrumented `retrieve`
    # method for the context-based feedbacks.
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance],
)
# Run with recording: calls made inside the context are captured and evaluated
with tru_app as recording:
    result = tru_app.app("What is retrieval-augmented generation?")
    print(result)
# View results
session.get_leaderboard()
Wrapping LangChain
from trulens.apps.langchain import TruChain
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# Build LangChain RAG
llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
# Wrap with TruLens
# NOTE(review): the context-based feedbacks select `Select.RecordCalls.retrieve`;
# a LangChain chain has no `retrieve` method of its own, so the context selector
# likely needs to be adapted (e.g. via TruChain.select_context(chain)) — confirm.
tru_chain = TruChain(
    chain,
    app_name="langchain-rag",
    app_version="v1",
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance],
)
# Run and evaluate: invoke the wrapped chain itself inside the recorder context.
# The recorder (`tru_chain`) does not pass calls through to the chain.
with tru_chain as recording:
    response = chain.invoke({"query": "How does vector search work?"})
Wrapping LlamaIndex
from trulens.apps.llamaindex import TruLlama
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
# Build LlamaIndex app
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
# Wrap with TruLens
# NOTE(review): the context-based feedbacks select `Select.RecordCalls.retrieve`;
# a LlamaIndex query engine likely needs a different context selector
# (e.g. TruLlama.select_context(query_engine)) — confirm.
tru_llama = TruLlama(
    query_engine,
    app_name="llamaindex-rag",
    app_version="v1",
    feedbacks=[f_answer_relevance, f_groundedness, f_context_relevance],
)
# Run and evaluate: query the wrapped engine itself inside the recorder context.
# The recorder (`tru_llama`) does not pass calls through to the engine.
with tru_llama as recording:
    response = query_engine.query("Explain the architecture of the system")
Dashboard
# Launch Streamlit dashboard
# The session passed in is the one holding the evaluation data to display.
from trulens.dashboard import run_dashboard
run_dashboard(session, port=8501)
# Opens at http://localhost:8501
# CLI launch
# NOTE(review): `trulens-eval` looks like the console script of the legacy
# trulens_eval package — confirm it is installed by the `trulens` package used here
trulens-eval --port 8501
Dashboard Features
| View | Description |
|---|---|
| Leaderboard | Compare app versions by aggregate scores |
| Records | Browse individual query/response pairs |
| Feedback | Detailed feedback scores with reasons |
| App | Trace internal component calls |
| Compare | Side-by-side version comparison |
Available Feedback Functions
Quality Metrics
# Coherence
f_coherence = Feedback(provider.coherence_with_cot_reasons).on_output()
# Conciseness
f_conciseness = Feedback(provider.conciseness_with_cot_reasons).on_output()
# Correctness (LLM-judged on the output text alone, so only the output is
# selected; to compare against ground-truth answers use GroundTruthAgreement
# from trulens.feedback instead)
f_correctness = Feedback(provider.correctness_with_cot_reasons).on_output()
# Comprehensiveness (takes two texts: the input as the source, the output as
# the summary being judged)
f_comprehensive = (
    Feedback(provider.comprehensiveness_with_cot_reasons)
    .on_input()
    .on_output()
)
Safety Metrics
# All safety feedbacks below score the app's output only.
# Toxicity
# NOTE(review): confirm `toxicity_with_cot_reasons` exists on your provider class;
# the other safety checks here follow the same `<criterion>_with_cot_reasons` naming
f_toxicity = Feedback(provider.toxicity_with_cot_reasons).on_output()
# Controversiality
f_controversial = Feedback(provider.controversiality_with_cot_reasons).on_output()
# Criminality
f_criminal = Feedback(provider.criminality_with_cot_reasons).on_output()
# Insensitivity
f_insensitivity = Feedback(provider.insensitivity_with_cot_reasons).on_output()
Configuration
Using Different Providers
# Each option below rebinds `provider`; pick one rather than running them all.
# OpenAI
from trulens.providers.openai import OpenAI
provider = OpenAI(model_engine="gpt-4o")
# Azure OpenAI
# NOTE(review): API key / API version are not passed here — presumably read
# from environment variables; confirm for your deployment
from trulens.providers.openai import AzureOpenAI
provider = AzureOpenAI(
deployment_name="gpt-4o",
azure_endpoint="https://your-resource.openai.azure.com/"
)
# HuggingFace (local)
from trulens.providers.huggingface import HuggingfaceLocal
provider = HuggingfaceLocal()
# Bedrock
# NOTE(review): confirm the exact Bedrock model ID for your account/region
from trulens.providers.bedrock import Bedrock
provider = Bedrock(model_id="anthropic.claude-3-sonnet")
Database Backends
# Each option below rebinds `session`; pick one. Replace the user/pass/host
# placeholders in the URLs with your own connection details.
# SQLite (default)
session = TruSession(database_url="sqlite:///trulens.db")
# PostgreSQL
session = TruSession(database_url="postgresql://user:pass@localhost:5432/trulens")
# MySQL
session = TruSession(database_url="mysql://user:pass@localhost:3306/trulens")
Advanced Usage
Custom Feedback Functions
from trulens.core import Feedback
def custom_technical_score(question: str, answer: str) -> float:
    """Score technical accuracy from 0 to 1.

    Args:
        question: The question the answer responds to.
        answer: The answer text to rate.

    Returns:
        The score taken from the LLM reply, clamped to the range [0.0, 1.0].

    Raises:
        ValueError: If no number can be found in the LLM reply.
    """
    import re  # local import keeps this snippet self-contained

    prompt = (
        "Rate the technical accuracy of this answer on a scale of 0-1.\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n"
        "Score (0-1):"
    )
    # Use your preferred LLM
    reply = call_llm(prompt)
    try:
        score = float(reply)
    except ValueError:
        # The reply may wrap the number in text ("Score: 0.8"); take the first number.
        found = re.search(r"\d*\.?\d+", str(reply))
        if found is None:
            raise ValueError(f"No score found in LLM reply: {reply!r}") from None
        score = float(found.group())
    # The contract is a score from 0 to 1, so clamp out-of-range replies.
    return min(1.0, max(0.0, score))
# Wrap the custom scorer as a feedback: the app input is passed as `question`
# and the app output as `answer` (selectors are applied in argument order —
# TODO confirm against the Feedback API).
f_technical = (
Feedback(custom_technical_score, name="Technical Accuracy")
.on_input()
.on_output()
)
Batch Evaluation
questions = [
    "What is RAG?",
    "How do vector databases work?",
    "Explain transformer architecture",
]
# Run every question inside one recording context
with tru_app as recording:
    for q in questions:
        result = tru_app.app(q)
        print(f"Q: {q}\nA: {result}\n")
# Get aggregated results
leaderboard = session.get_leaderboard()
print(leaderboard)
# Export records: get_records_and_feedback() returns a pair
# (records DataFrame, feedback column names), so unpack before exporting
records, feedback_columns = session.get_records_and_feedback()
records.to_csv("evaluation_results.csv")
A/B Testing
# Both versions share app_name and differ only in app_version, so they are
# compared side by side. The callable goes in the first (text-to-text)
# argument of TruBasicApp, not in `app=`.
# Version A
tru_app_v1 = TruBasicApp(rag_v1, app_name="rag", app_version="v1", feedbacks=feedbacks)
# Version B
tru_app_v2 = TruBasicApp(rag_v2, app_name="rag", app_version="v2", feedbacks=feedbacks)
# Send every test question through both versions
for q in test_questions:
    with tru_app_v1:
        tru_app_v1.app(q)
    with tru_app_v2:
        tru_app_v2.app(q)
# Compare versions (also visible in the dashboard)
leaderboard = session.get_leaderboard()
# reset_index() makes app_version selectable as a regular column even when the
# leaderboard is indexed by app name/version
print(leaderboard.reset_index()[["app_version", "Answer Relevance", "Groundedness"]])
Troubleshooting
| Issue | Solution |
|---|---|
| Dashboard not loading | Check port 8501 is free, run trulens-eval CLI |
| Feedback scores all None | Verify API keys, check provider connection |
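| Recording not capturing context | Ensure Select paths match your app structure (e.g. `Select.RecordCalls.retrieve` only resolves if the wrapped app has an instrumented `retrieve` method) |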
| Database locked (SQLite) | Switch to PostgreSQL for concurrent access |
| Slow evaluation | Use gpt-4o-mini provider, reduce feedback count |
| Import errors | Update: pip install -U trulens |
| LangChain wrapper fails | Check LangChain version compatibility |
| Memory issues with large datasets | Use PostgreSQL backend, paginate queries |
# Reset database
# NOTE(review): presumably deletes stored evaluation data in the default database — confirm before running
python -c "from trulens.core import TruSession; TruSession().reset_database()"
# Check installed version
pip show trulens
# Export all data
# get_records_and_feedback() returns (records DataFrame, feedback column names);
# unpack it so to_parquet is called on the DataFrame
python -c "
from trulens.core import TruSession
s = TruSession()
records, _ = s.get_records_and_feedback()
records.to_parquet('trulens_export.parquet')
"