Braintrust
Braintrust는 AI 애플리케이션의 평가, 테스트 및 모니터링을 위한 엔드 투 엔드 플랫폼입니다. 구조화된 실험, 내장 및 사용자 정의 스코어러, 데이터셋 관리, 트레이싱, 모델 액세스를 위한 프록시 및 CI 통합을 제공합니다.
설치
# Install the Braintrust SDK together with the autoevals scorers (JavaScript/TypeScript)
npm install braintrust autoevals
# or, for Python
pip install braintrust autoevals
# Set the API key in the environment (replace the placeholder with your own key)
export BRAINTRUST_API_KEY="your-api-key"
Running Evaluations
Basic Eval (TypeScript)
import { Eval } from "braintrust";
import { Factuality } from "autoevals";
// Define an evaluation for the "my-project" project from three parts:
// the test data, the task under test, and the scorers applied to its output.
Eval("my-project", {
  data: () => [
    { input: "What is 2+2?", expected: "4" },
    { input: "Capital of France?", expected: "Paris" },
  ],
  // Call your LLM or pipeline here
  task: async (input) => callMyLLM(input),
  scores: [Factuality],
});
Basic Eval (Python)
from braintrust import Eval
from autoevals import Factuality
# Define an evaluation for "my-project": inline test cases, the task under
# test, and the scorers applied to the task's output.
Eval(
    "my-project",
    data=lambda: [
        {"input": "What is 2+2?", "expected": "4"},
        {"input": "Capital of France?", "expected": "Paris"},
    ],
    task=lambda input: call_my_llm(input),
    scores=[Factuality],
)
Scorers
Built-in Scorers (autoevals)
from autoevals import (
Factuality, # LLM-graded factual correctness
ClosedQA, # Closed-book question answering
Battle, # Compare two outputs head-to-head
Summary, # Summarization quality
Translation, # Translation quality
Humor, # Humor evaluation
Sql, # SQL query correctness
)
# Use in an evaluation: pass the scorer classes themselves in `scores`.
Eval(
    "my-project",
    data=lambda: test_cases,
    task=lambda input: my_pipeline(input),
    scores=[Factuality, ClosedQA],
)
Custom Scorers
from braintrust import Eval
def exact_match(input, output, expected):
    """Custom scorer: 1.0 when output equals expected after trimming, else 0.0.

    Args:
        input: The test-case input (unused; part of the scorer signature).
        output: The value produced by the task. Non-string values are
            compared by their ``str()`` form.
        expected: The reference value, handled the same way as ``output``.

    Returns:
        A dict with the scorer ``name`` and a ``score`` of 1.0 or 0.0.
    """
    if output is None or expected is None:
        # A missing side can never be an exact match; score it instead of
        # crashing on ``None.strip()``.
        matched = False
    else:
        matched = str(output).strip() == str(expected).strip()
    return {
        "name": "ExactMatch",
        "score": 1.0 if matched else 0.0,
    }
def contains_keyword(input, output, expected):
    """Custom scorer: fraction of expected keywords found in the output.

    Args:
        input: The test-case input (unused; part of the scorer signature).
        output: The value produced by the task; searched case-insensitively.
        expected: Comma-separated keywords. Surrounding whitespace is ignored
            and blank entries (empty string, trailing commas) are dropped.

    Returns:
        A dict with the scorer ``name`` and a ``score`` between 0 and 1.
        The score is 0 when there are no non-blank keywords.
    """
    raw_keywords = "" if expected is None else str(expected)
    # "".split(",") yields [""], and "" is a substring of every string, so
    # blank entries must be filtered out or they count as automatic matches.
    keywords = [k.strip().lower() for k in raw_keywords.split(",") if k.strip()]
    haystack = "" if output is None else str(output).lower()
    matches = sum(1 for k in keywords if k in haystack)
    return {
        "name": "KeywordCoverage",
        "score": matches / len(keywords) if keywords else 0,
    }
# Plain functions can be passed in `scores` just like the built-in scorers.
Eval(
    "my-project",
    data=lambda: test_cases,
    task=lambda input: my_pipeline(input),
    scores=[exact_match, contains_keyword],
)
Datasets
Creating and Managing Datasets
from braintrust import init_dataset
# Create or open a dataset
dataset = init_dataset("my-project", "qa-pairs")

# Insert records
for question, answer in [
    ("What is Braintrust?", "An AI evaluation platform"),
    ("How do scorers work?", "They return 0-1 scores"),
]:
    dataset.insert(input=question, expected=answer)

# Flush to persist
dataset.flush()
Using Datasets in Evals
from autoevals import Factuality
from braintrust import Eval, init_dataset


def load_data():
    """Return the rows of the "qa-pairs" dataset as input/expected dicts."""
    dataset = init_dataset("my-project", "qa-pairs")
    return [{"input": r["input"], "expected": r["expected"]} for r in dataset]


# `data` receives the loader function itself; it is not called here.
Eval(
    "my-project",
    data=load_data,
    task=lambda input: my_pipeline(input),
    scores=[Factuality],
)
Logging and Tracing
Manual Logging
from braintrust import init_logger
logger = init_logger("my-project")

# Log a single event: what went in, what came out, the reference answer,
# plus any scores and free-form metadata to attach to it.
logger.log(
    input={"query": "What is AI?"},
    output="Artificial Intelligence is...",
    expected="A field of computer science...",
    scores={"relevance": 0.85},
    metadata={"model": "gpt-4o", "temperature": 0.7},
)
Tracing with Spans
from braintrust import traced, current_span
@traced
def rag_pipeline(query: str) -> str:
    """Answer ``query`` by retrieving documents and generating from them.

    Each step runs inside its own child span started from the current span.
    """
    # Retrieval step: log how many documents came back.
    with current_span().start_span(name="retrieve") as retrieve_span:
        docs = retrieve(query)
        retrieve_span.log(output={"doc_count": len(docs)})

    # Generation step: log the inputs, the generated result, and the model.
    with current_span().start_span(name="generate") as generate_span:
        result = generate(query, docs)
        generate_span.log(
            input={"query": query, "context_len": len(docs)},
            output=result,
            metadata={"model": "gpt-4o"},
        )

    return result
Braintrust Proxy
# Use the Braintrust proxy as a unified API gateway
# Supports OpenAI, Anthropic, Google, and more
# Point OpenAI-compatible clients at the proxy by overriding the base URL,
# and authenticate with the Braintrust API key instead of a provider key
export OPENAI_BASE_URL="https://api.braintrust.dev/v1/proxy"
export OPENAI_API_KEY="$BRAINTRUST_API_KEY"
# Calls through the proxy are automatically logged
# Example: a chat completion request sent directly to the proxy endpoint
curl https://api.braintrust.dev/v1/proxy/chat/completions \
-H "Authorization: Bearer $BRAINTRUST_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4o",
"messages": [{"role": "user", "content": "Hello"}]
}'
Online Scoring
# Score production traffic in real-time
from braintrust import init_logger
from autoevals import Factuality
logger = init_logger("production-app")
# Log and score in production
def handle_request(query, response, reference=None):
    """Log one production request, scoring factuality when a reference exists.

    Args:
        query: The user query that was served.
        response: The response that was returned for the query.
        reference: Optional reference answer. When it is falsy no scorer is
            run and the event is logged with an empty ``scores`` dict.
    """
    scores = {}
    if reference:
        # Factuality is a scorer class in the Python autoevals package:
        # instantiate it first, then call the instance to run the evaluation.
        # Passing the fields to the constructor would not produce a result.
        result = Factuality()(input=query, output=response, expected=reference)
        scores["factuality"] = result.score
    logger.log(
        input={"query": query},
        output=response,
        scores=scores,
        metadata={"env": "production"},
    )
CI Integration
# Run evals in CI and post results: invoke the Braintrust CLI on an eval file
npx braintrust eval my_eval.ts
# Use with GitHub Actions
# .github/workflows/eval.yml
# GitHub Actions workflow for Braintrust evals
name: AI Evals
# Run on every pull request
on: [pull_request]
jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      - run: npm install
      # Run every eval file under evals/; the keys come from repository secrets
      - run: npx braintrust eval evals/*.ts
        env:
          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Experiments
from braintrust import init_experiment
# Create a named experiment for tracking
experiment = init_experiment("my-project", experiment="v2-new-prompt")

# Run the pipeline on each test case and log the result with a quality score.
for case in test_cases:
    output = my_pipeline(case["input"])
    reference = case["expected"]
    experiment.log(
        input=case["input"],
        output=output,
        expected=reference,
        scores={"quality": score_output(output, reference)},
    )

# Print summary with comparison to baseline
summary = experiment.summarize()
print(summary)