تخطَّ إلى المحتوى

Braintrust

Braintrust منصة شاملة لتقييم تطبيقات الذكاء الاصطناعي واختبارها ومراقبتها. توفّر المنصة تجارب منظَّمة، ومُقيِّمات مدمجة ومخصّصة، وإدارة مجموعات البيانات، والتتبّع، ووكيلًا (proxy) للوصول إلى النماذج، والتكامل مع CI.

التثبيت

# Install Braintrust SDK
npm install braintrust autoevals
# or
pip install braintrust autoevals

# Set API key
export BRAINTRUST_API_KEY="your-api-key"

Running Evaluations

Basic Eval (TypeScript)

import { Eval } from "braintrust";
import { Factuality } from "autoevals";

Eval("my-project", {
  data: () => [
    { input: "What is 2+2?", expected: "4" },
    { input: "Capital of France?", expected: "Paris" },
  ],
  task: async (input) => {
    // Call your LLM or pipeline here
    const response = await callMyLLM(input);
    return response;
  },
  scores: [Factuality],
});

Basic Eval (Python)

from braintrust import Eval
from autoevals import Factuality

Eval(
    "my-project",
    data=lambda: [
        {"input": "What is 2+2?", "expected": "4"},
        {"input": "Capital of France?", "expected": "Paris"},
    ],
    task=lambda input: call_my_llm(input),
    scores=[Factuality],
)

Scorers

Built-in Scorers (autoevals)

from autoevals import (
    Factuality,      # LLM-graded factual correctness
    ClosedQA,        # Closed-book question answering
    Battle,          # Compare two outputs head-to-head
    Summary,         # Summarization quality
    Translation,     # Translation quality
    Humor,           # Humor evaluation
    Sql,             # SQL query correctness
)

# Use in an evaluation
Eval(
    "my-project",
    data=lambda: test_cases,
    task=lambda input: my_pipeline(input),
    scores=[Factuality, ClosedQA],
)

Custom Scorers

from braintrust import Eval

def exact_match(input, output, expected):
    """Score 1.0 when output and expected are equal after trimming whitespace, else 0.0.

    Only leading and trailing whitespace is ignored; the comparison is
    otherwise exact (case-sensitive). ``input`` is accepted but not used.

    Returns:
        A dict with "name" set to "ExactMatch" and a float "score".
    """
    is_same = output.strip() == expected.strip()
    if is_same:
        score = 1.0
    else:
        score = 0.0
    return {"name": "ExactMatch", "score": score}

def contains_keyword(input, output, expected):
    """Score the fraction of expected keywords that appear in the output.

    ``expected`` is a comma-separated list of keywords. Each keyword is
    trimmed and matched as a case-insensitive substring of ``output``.
    ``input`` is accepted but not used.

    Returns:
        A dict with "name" set to "KeywordCoverage" and a "score" between
        0 and 1. The score is 0 when ``expected`` contains no non-blank
        keywords.
    """
    # Drop blank entries. "".split(",") yields [""], and the empty string is
    # a substring of every output, so blanks (from an empty `expected` or a
    # stray comma) would otherwise be counted as matched keywords.
    keywords = [k.strip().lower() for k in expected.split(",") if k.strip()]
    if not keywords:
        return {"name": "KeywordCoverage", "score": 0}

    haystack = output.lower()  # lowercase once instead of once per keyword
    matches = sum(1 for k in keywords if k in haystack)
    return {
        "name": "KeywordCoverage",
        "score": matches / len(keywords),
    }

Eval(
    "my-project",
    data=lambda: test_cases,
    task=lambda input: my_pipeline(input),
    scores=[exact_match, contains_keyword],
)

Datasets

Creating and Managing Datasets

from braintrust import init_dataset

# Create or open a dataset
dataset = init_dataset("my-project", "qa-pairs")

# Insert records
dataset.insert(input="What is Braintrust?", expected="An AI evaluation platform")
dataset.insert(input="How do scorers work?", expected="They return 0-1 scores")

# Flush to persist
dataset.flush()

Using Datasets in Evals

from braintrust import Eval, init_dataset
from autoevals import Factuality  # required by scores=[Factuality] below

def load_data():
    """Load the "qa-pairs" dataset of "my-project" as a list of eval cases.

    Returns:
        A list of dicts, each holding only the "input" and "expected"
        fields of one dataset record.
    """
    dataset = init_dataset("my-project", "qa-pairs")
    return [{"input": r["input"], "expected": r["expected"]} for r in dataset]

# The loader is passed as a callable (not called here).
Eval(
    "my-project",
    data=load_data,
    task=lambda input: my_pipeline(input),
    scores=[Factuality],
)

Logging and Tracing

Manual Logging

from braintrust import init_logger

logger = init_logger("my-project")

# Log a single event
logger.log(
    input={"query": "What is AI?"},
    output="Artificial Intelligence is...",
    expected="A field of computer science...",
    scores={"relevance": 0.85},
    metadata={"model": "gpt-4o", "temperature": 0.7},
)

Tracing with Spans

from braintrust import traced, current_span

@traced
def rag_pipeline(query: str) -> str:
    """Answer ``query`` by retrieving documents and then generating from them.

    Each of the two steps runs inside its own child span started from the
    current span, and logs its result to that span.
    """
    # Step 1: retrieval. Log only how many documents came back.
    with current_span().start_span(name="retrieve") as retrieve_span:
        docs = retrieve(query)
        doc_count = len(docs)
        retrieve_span.log(output={"doc_count": doc_count})

    # Step 2: generation. Log the query, the document count, and the answer.
    with current_span().start_span(name="generate") as generate_span:
        answer = generate(query, docs)
        generation_input = {"query": query, "context_len": len(docs)}
        generation_metadata = {"model": "gpt-4o"}
        generate_span.log(
            input=generation_input,
            output=answer,
            metadata=generation_metadata,
        )

    return answer

Braintrust Proxy

# Use the Braintrust proxy as a unified API gateway
# Supports OpenAI, Anthropic, Google, and more

# Set the proxy base URL
export OPENAI_BASE_URL="https://api.braintrust.dev/v1/proxy"
export OPENAI_API_KEY="$BRAINTRUST_API_KEY"

# Calls through the proxy are automatically logged
curl https://api.braintrust.dev/v1/proxy/chat/completions \
  -H "Authorization: Bearer $BRAINTRUST_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4o",
    "messages": [{"role": "user", "content": "Hello"}]
  }'

Online Scoring

# Score production traffic in real-time
from braintrust import init_logger
from autoevals import Factuality

logger = init_logger("production-app")

# Log and score in production
def handle_request(query, response, reference=None):
    """Log one production request, scoring factuality when a reference exists.

    Args:
        query: The user query; logged under ``input["query"]``.
        response: The application's answer; logged as the output.
        reference: Optional reference answer. When truthy, the response is
            graded against it with ``Factuality`` and the result is logged
            under ``scores["factuality"]``. Otherwise no scores are logged.
    """
    scores = {}
    if reference:
        # Factuality is a scorer class: instantiate it first, then call the
        # instance with the values to grade. Passing them to the constructor
        # does not run the evaluation.
        result = Factuality()(output=response, expected=reference, input=query)
        scores["factuality"] = result.score

    logger.log(
        input={"query": query},
        output=response,
        scores=scores,
        metadata={"env": "production"},
    )

التكامل مع CI

# Run evals in CI and post results
npx braintrust eval my_eval.ts

# Use with GitHub Actions
# .github/workflows/eval.yml
# GitHub Actions workflow for Braintrust evals
name: AI Evals
on: [pull_request]
jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 20
      - run: npm install
      - run: npx braintrust eval evals/*.ts
        env:
          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

Experiments

from braintrust import init_experiment

# Create a named experiment for tracking
experiment = init_experiment("my-project", experiment="v2-new-prompt")

for case in test_cases:
    output = my_pipeline(case["input"])
    experiment.log(
        input=case["input"],
        output=output,
        expected=case["expected"],
        scores={"quality": score_output(output, case["expected"])},
    )

# Print summary with comparison to baseline
summary = experiment.summarize()
print(summary)