Arize AI

Arize AIは、本番モデルの監視とLLMアプリケーションのデバッグのためのMLオブザーバビリティを提供します。Phoenixはオープンソースのトレーシングおよび評価ツールであり、Arizeクラウドプラットフォームは本番監視、ドリフト検出、エンベディング分析を担います。

インストール

# Install Phoenix (open-source local observability: tracing and evaluation)
pip install arize-phoenix

# Install the Arize SDK for logging to the Arize cloud platform
pip install arize

# Install the OpenInference instrumentors imported by the tracing examples
# (one package per framework: OpenAI, LangChain, LlamaIndex)
pip install openinference-instrumentation-openai
pip install openinference-instrumentation-langchain
pip install openinference-instrumentation-llama-index

Phoenix: Local LLM Tracing

Launch Phoenix UI

# Start the Phoenix server locally from the command line
python -m phoenix.server.main serve

# Or launch from Python: start the app from this process and print the UI address
import phoenix as px

session = px.launch_app()
ui_url = session.url  # http://localhost:6006
print(ui_url)

Tracing OpenAI Calls

import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "my-llm-app" project, then hand it to
# the OpenAI instrumentor so OpenAI calls are recorded under that project.
otel_provider = register(project_name="my-llm-app")
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=otel_provider)

# OpenAI calls made after instrumentation are traced
from openai import OpenAI

client = OpenAI()
chat_messages = [{"role": "user", "content": "Explain observability"}]
response = client.chat.completions.create(model="gpt-4o", messages=chat_messages)

Tracing LangChain

from openinference.instrumentation.langchain import LangChainInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "langchain-app" project and attach the
# LangChain instrumentor to it.
otel_provider = register(project_name="langchain-app")
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument(tracer_provider=otel_provider)

# LangChain calls made after instrumentation are traced in Phoenix
from langchain_openai import ChatOpenAI

question = "What is Phoenix?"
llm = ChatOpenAI(model="gpt-4o")
result = llm.invoke(question)

Tracing LlamaIndex

from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "llamaindex-app" project and attach the
# LlamaIndex instrumentor to it.
otel_provider = register(project_name="llamaindex-app")
llama_instrumentor = LlamaIndexInstrumentor()
llama_instrumentor.instrument(tracer_provider=otel_provider)

# LlamaIndex queries made after instrumentation are traced
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# Load every document under ./data, index it, and ask one question
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
question = "What is in this document?"
response = query_engine.query(question)

Phoenix Evaluations

LLM-as-Judge Evaluations

import pandas as pd
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# Set up the evaluation model (the LLM that acts as the judge)
eval_model = OpenAIModel(model="gpt-4o")

# Evaluate QA correctness.
# `template` takes the prompt template; the *_RAILS_MAP constant only holds
# the allowed output labels, so its values are what gets passed as `rails`.
qa_examples = pd.DataFrame({
    "input": ["What is Python?"],
    "output": ["Python is a programming language."],
    "reference": ["Python is a high-level programming language."],
})
qa_results = llm_classify(
    dataframe=qa_examples,
    model=eval_model,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAILS_MAP.values()),  # ["correct", "incorrect"]
)

# Evaluate hallucination.
# `traces_df` is a placeholder that is not defined in this snippet: supply a
# DataFrame with the columns the hallucination template reads
# (presumably "input", "reference" and "output" - confirm against the template).
hallucination_results = llm_classify(
    dataframe=traces_df,
    model=eval_model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),  # ["hallucinated", "factual"]
)

Custom Evaluators

from phoenix.evals import llm_classify

# Prompt for a custom judge. {input} and {output} are template placeholders
# (presumably filled from same-named columns of the data - confirm).
custom_template = """
Given the following query and response, rate the helpfulness.
Query: {input}
Response: {output}
Is this response helpful? Respond with "helpful" or "unhelpful".
"""

# The two labels the prompt asks the judge to answer with
helpfulness_rails = ["helpful", "unhelpful"]

# `my_data` and `eval_model` are not defined in this snippet and must already exist
results = llm_classify(
    dataframe=my_data,
    model=eval_model,
    template=custom_template,
    rails=helpfulness_rails,
)

Arize Cloud: Production Monitoring

Logging Model Data

from arize.api import Client
from arize.utils.types import Environments, ModelTypes

client = Client(space_key="YOUR_SPACE_KEY", api_key="YOUR_API_KEY")

# Log one prediction together with its ground-truth value.
# ModelTypes has no SCORE member; NUMERIC is the type for float-valued labels.
response = client.log(
    model_id="my-model-v1",
    model_version="1.0",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="pred-001",
    prediction_label=0.85,
    actual_label=1.0,
    features={"feature_a": 1.2, "feature_b": "category_1"},
)

# client.log is asynchronous and returns a future; resolve it to confirm
# that the record was actually accepted.
result = response.result()
if result.status_code != 200:
    print(f"Logging failed with status {result.status_code}: {result.text}")

Logging Embeddings

import numpy as np
from arize.utils.types import Embedding

# Log an embedding vector alongside a prediction for drift analysis.
# `client`, `ModelTypes` and `Environments` come from the
# "Logging Model Data" example.
# Embeddings go in `embedding_features` (not `features`), and the source text
# is passed to Embedding as `data`.
response = client.log(
    model_id="embedding-model",
    model_type=ModelTypes.NUMERIC,  # ModelTypes has no SCORE member
    environment=Environments.PRODUCTION,
    prediction_id="pred-002",
    prediction_label=0.9,
    embedding_features={
        "text_embedding": Embedding(
            vector=np.random.randn(384).tolist(),  # random stand-in for a real embedding
            data="The original text that was embedded",
        )
    },
)

Drift Detection

Monitoring Data Drift

# Arize automatically detects drift when you log training and production data.
# `client`, `ModelTypes` and `Environments` come from the
# "Logging Model Data" example.

# Log training (baseline) data.
# NOTE(review): training records are expected to carry the ground truth as
# well as the prediction - confirm against the installed SDK version.
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,  # ModelTypes has no SCORE member
    environment=Environments.TRAINING,
    prediction_id="train-001",
    prediction_label=0.75,
    actual_label=0.70,
    features={"income": 50000, "age": 35},
)

# Log production data - Arize compares distributions
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="prod-001",
    prediction_label=0.82,
    features={"income": 120000, "age": 28},
)
# Drift alerts trigger when PSI/KL-divergence exceeds thresholds

Prompt Evaluation and Tracking

from arize.utils.types import LLMConfigColumnNames

# Log LLM prompts and responses for evaluation.
# `client`, `ModelTypes` and `Environments` come from the
# "Logging Model Data" example.
# For single-record logging the model name and generation parameters are
# passed as `llm_model_name` and `llm_params`; Client.log has no `llm_config`
# argument.
client.log(
    model_id="my-llm-app",
    model_type=ModelTypes.GENERATIVE_LLM,
    environment=Environments.PRODUCTION,
    prediction_id="llm-001",
    prompt="Summarize this article about AI safety",
    response="AI safety research focuses on...",
    prompt_template="Summarize this article about {topic}",
    prompt_template_version="v2",
    llm_model_name="gpt-4o",
    llm_params={"temperature": 0.7},
)

Phoenix Datasets

import pandas as pd
import phoenix as px
from phoenix.experiments import run_experiment

# Client for talking to the Phoenix server
client = px.Client()

# Upload a small evaluation dataset built from a DataFrame.
# `input_keys` / `output_keys` name the columns used as example inputs and
# expected outputs.
dataset = client.upload_dataset(
    dataset_name="eval-set",
    dataframe=pd.DataFrame({
        "input": ["q1", "q2", "q3"],
        "expected_output": ["a1", "a2", "a3"],
    }),
    input_keys=["input"],
    output_keys=["expected_output"],
)

# Run evaluations on the dataset.
# run_experiment is a function in phoenix.experiments, not a method on
# px.Client. `my_task_function` and `my_evaluator` are placeholders that must
# be defined by the user.
task_results = run_experiment(
    dataset,
    task=my_task_function,
    evaluators=[my_evaluator],
    experiment_name="baseline-eval",
)