Arize AI
Arize AI는 프로덕션 모델 모니터링과 LLM 애플리케이션 디버깅을 위한 ML 관측성을 제공합니다. Phoenix는 오픈소스 트레이싱 및 평가 도구이며, Arize 클라우드 플랫폼은 프로덕션 모니터링, 드리프트 감지 및 임베딩 분석을 처리합니다.
설치
# Phoenix: open-source observability that runs locally
pip install arize-phoenix
# Arize SDK: client for the cloud platform
pip install arize
# OpenTelemetry instrumentors, one package per framework
pip install \
    openinference-instrumentation-openai \
    openinference-instrumentation-langchain \
    openinference-instrumentation-llama-index
Phoenix: Local LLM Tracing
Launch Phoenix UI
# Start the Phoenix server locally by running the phoenix.server.main
# module with its "serve" command (requires the arize-phoenix package)
python -m phoenix.server.main serve
# Alternative: start Phoenix from inside a Python process
import phoenix as px

# launch_app() returns a session object that exposes the UI address
session = px.launch_app()
ui_url = session.url
print(ui_url) # http://localhost:6006
Tracing OpenAI Calls
import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "my-llm-app" project
tracer_provider = register(project_name="my-llm-app")

# Instrument the OpenAI SDK with that tracer provider
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

# Every OpenAI call made from here on is traced
from openai import OpenAI

client = OpenAI()
chat_messages = [{"role": "user", "content": "Explain observability"}]
response = client.chat.completions.create(
    model="gpt-4o",
    messages=chat_messages,
)
Tracing LangChain
from openinference.instrumentation.langchain import LangChainInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "langchain-app" project, then instrument LangChain
tracer_provider = register(project_name="langchain-app")
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument(tracer_provider=tracer_provider)

# LangChain calls made from here on are traced in Phoenix
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")
question = "What is Phoenix?"
result = llm.invoke(question)
Tracing LlamaIndex
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "llamaindex-app" project, then instrument LlamaIndex
tracer_provider = register(project_name="llamaindex-app")
llama_instrumentor = LlamaIndexInstrumentor()
llama_instrumentor.instrument(tracer_provider=tracer_provider)

# LlamaIndex queries made from here on are traced
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load everything under ./data, index it, and ask one question
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
question = "What is in this document?"
response = query_engine.query(question)
Phoenix Evaluations
LLM-as-Judge Evaluations
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAIL_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAIL_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)
import pandas as pd

# Set up the evaluation model (the LLM that acts as judge)
eval_model = OpenAIModel(model="gpt-4o")

# Evaluate QA correctness.
# `template` must be the prompt template; the *_RAIL_MAP constants only map
# outcomes to the allowed output labels, so they are used to build `rails`.
qa_df = pd.DataFrame({
    "input": ["What is Python?"],
    "output": ["Python is a programming language."],
    "reference": ["Python is a high-level programming language."],
})
qa_results = llm_classify(
    dataframe=qa_df,
    model=eval_model,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAIL_MAP.values()),  # "correct" / "incorrect"
)

# Evaluate hallucination.
# NOTE: traces_df is not defined in this snippet; supply your own DataFrame.
# It presumably needs the same input/output/reference columns as above —
# TODO confirm against the template's variables.
hallucination_results = llm_classify(
    dataframe=traces_df,
    model=eval_model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAIL_MAP.values()),  # "hallucinated" / "factual"
)
Custom Evaluators
from phoenix.evals import llm_classify

# Prompt for a custom helpfulness judge.
# The {input} and {output} placeholders are presumably filled from the
# same-named columns of the dataframe — confirm against llm_classify docs.
custom_template = """
Given the following query and response, rate the helpfulness.
Query: {input}
Response: {output}
Is this response helpful? Respond with "helpful" or "unhelpful".
"""

# The labels the judge is allowed to answer with
helpfulness_rails = ["helpful", "unhelpful"]

# my_data and eval_model are not defined in this snippet; provide them first
results = llm_classify(
    dataframe=my_data,
    model=eval_model,
    template=custom_template,
    rails=helpfulness_rails,
)
Arize Cloud: Production Monitoring
Logging Model Data
from arize.api import Client
from arize.utils.types import ModelTypes, Environments

# Replace the placeholders with the credentials of your Arize space
# NOTE(review): confirm that `space_key` and ModelTypes.SCORE are accepted by
# the installed arize SDK version.
client = Client(space_key="YOUR_SPACE_KEY", api_key="YOUR_API_KEY")

# Log a single prediction together with its ground truth and features
record_features = {"feature_a": 1.2, "feature_b": "category_1"}
response = client.log(
    model_id="my-model-v1",
    model_version="1.0",
    model_type=ModelTypes.SCORE,
    environment=Environments.PRODUCTION,
    prediction_id="pred-001",
    prediction_label=0.85,
    actual_label=1.0,
    features=record_features,
)
Logging Embeddings
from arize.utils.types import Embedding
import numpy as np

# Log embedding vectors for drift analysis.
# Embeddings are passed through `embedding_features` (not `features`), and the
# Embedding field that carries the source text is named `data`.
# `client`, ModelTypes and Environments are not defined in this snippet;
# create/import them first.
response = client.log(
    model_id="embedding-model",
    model_type=ModelTypes.SCORE,
    environment=Environments.PRODUCTION,
    prediction_id="pred-002",
    prediction_label=0.9,
    embedding_features={
        "text_embedding": Embedding(
            vector=np.random.randn(384).tolist(),  # random stand-in for a real embedding
            data="The original text that was embedded",
        )
    },
)
Drift Detection
Monitoring Data Drift
# Arize detects drift automatically once both training and production data are logged.
# `client`, ModelTypes and Environments are not defined in this snippet;
# create/import them first.

# Fields shared by the baseline and the production record
shared_fields = {"model_id": "my-model", "model_type": ModelTypes.SCORE}

# Log training (baseline) data
client.log(
    environment=Environments.TRAINING,
    prediction_id="train-001",
    prediction_label=0.75,
    features={"income": 50000, "age": 35},
    **shared_fields,
)

# Log production data - Arize compares distributions
client.log(
    environment=Environments.PRODUCTION,
    prediction_id="prod-001",
    prediction_label=0.82,
    features={"income": 120000, "age": 28},
    **shared_fields,
)
# Drift alerts trigger when PSI/KL-divergence exceeds thresholds
Prompt Evaluation and Tracking
# Log LLM prompts and responses for evaluation.
# The model name and generation parameters are passed through the dedicated
# `llm_model_name` / `llm_params` arguments of Client.log.
# NOTE(review): verify these argument names against the installed arize SDK version.
# `client`, ModelTypes and Environments are not defined in this snippet;
# create/import them first.
client.log(
    model_id="my-llm-app",
    model_type=ModelTypes.GENERATIVE_LLM,
    environment=Environments.PRODUCTION,
    prediction_id="llm-001",
    prompt="Summarize this article about AI safety",
    response="AI safety research focuses on...",
    prompt_template="Summarize this article about {topic}",
    prompt_template_version="v2",
    llm_model_name="gpt-4o",
    llm_params={"temperature": 0.7},
)
Phoenix Datasets
import pandas as pd
import phoenix as px
from phoenix.experiments import run_experiment

# Create a Phoenix client
client = px.Client()

# Upload a dataset built from an in-memory DataFrame;
# input_keys / output_keys name the columns used as inputs and expected outputs
dataset = client.upload_dataset(
    dataset_name="eval-set",
    dataframe=pd.DataFrame({
        "input": ["q1", "q2", "q3"],
        "expected_output": ["a1", "a2", "a3"],
    }),
    input_keys=["input"],
    output_keys=["expected_output"],
)

# Run evaluations on the dataset.
# run_experiment is a function in phoenix.experiments, not a method of px.Client.
# my_task_function and my_evaluator are placeholders you must define.
task_results = run_experiment(
    dataset=dataset,
    task=my_task_function,
    evaluators=[my_evaluator],
    experiment_name="baseline-eval",
)