Commandes Arize
Arize AI fournit une plateforme d’observabilité ML pour surveiller les modèles en production et déboguer les problèmes liés aux embeddings, aux LLM et aux pipelines ML traditionnels. Elle offre la détection de dérive, l’analyse des performances et le débogage de traces.
Installation
# Install Phoenix (open-source local observability)
pip install arize-phoenix
# Install Arize SDK for cloud platform
pip install arize
# Install OpenTelemetry instrumentors
pip install openinference-instrumentation-openai
pip install openinference-instrumentation-langchain
pip install openinference-instrumentation-llama-index
Phoenix : traçage local de LLM
Lancer l’UI de Phoenix
# Start Phoenix server locally
# (this first line is a shell command: run it in a terminal, not in a Python session)
python -m phoenix.server.main serve
# Or launch from Python
import phoenix as px
session = px.launch_app()
# The returned session exposes the address of the UI it started
print(session.url) # http://localhost:6006
Traçage d’appels OpenAI
import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "my-llm-app" project
tracer_provider = register(project_name="my-llm-app")

# Hook the OpenAI SDK up to that tracer provider
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

# Every OpenAI call made after this point is traced
from openai import OpenAI

client = OpenAI()
user_message = {"role": "user", "content": "Explain observability"}
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[user_message],
)
Traçage de LangChain
from openinference.instrumentation.langchain import LangChainInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "langchain-app" project
tracer_provider = register(project_name="langchain-app")

# Attach the LangChain instrumentor to that tracer provider
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument(tracer_provider=tracer_provider)

# LangChain calls made after this point show up as traces in Phoenix
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")
question = "What is Phoenix?"
result = llm.invoke(question)
Traçage de LlamaIndex
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

# Connect to Phoenix under the "llamaindex-app" project
tracer_provider = register(project_name="llamaindex-app")

# Attach the LlamaIndex instrumentor to that tracer provider
llama_instrumentor = LlamaIndexInstrumentor()
llama_instrumentor.instrument(tracer_provider=tracer_provider)

# LlamaIndex queries issued after this point are traced
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load everything under ./data, index it, then ask a question against the index
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
question = "What is in this document?"
response = query_engine.query(question)
Évaluations Phoenix
Évaluations LLM en tant que juge
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAIL_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAIL_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)
import pandas as pd

# Set up the evaluation model (the LLM that acts as the judge)
eval_model = OpenAIModel(model="gpt-4o")

# Evaluate QA correctness.
# `template` must be the *_PROMPT_TEMPLATE object; the matching *_RAIL_MAP
# only maps outcomes to the allowed output labels, so it supplies `rails`.
qa_results = llm_classify(
    dataframe=pd.DataFrame({
        "input": ["What is Python?"],
        "output": ["Python is a programming language."],
        "reference": ["Python is a high-level programming language."],
    }),
    model=eval_model,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAIL_MAP.values()),  # "correct" / "incorrect"
)

# Evaluate hallucination.
# NOTE: `traces_df` is not defined in this snippet; it is assumed to be a
# DataFrame you built beforehand with the same input/output/reference columns.
hallucination_results = llm_classify(
    dataframe=traces_df,
    model=eval_model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAIL_MAP.values()),  # "hallucinated" / "factual"
)
Évaluateurs personnalisés
from phoenix.evals import llm_classify

# Prompt for the judge; {input} and {output} are placeholders
# (presumably filled from the same-named dataframe columns — confirm).
custom_template = """
Given the following query and response, rate the helpfulness.
Query: {input}
Response: {output}
Is this response helpful? Respond with "helpful" or "unhelpful".
"""

# The only labels the judge is allowed to answer with
helpfulness_rails = ["helpful", "unhelpful"]

# NOTE: `my_data` and `eval_model` are not defined in this snippet and must
# already exist when it runs.
results = llm_classify(
    dataframe=my_data,
    model=eval_model,
    template=custom_template,
    rails=helpfulness_rails,
)
Arize Cloud : surveillance en production
Journalisation des données de modèle
from arize.api import Client
from arize.utils.types import ModelTypes, Environments

# Authenticate against your Arize space
client = Client(space_key="YOUR_SPACE_KEY", api_key="YOUR_API_KEY")

# Log a single prediction together with its ground truth.
# ModelTypes.NUMERIC is used because both labels are plain floats
# (the ModelTypes enum has no `SCORE` member).
response = client.log(
    model_id="my-model-v1",
    model_version="1.0",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="pred-001",
    prediction_label=0.85,
    actual_label=1.0,
    features={"feature_a": 1.2, "feature_b": "category_1"},
)
Journalisation d’embeddings
from arize.utils.types import Embedding
import numpy as np

# Log embedding vectors for drift analysis.
# NOTE: `client`, `ModelTypes` and `Environments` are not defined in this
# snippet; they must already be in scope when it runs.
# Embeddings go in `embedding_features` (not `features`), and the source text
# is passed through the Embedding's `data` field.
response = client.log(
    model_id="embedding-model",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="pred-002",
    prediction_label=0.9,
    embedding_features={
        "text_embedding": Embedding(
            vector=np.random.randn(384).tolist(),  # random 384-dim placeholder vector
            data="The original text that was embedded",
        )
    },
)
Détection de dérive
Surveillance de la dérive des données
# Arize automatically detects drift when you log training and production data.
# NOTE: `client`, `ModelTypes` and `Environments` are not defined in this
# snippet; they must already be in scope when it runs.

# Log training (baseline) data.
# Training records carry both a prediction and an actual label.
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.TRAINING,
    prediction_id="train-001",
    prediction_label=0.75,
    actual_label=0.80,
    features={"income": 50000, "age": 35},
)

# Log production data - Arize compares distributions against the baseline
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="prod-001",
    prediction_label=0.82,
    features={"income": 120000, "age": 28},
)
# Drift alerts trigger when PSI/KL-divergence exceeds thresholds
Évaluation et suivi des prompts
# Log LLM prompts and responses for evaluation.
# NOTE: `client`, `ModelTypes` and `Environments` are not defined in this
# snippet; they must already be in scope when it runs.
# The single-record `log` call takes the model name and its invocation
# parameters as `llm_model_name` / `llm_params`; LLMConfigColumnNames is a
# column-mapping helper for the DataFrame-based logger and is not used here.
client.log(
    model_id="my-llm-app",
    model_type=ModelTypes.GENERATIVE_LLM,
    environment=Environments.PRODUCTION,
    prediction_id="llm-001",
    prompt="Summarize this article about AI safety",
    response="AI safety research focuses on...",
    prompt_template="Summarize this article about {topic}",
    prompt_template_version="v2",
    llm_model_name="gpt-4o",
    llm_params={"temperature": 0.7},
)
Jeux de données Phoenix
import pandas as pd
import phoenix as px
from phoenix.experiments import run_experiment

# Client for the running Phoenix instance
client = px.Client()

# Upload a dataset built from a DataFrame; `input_keys` / `output_keys`
# name the columns holding the inputs and the expected outputs.
dataset = client.upload_dataset(
    dataset_name="eval-set",
    dataframe=pd.DataFrame({
        "input": ["q1", "q2", "q3"],
        "expected_output": ["a1", "a2", "a3"],
    }),
    input_keys=["input"],
    output_keys=["expected_output"],
)

# Run evaluations on the dataset.
# `run_experiment` is a function in `phoenix.experiments`, not a method of
# `px.Client`.
# NOTE: `my_task_function` and `my_evaluator` are not defined in this snippet;
# supply your own task callable and evaluator(s).
task_results = run_experiment(
    dataset,
    my_task_function,
    evaluators=[my_evaluator],
    experiment_name="baseline-eval",
)