Arize AI
توفّر Arize AI إمكانات قابلية الرصد (observability) لتعلّم الآلة، لمراقبة النماذج في بيئة الإنتاج وتصحيح أخطاء تطبيقات LLM. Phoenix هي أداتها مفتوحة المصدر للتتبع والتقييم، بينما تتولى منصة Arize السحابية مراقبة الإنتاج واكتشاف الانحراف وتحليل التضمينات.
التثبيت
# Install Phoenix (open-source local observability)
pip install arize-phoenix
# Install Arize SDK for cloud platform
pip install arize
# Install OpenTelemetry instrumentors
pip install openinference-instrumentation-openai
pip install openinference-instrumentation-langchain
pip install openinference-instrumentation-llama-index
Phoenix: Local LLM Tracing
Launch Phoenix UI
# Start Phoenix server locally
python -m phoenix.server.main serve
# Or launch from Python
import phoenix as px

# Start the Phoenix app from this process and print the address of its UI.
phoenix_session = px.launch_app()
print(phoenix_session.url)  # http://localhost:6006
Tracing OpenAI Calls
import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "my-llm-app" project in Phoenix.
tracer_provider = register(project_name="my-llm-app")

# Attach the OpenAI instrumentor to that tracer provider.
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

# From this point on, OpenAI calls are traced.
from openai import OpenAI

client = OpenAI()
chat_messages = [{"role": "user", "content": "Explain observability"}]
response = client.chat.completions.create(
    model="gpt-4o",
    messages=chat_messages,
)
Tracing LangChain
from openinference.instrumentation.langchain import LangChainInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "langchain-app" project and hook the
# LangChain instrumentor up to it.
tracer_provider = register(project_name="langchain-app")
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument(tracer_provider=tracer_provider)

# LangChain calls made after this point are traced in Phoenix.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")
question = "What is Phoenix?"
result = llm.invoke(question)
Tracing LlamaIndex
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

# Register a tracer provider for the "llamaindex-app" project and hook the
# LlamaIndex instrumentor up to it.
tracer_provider = register(project_name="llamaindex-app")
llama_index_instrumentor = LlamaIndexInstrumentor()
llama_index_instrumentor.instrument(tracer_provider=tracer_provider)

# LlamaIndex queries issued after this point are traced.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load everything under the local "data" directory and index it.
reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

# Ask a question against the index.
query_engine = index.as_query_engine()
response = query_engine.query("What is in this document?")
Phoenix Evaluations
LLM-as-Judge Evaluations
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAIL_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAIL_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)
import pandas as pd

# Set up the evaluation model (the LLM that acts as the judge).
eval_model = OpenAIModel(model="gpt-4o")

# Evaluate QA correctness.
# `template` must be the prompt template; the *_RAIL_MAP objects only hold
# the allowed output labels, so they are used to build `rails` instead.
qa_results = llm_classify(
    dataframe=pd.DataFrame({
        "input": ["What is Python?"],
        "output": ["Python is a programming language."],
        "reference": ["Python is a high-level programming language."],
    }),
    model=eval_model,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAIL_MAP.values()),  # "correct" / "incorrect"
)

# Evaluate hallucination.
# NOTE: `traces_df` is a placeholder that is not defined in this snippet.
# It presumably needs the same `input` / `output` / `reference` columns as
# the DataFrame above -- confirm against the hallucination template.
hallucination_results = llm_classify(
    dataframe=traces_df,
    model=eval_model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAIL_MAP.values()),  # "hallucinated" / "factual"
)
Custom Evaluators
from phoenix.evals import llm_classify

# Prompt handed to the judge model; {input} and {output} are the template's
# placeholders.
custom_template = """
Given the following query and response, rate the helpfulness.
Query: {input}
Response: {output}
Is this response helpful? Respond with "helpful" or "unhelpful".
"""

# The two labels the prompt asks the judge to choose between.
helpfulness_rails = ["helpful", "unhelpful"]

# NOTE: `my_data` and `eval_model` are not defined in this snippet and must
# already exist; presumably `my_data` is a DataFrame with `input` and
# `output` columns -- confirm.
results = llm_classify(
    dataframe=my_data,
    model=eval_model,
    template=custom_template,
    rails=helpfulness_rails,
)
Arize Cloud: مراقبة الإنتاج
Logging Model Data
from arize.api import Client
from arize.utils.types import ModelTypes, Environments

client = Client(space_key="YOUR_SPACE_KEY", api_key="YOUR_API_KEY")

# Log prediction data
# NOTE(review): the Arize SDK's `ModelTypes` enum does not appear to define
# a `SCORE` member; float prediction/actual labels are logged here with the
# regression model type. Confirm against the installed SDK version.
response = client.log(
    model_id="my-model-v1",
    model_version="1.0",
    model_type=ModelTypes.REGRESSION,
    environment=Environments.PRODUCTION,
    prediction_id="pred-001",
    prediction_label=0.85,
    actual_label=1.0,
    features={"feature_a": 1.2, "feature_b": "category_1"},
)
Logging Embeddings
from arize.utils.types import Embedding
import numpy as np

# Log embedding vectors for drift analysis.
# Relies on `client`, `ModelTypes` and `Environments` from the
# "Logging Model Data" example.
# NOTE(review): embeddings are passed through `embedding_features` (not
# `features`), and the text that was embedded goes in the `data` field of
# `Embedding`. `ModelTypes` does not appear to define `SCORE`, so the float
# label is logged with the regression model type. Confirm against the
# installed SDK version.
response = client.log(
    model_id="embedding-model",
    model_type=ModelTypes.REGRESSION,
    environment=Environments.PRODUCTION,
    prediction_id="pred-002",
    prediction_label=0.9,
    embedding_features={
        "text_embedding": Embedding(
            # Random 384-dimensional vector used as stand-in data.
            vector=np.random.randn(384).tolist(),
            data="The original text that was embedded",
        )
    },
)
Drift Detection
مراقبة انحراف البيانات
# Arize automatically detects drift when you log training and production data.
# Relies on `client`, `ModelTypes` and `Environments` from the
# "Logging Model Data" example.
# NOTE(review): `ModelTypes` does not appear to define `SCORE`; the float
# labels are logged with the regression model type. Confirm against the
# installed SDK version.

# Log training (baseline) data
client.log(
    model_id="my-model",
    model_type=ModelTypes.REGRESSION,
    environment=Environments.TRAINING,
    prediction_id="train-001",
    prediction_label=0.75,
    features={"income": 50000, "age": 35},
)

# Log production data - Arize compares distributions
client.log(
    model_id="my-model",
    model_type=ModelTypes.REGRESSION,
    environment=Environments.PRODUCTION,
    prediction_id="prod-001",
    prediction_label=0.82,
    features={"income": 120000, "age": 28},
)
# Drift alerts trigger when PSI/KL-divergence exceeds thresholds
Prompt Evaluation and Tracking
# Log LLM prompts and responses for evaluation.
# Relies on `client`, `ModelTypes` and `Environments` from the
# "Logging Model Data" example.
# NOTE(review): the single-record `Client.log` takes the model name and the
# invocation parameters as `llm_model_name` and `llm_params`; it has no
# `llm_config` argument. Confirm against the installed SDK version.
client.log(
    model_id="my-llm-app",
    model_type=ModelTypes.GENERATIVE_LLM,
    environment=Environments.PRODUCTION,
    prediction_id="llm-001",
    prompt="Summarize this article about AI safety",
    response="AI safety research focuses on...",
    prompt_template="Summarize this article about {topic}",
    prompt_template_version="v2",
    llm_model_name="gpt-4o",
    llm_params={"temperature": 0.7},
)
Phoenix Datasets
import pandas as pd
import phoenix as px
from phoenix.experiments import run_experiment

# Client for the Phoenix server. Named `px_client` so it does not shadow the
# Arize `client` used in the earlier examples.
px_client = px.Client()

# Upload a dataset built from a DataFrame: `input_keys` / `output_keys`
# name the columns holding the example inputs and the expected outputs.
dataset = px_client.upload_dataset(
    dataset_name="eval-set",
    dataframe=pd.DataFrame({
        "input": ["q1", "q2", "q3"],
        "expected_output": ["a1", "a2", "a3"],
    }),
    input_keys=["input"],
    output_keys=["expected_output"],
)

# Run evaluations on the dataset.
# `run_experiment` is a module-level function in `phoenix.experiments`, not
# a method of the client. `my_task_function` and `my_evaluator` are
# placeholders that are not defined in this snippet.
task_results = run_experiment(
    dataset,
    task=my_task_function,
    evaluators=[my_evaluator],
    experiment_name="baseline-eval",
)