Arize AI

توفّر Arize AI قابلية الرصد لتعلّم الآلة (ML observability) لمراقبة النماذج في بيئة الإنتاج وتصحيح أخطاء تطبيقات LLM. وتُعدّ Phoenix أداتها مفتوحة المصدر للتتبّع والتقييم، بينما تتولّى منصة Arize السحابية مراقبة الإنتاج واكتشاف الانحراف وتحليل التضمينات.

التثبيت

# Phoenix: open-source, local observability
pip install arize-phoenix

# Arize SDK: client for the cloud platform
pip install arize

# OpenInference instrumentors for OpenAI, LangChain and LlamaIndex
pip install \
    openinference-instrumentation-openai \
    openinference-instrumentation-langchain \
    openinference-instrumentation-llama-index

Phoenix: Local LLM Tracing

Launch Phoenix UI

# Start the Phoenix server locally from the command line
python -m phoenix.server.main serve

# Alternatively, start Phoenix from inside a Python process
import phoenix as px

phoenix_session = px.launch_app()
print(phoenix_session.url)  # http://localhost:6006

Tracing OpenAI Calls

import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Register a tracer provider connected to Phoenix for this project
tracer_provider = register(project_name="my-llm-app")

# Instrument the OpenAI SDK with that provider
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

# From here on, every OpenAI call is traced
from openai import OpenAI

client = OpenAI()
messages = [{"role": "user", "content": "Explain observability"}]
response = client.chat.completions.create(model="gpt-4o", messages=messages)

Tracing LangChain

from openinference.instrumentation.langchain import LangChainInstrumentor
from phoenix.otel import register

# Register the "langchain-app" project and instrument LangChain with it
tracer_provider = register(project_name="langchain-app")
langchain_instrumentor = LangChainInstrumentor()
langchain_instrumentor.instrument(tracer_provider=tracer_provider)

# LangChain calls made after instrumentation are traced in Phoenix
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")
question = "What is Phoenix?"
result = llm.invoke(question)

Tracing LlamaIndex

from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from phoenix.otel import register

# Register the "llamaindex-app" project and instrument LlamaIndex with it
tracer_provider = register(project_name="llamaindex-app")
llama_index_instrumentor = LlamaIndexInstrumentor()
llama_index_instrumentor.instrument(tracer_provider=tracer_provider)

# Queries issued after instrumentation are traced
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

reader = SimpleDirectoryReader("data")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
question = "What is in this document?"
response = query_engine.query(question)

Phoenix Evaluations

LLM-as-Judge Evaluations

import pandas as pd
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# The LLM that acts as the judge
eval_model = OpenAIModel(model="gpt-4o")

# Evaluate QA correctness.
# `template` must be the prompt template; the *_RAILS_MAP constant only
# supplies the allowed output labels ("correct" / "incorrect").
qa_results = llm_classify(
    dataframe=pd.DataFrame({
        "input": ["What is Python?"],
        "output": ["Python is a programming language."],
        "reference": ["Python is a high-level programming language."],
    }),
    model=eval_model,
    template=QA_PROMPT_TEMPLATE,
    rails=list(QA_PROMPT_RAILS_MAP.values()),
)

# Evaluate hallucination.
# `traces_df` is a placeholder for your own DataFrame and is not defined here.
# NOTE(review): it presumably needs the same "input", "output" and "reference"
# columns as the QA example above - confirm against the template.
hallucination_results = llm_classify(
    dataframe=traces_df,
    model=eval_model,
    template=HALLUCINATION_PROMPT_TEMPLATE,
    rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
)

Custom Evaluators

from phoenix.evals import llm_classify

# Prompt for a custom helpfulness judge, with {input} and {output} placeholders
custom_template = """
Given the following query and response, rate the helpfulness.
Query: {input}
Response: {output}
Is this response helpful? Respond with "helpful" or "unhelpful".
"""

# The two labels the judge is allowed to answer with
helpfulness_rails = ["helpful", "unhelpful"]

# `my_data` and `eval_model` are not defined in this snippet; they must
# already exist when it runs.
results = llm_classify(
    template=custom_template,
    rails=helpfulness_rails,
    model=eval_model,
    dataframe=my_data,
)

Arize Cloud: مراقبة الإنتاج

Logging Model Data

from arize.api import Client
from arize.utils.types import ModelTypes, Environments

client = Client(space_key="YOUR_SPACE_KEY", api_key="YOUR_API_KEY")

# Log one prediction together with its ground-truth value.
# ModelTypes has no SCORE member; NUMERIC is the model type for float
# predictions and actuals.
response = client.log(
    model_id="my-model-v1",
    model_version="1.0",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="pred-001",
    prediction_label=0.85,
    actual_label=1.0,
    features={"feature_a": 1.2, "feature_b": "category_1"},
)

Logging Embeddings

from arize.utils.types import Embedding
import numpy as np

# Log an embedding vector for drift analysis.
# Embeddings are passed through `embedding_features` (not `features`), and the
# text the vector was computed from goes in the Embedding's `data` field.
# `client`, `ModelTypes` and `Environments` come from the logging setup snippet.
response = client.log(
    model_id="embedding-model",
    model_type=ModelTypes.NUMERIC,  # ModelTypes has no SCORE member
    environment=Environments.PRODUCTION,
    prediction_id="pred-002",
    prediction_label=0.9,
    embedding_features={
        "text_embedding": Embedding(
            vector=np.random.randn(384).tolist(),
            data="The original text that was embedded",
        )
    },
)

Drift Detection

مراقبة انحراف البيانات

# Arize automatically detects drift when you log training and production data.
# `client`, `ModelTypes` and `Environments` come from the logging setup snippet.

# Log training (baseline) data.
# NOTE(review): the single-record logger is believed to reject training
# records that lack an actual, so one is included here - confirm against the
# Arize SDK reference.
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,  # ModelTypes has no SCORE member
    environment=Environments.TRAINING,
    prediction_id="train-001",
    prediction_label=0.75,
    actual_label=0.8,
    features={"income": 50000, "age": 35},
)

# Log production data - Arize compares its distributions with the baseline
client.log(
    model_id="my-model",
    model_type=ModelTypes.NUMERIC,
    environment=Environments.PRODUCTION,
    prediction_id="prod-001",
    prediction_label=0.82,
    features={"income": 120000, "age": 28},
)
# Drift alerts trigger when PSI/KL-divergence exceeds thresholds

Prompt Evaluation and Tracking

# Log an LLM prompt/response pair, with its template and model settings, for
# evaluation. `client`, `ModelTypes` and `Environments` come from the logging
# setup snippet.
client.log(
    model_id="my-llm-app",
    model_type=ModelTypes.GENERATIVE_LLM,
    environment=Environments.PRODUCTION,
    prediction_id="llm-001",
    prompt="Summarize this article about AI safety",
    response="AI safety research focuses on...",
    prompt_template="Summarize this article about {topic}",
    prompt_template_version="v2",
    # The model name and its invocation parameters are separate keyword
    # arguments; LLMConfigColumnNames only names DataFrame columns for the
    # pandas batch logger and has no MODEL_NAME / TEMPERATURE members.
    llm_model_name="gpt-4o",
    llm_params={"temperature": 0.7},
)

Phoenix Datasets

import pandas as pd
import phoenix as px
from phoenix.experiments import run_experiment

# Client for the running Phoenix server
client = px.Client()

# Upload a small evaluation dataset built from a DataFrame
dataset = client.upload_dataset(
    dataset_name="eval-set",
    dataframe=pd.DataFrame({
        "input": ["q1", "q2", "q3"],
        "expected_output": ["a1", "a2", "a3"],
    }),
    input_keys=["input"],
    output_keys=["expected_output"],
)

# Run the task over the dataset and score the outputs.
# run_experiment is a module-level function in phoenix.experiments, not a
# method of px.Client. `my_task_function` and `my_evaluator` are placeholders
# for your own callables and are not defined in this snippet.
task_results = run_experiment(
    dataset=dataset,
    task=my_task_function,
    evaluators=[my_evaluator],
    experiment_name="baseline-eval",
)