Skip to content

MLflow

Installation

# Basic install
pip install mlflow

# With extras
pip install mlflow[extras]           # all integrations
pip install mlflow[sklearn]          # scikit-learn autolog
pip install mlflow[tensorflow]       # TensorFlow/Keras autolog
pip install mlflow[pytorch]          # PyTorch autolog

# Verify
mlflow --version

Start the tracking server:

# Local file-based (no server needed)
mlflow ui                           # http://localhost:5000

# With SQLite backend
mlflow server \
  --backend-store-uri sqlite:///mlflow.db \
  --default-artifact-root ./mlruns \
  --host 0.0.0.0 \
  --port 5000

# With PostgreSQL + S3 artifacts
mlflow server \
  --backend-store-uri postgresql://user:pass@localhost/mlflow \
  --default-artifact-root s3://my-bucket/mlflow \
  --host 0.0.0.0 \
  --port 5000

# Docker
docker run -p 5000:5000 ghcr.io/mlflow/mlflow:v2.13.0 \
  mlflow server --host 0.0.0.0

Configuration

Tracking URI

import mlflow

# Environment variable (preferred for CI/CD)
# export MLFLOW_TRACKING_URI=http://mlflow.mycompany.com:5000

# In code
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_tracking_uri("sqlite:///mlflow.db")       # local SQLite
mlflow.set_tracking_uri("./mlruns")                  # local file store
mlflow.set_tracking_uri("databricks")                # Databricks (uses env vars)

# Check current URI
print(mlflow.get_tracking_uri())

Environment Variables

# Tracking
export MLFLOW_TRACKING_URI=http://mlflow.example.com:5000
export MLFLOW_EXPERIMENT_NAME=my-experiment

# Auth (tracking server with auth)
export MLFLOW_TRACKING_USERNAME=admin
export MLFLOW_TRACKING_PASSWORD=secret

# Artifact store
export AWS_ACCESS_KEY_ID=...
export AWS_SECRET_ACCESS_KEY=...
export MLFLOW_S3_ENDPOINT_URL=https://s3.amazonaws.com

# Azure Blob
export AZURE_STORAGE_CONNECTION_STRING=...

# Databricks
export DATABRICKS_HOST=https://adb-xxx.azuredatabricks.net
export DATABRICKS_TOKEN=dapi...

MLproject (Project Config)

name: my-ml-project

python_env: python_env.yaml

entry_points:
  main:
    parameters:
      learning_rate: {type: float, default: 0.01}
      epochs: {type: int, default: 10}
    command: "python train.py --lr {learning_rate} --epochs {epochs}"

  evaluate:
    parameters:
      model_uri: {type: str}
    command: "python evaluate.py --model {model_uri}"

Core Commands

CLI

| Command | Description |
| --- | --- |
| `mlflow ui` | Start tracking UI |
| `mlflow server` | Start tracking server |
| `mlflow run .` | Run MLflow project |
| `mlflow run --env-manager=conda .` | Run with conda env |
| `mlflow run --experiment-name exp1 .` | Run in experiment |
| `mlflow models serve -m runs:/abc/model` | Serve a model |
| `mlflow models predict -m runs:/abc/model -i data.csv` | Batch predict |
| `mlflow models build-docker -m runs:/abc/model` | Build serving image |
| `mlflow experiments create -n "my-exp"` | Create experiment |
| `mlflow experiments list` | List experiments |
| `mlflow experiments search` | Search experiments |
| `mlflow runs list --experiment-id 1` | List runs |
| `mlflow runs describe --run-id abc123` | Run details |
| `mlflow artifacts list --run-id abc123` | List artifacts |
| `mlflow artifacts download --run-id abc123` | Download artifacts |
| `mlflow gc --backend-store-uri sqlite:///mlflow.db` | Permanently delete runs marked as deleted |

Python Tracking API

| Function | Description |
| --- | --- |
| `mlflow.set_experiment("name")` | Set active experiment |
| `mlflow.start_run()` | Start a run (context manager) |
| `mlflow.end_run()` | End active run |
| `mlflow.active_run()` | Get current run object |
| `mlflow.log_param("lr", 0.01)` | Log a parameter |
| `mlflow.log_params({"lr": 0.01, "epochs": 10})` | Log multiple params |
| `mlflow.log_metric("loss", 0.5, step=1)` | Log a metric |
| `mlflow.log_metrics({"loss": 0.5, "acc": 0.95})` | Log multiple metrics |
| `mlflow.log_artifact("model.pkl")` | Log a file |
| `mlflow.log_artifacts("./output/")` | Log a directory |
| `mlflow.log_image(img, "plot.png")` | Log an image |
| `mlflow.log_text("some text", "notes.txt")` | Log text as artifact |
| `mlflow.log_dict({"key": "val"}, "config.json")` | Log dict as JSON |
| `mlflow.log_figure(fig, "plot.png")` | Log matplotlib/plotly figure |
| `mlflow.set_tag("model_type", "xgboost")` | Set a tag |
| `mlflow.set_tags({"env": "prod", "team": "ml"})` | Set multiple tags |
| `mlflow.log_input(dataset, context="training")` | Log dataset |

Advanced Usage

Full Training Loop

import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Configure experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("fraud-detection-v2")

# Hyperparameters to try
params = {
    "n_estimators": 200,
    "max_depth": 8,
    "min_samples_split": 5,
    "random_state": 42
}

with mlflow.start_run(run_name="rf-baseline") as run:
    # Log params
    mlflow.log_params(params)

    # Train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log model with input/output signature
    from mlflow.models.signature import infer_signature
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        signature=signature,
        registered_model_name="FraudDetectionModel",
        input_example=X_train.iloc[:3]
    )

    # Log feature importances as artifact
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    pd.Series(model.feature_importances_, index=X_train.columns).sort_values().plot.barh(ax=ax)
    mlflow.log_figure(fig, "feature_importance.png")
    plt.close()

    print(f"Run ID: {run.info.run_id}")
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")

Autologging

# Enable autologging (logs params, metrics, model automatically)
mlflow.sklearn.autolog()
mlflow.xgboost.autolog()
mlflow.tensorflow.autolog()
mlflow.pytorch.autolog()
mlflow.lightgbm.autolog()
mlflow.statsmodels.autolog()
mlflow.spark.autolog()

# Autolog everything (framework-agnostic)
mlflow.autolog()

# Custom autolog options
mlflow.sklearn.autolog(
    log_input_examples=True,
    log_model_signatures=True,
    log_models=True,
    disable=False,
    exclusive=False,
    max_tuning_runs=5
)

# Use with normal training — everything is logged automatically
with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)   # autolog captures params, CV metrics, model

Model Registry

from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register a model from a run
result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/model",
    name="FraudDetectionModel"
)
print(f"Version: {result.version}")

# Transition version stages
client.transition_model_version_stage(
    name="FraudDetectionModel",
    version=3,
    stage="Staging",           # None → Staging → Production → Archived
    archive_existing_versions=False
)

client.transition_model_version_stage(
    name="FraudDetectionModel",
    version=3,
    stage="Production",
    archive_existing_versions=True   # archive old production version
)

# Add description and tags
client.update_model_version(
    name="FraudDetectionModel",
    version=3,
    description="XGBoost model trained on Q1 2026 data. F1=0.94"
)

client.set_model_version_tag("FraudDetectionModel", "3", "validated_by", "alice")

# Search model versions
for mv in client.search_model_versions("name='FraudDetectionModel'"):
    print(f"Version {mv.version}: {mv.current_stage}")

# Load from registry
model = mlflow.pyfunc.load_model("models:/FraudDetectionModel/Production")
model = mlflow.sklearn.load_model("models:/FraudDetectionModel/3")

MLflow Projects

# Run local project
mlflow run . -P learning_rate=0.001 -P epochs=50

# Run from Git
mlflow run https://github.com/myorg/myrepo \
  -P learning_rate=0.01 \
  --experiment-name my-experiment \
  --env-manager=conda

# Run a specific entrypoint
mlflow run . --entry-point evaluate -P model_uri=runs:/abc123/model

Model Serving

# Serve a model from a run
mlflow models serve \
  --model-uri runs:/abc123/model \
  --port 8080 \
  --no-conda

# Serve from registry
mlflow models serve \
  --model-uri models:/FraudDetectionModel/Production \
  --port 8080

# Call the REST endpoint
curl http://localhost:8080/invocations \
  -H "Content-Type: application/json" \
  -d '{"dataframe_records": [{"feature1": 1.0, "feature2": 2.0}]}'

# Build a Docker image
mlflow models build-docker \
  --model-uri models:/FraudDetectionModel/Production \
  --name fraud-model:latest

docker run -p 8080:8080 fraud-model:latest

Searching Runs

from mlflow.tracking import MlflowClient

client = MlflowClient()

# Search runs with filters
runs = client.search_runs(
    experiment_ids=["1", "2"],
    filter_string="metrics.accuracy > 0.9 and params.model_type = 'xgboost'",
    order_by=["metrics.f1_score DESC"],
    max_results=10
)

for run in runs:
    print(run.info.run_id, run.data.metrics.get("accuracy"))

# MLflow fluent API search
runs_df = mlflow.search_runs(
    experiment_names=["fraud-detection-v2"],
    filter_string="metrics.accuracy > 0.90",
    order_by=["start_time DESC"]
)
print(runs_df[["run_id", "metrics.accuracy", "params.n_estimators"]])

Common Workflows

Compare Runs and Pick Best

import mlflow

mlflow.set_experiment("hyperparameter-search")

# Run multiple experiments
for lr in [0.001, 0.01, 0.1]:
    with mlflow.start_run():
        mlflow.log_param("learning_rate", lr)
        # ... train model ...
        mlflow.log_metric("val_loss", val_loss)

# Find best run
best_run = mlflow.search_runs(
    order_by=["metrics.val_loss ASC"]
).iloc[0]

print(f"Best run: {best_run.run_id}")
print(f"Best LR: {best_run['params.learning_rate']}")
print(f"Best loss: {best_run['metrics.val_loss']}")

Nested Runs (Cross-Validation)

with mlflow.start_run(run_name="cv-parent") as parent:
    mlflow.log_param("k_folds", 5)

    fold_metrics = []
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        with mlflow.start_run(run_name=f"fold-{fold}", nested=True):
            # train and evaluate fold
            mlflow.log_metric("fold_accuracy", fold_acc)
            fold_metrics.append(fold_acc)

    # Log aggregate to parent
    mlflow.log_metric("mean_cv_accuracy", sum(fold_metrics)/len(fold_metrics))

CI/CD Model Promotion

from mlflow.tracking import MlflowClient

client = MlflowClient()

# Get latest staging model
staging = client.get_latest_versions("FraudModel", stages=["Staging"])[0]
prod = client.get_latest_versions("FraudModel", stages=["Production"])

# Promote if staging beats production
if staging_f1 > prod_f1 * 1.02:  # 2% improvement threshold
    client.transition_model_version_stage(
        name="FraudModel",
        version=staging.version,
        stage="Production",
        archive_existing_versions=True
    )

Tips and Best Practices

  • Use experiments to group related runs — create one experiment per task, dataset version, or code branch rather than dumping everything into the default experiment.
  • Log input_example and signature when logging models — this enables schema validation during serving and makes model expectations explicit.
  • Autologging is the fastest way to start — enable it and add manual log_metric calls only for business-specific metrics.
  • Nested runs keep hyperparameter search and cross-validation organized — parent run holds aggregates, children hold per-fold or per-trial results.
  • Use the model registry stage transitions (Staging → Production) as a deployment gate — code your CI/CD to check the stage before deploying.
  • Store artifacts for reproducibility — log your training data hash, preprocessing pipeline, and environment requirements alongside the model.
  • SQLite backend is fine for small teams — switch to PostgreSQL for concurrent multi-user setups.
  • Tag runs with metadata (git_commit, dataset_version, author) to make them searchable months later.
  • mlflow.search_runs() returns a DataFrame — use it to compare experiments programmatically in notebooks or CI scripts.
  • Set MLFLOW_EXPERIMENT_NAME in your CI environment to ensure every automated run goes to the correct experiment without hardcoding.
  • Model signatures prevent silent type errors at serving time — always infer or define them explicitly.