Skip to content

BentoML Cheat Sheet

Overview

BentoML is an open-source framework for building, shipping, and scaling AI applications. It provides a standardized way to package trained ML models with their serving logic, dependencies, and configurations into portable artifacts called Bentos. These Bentos can be deployed as microservices, containerized with Docker, or deployed to cloud platforms including BentoCloud, AWS, GCP, and Kubernetes clusters.

BentoML supports all major ML frameworks including PyTorch, TensorFlow, Scikit-learn, XGBoost, LightGBM, HuggingFace Transformers, ONNX, and custom models. It provides features like adaptive batching for throughput optimization, built-in model management with versioning, parallel inference, GPU support, and a runner architecture that separates API logic from model computation. The framework handles the entire lifecycle from model development to production deployment.

Installation

# Install BentoML
pip install bentoml

# Install together with the ML framework you want to serve.
# BentoML does not publish framework extras such as "bentoml[pytorch]";
# the framework is a separate package installed alongside it.
pip install bentoml torch
pip install bentoml tensorflow
pip install bentoml transformers

# Verify installation
bentoml --version

# Check available models in local store
bentoml models list

Core Workflow

Save a Model

import bentoml
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Train model
X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

# Save to BentoML model store.
# A runner only exposes the model methods listed in `signatures`, so
# `predict_proba` is registered here as well: the service's `classify`
# endpoint calls `iris_runner.predict_proba`.
saved_model = bentoml.sklearn.save_model(
    "iris_classifier",
    clf,
    labels={"team": "data-science", "stage": "production"},
    metadata={"accuracy": 0.96, "dataset": "iris"},
    signatures={
        "predict": {"batchable": True},
        "predict_proba": {"batchable": True},
    },
)
print(f"Model saved: {saved_model}")

Define a Service

# service.py
import bentoml
import numpy as np
from bentoml.io import NumpyNdarray, JSON
from pydantic import BaseModel
from typing import List

# Request schema for the `classify` endpoint: the four iris measurements,
# listed in the same order in which `classify` packs them into the input row.
class IrisFeatures(BaseModel):
    sepal_length: float
    sepal_width: float
    petal_length: float
    petal_width: float

# Response schema for the `classify` endpoint.
class PredictionResponse(BaseModel):
    # First element of the runner's `predict` output, cast to int
    prediction: int
    # First row of the runner's `predict_proba` output, converted to a list
    probability: List[float]

# Look up the stored model by tag (":latest" selects the most recently saved version)
iris_model = bentoml.sklearn.get("iris_classifier:latest")

# Wrap the model in a runner; the API functions below call it through `iris_runner`
iris_runner = iris_model.to_runner()

# Define the service and register the runner with it
svc = bentoml.Service("iris_service", runners=[iris_runner])

@svc.api(input=NumpyNdarray(), output=NumpyNdarray())
async def predict(input_array: np.ndarray) -> np.ndarray:
    # Delegate inference to the runner and hand its result back unchanged.
    predicted = await iris_runner.predict.async_run(input_array)
    return predicted

@svc.api(input=JSON(pydantic_model=IrisFeatures), output=JSON(pydantic_model=PredictionResponse))
async def classify(features: IrisFeatures) -> PredictionResponse:
    # Pack the four measurements into a single-row 2-D array.
    row = [
        features.sepal_length,
        features.sepal_width,
        features.petal_length,
        features.petal_width,
    ]
    batch = np.array([row])

    # NOTE: a runner only exposes methods listed in the model's `signatures`
    # at save time, so `predict_proba` has to be registered there.
    labels = await iris_runner.predict.async_run(batch)
    scores = await iris_runner.predict_proba.async_run(batch)

    # Only one row was sent, so take the first entry of each result.
    predicted_label = int(labels[0])
    class_probabilities = scores[0].tolist()
    return PredictionResponse(
        prediction=predicted_label,
        probability=class_probabilities,
    )

Serve Locally

# Start development server for the `svc` object in service.py on port 3000
# (--reload restarts the server when the source files change)
bentoml serve service:svc --reload --port 3000

# Or specify the service file by its filename instead of its module name
bentoml serve service.py:svc

Build and Deploy

# Build a Bento
bentoml build

# List built Bentos
bentoml list

# Containerize as Docker image.
# By default the image is tagged with the Bento's resolved version rather than
# "latest", so pass -t to give it the fixed tag used by `docker run` below.
bentoml containerize iris_service:latest -t iris_service:latest

# Run Docker container
docker run -p 3000:3000 iris_service:latest

CLI Commands

| Command | Description |
|---|---|
| `bentoml serve <service>` | Start development server |
| `bentoml build` | Build a Bento from service |
| `bentoml list` | List all built Bentos |
| `bentoml get <bento>` | Get Bento details |
| `bentoml delete <bento>` | Delete a Bento |
| `bentoml export <bento> <path>` | Export Bento to file |
| `bentoml import <path>` | Import Bento from file |
| `bentoml containerize <bento>` | Build Docker image |
| `bentoml push <bento>` | Push to BentoCloud |
| `bentoml pull <bento>` | Pull from BentoCloud |
| `bentoml models list` | List saved models |
| `bentoml models get <model>` | Get model details |
| `bentoml models delete <model>` | Delete a model |
| `bentoml models export <model> <path>` | Export model |
| `bentoml models import <path>` | Import model |
| `bentoml deploy <bento>` | Deploy to BentoCloud |
| `bentoml deployment list` | List deployments |

Configuration

bentofile.yaml

# Entry point in "<module>:<service object>" form
service: "service:svc"
labels:
  team: ml-platform
  project: iris-classifier

# Files copied into the Bento
include:
  - "*.py"
  - "config/*.yaml"

# Files left out of the Bento even if matched by `include`
exclude:
  - "tests/"
  - "*.ipynb"

python:
  packages:
    - scikit-learn==1.4.0
    - numpy>=1.24
    - pydantic>=2.0
  lock_packages: true

envs:
  - name: MODEL_TIMEOUT
    value: "60"
  # BENTOML_CONFIG must be the path to a BentoML configuration YAML file, not
  # an environment name. This path assumes the file is shipped through the
  # "config/*.yaml" include above and the default Bento location inside the
  # image; adjust it if your layout differs.
  - name: BENTOML_CONFIG
    value: "/home/bentoml/bento/src/config/bentoml_configuration.yaml"

docker:
  distro: debian
  python_version: "3.11"
  cuda_version: null
  system_packages:
    - libgomp1
  setup_script: "./setup.sh"
  dockerfile_template: null

Service Configuration

# Endpoint settings are passed to the decorator: the input descriptor is
# declared as float64 with shape (-1, 4), the output as int64, and the
# endpoint is served under a custom route.
@svc.api(
    route="/v1/predict",
    input=NumpyNdarray(shape=(-1, 4), dtype="float64"),
    output=NumpyNdarray(dtype="int64"),
)
async def predict(input_array: np.ndarray) -> np.ndarray:
    labels = await iris_runner.predict.async_run(input_array)
    return labels

# Runner configuration: set the runner's name and its adaptive-batching
# limits at creation time.
iris_runner = iris_model.to_runner(
    name="iris_runner",
    max_batch_size=100,  # upper bound on the size of one batch
    max_latency_ms=500,  # latency budget for a batched call, in milliseconds
)

Adaptive Batching

# Configure batching at model save time: mark `predict` as batchable and
# name the axis along which individual inputs are stacked.
# (`clf` is the trained estimator from the "Save a Model" example.)
bentoml.sklearn.save_model(
    "iris_classifier",
    clf,
    signatures={
        "predict": {
            "batchable": True,
            "batch_dim": 0,
        }
    },
)

# Or configure at runner creation.
# (`model` stands for a stored model reference, e.g. the result of
# `bentoml.sklearn.get(...)`.)
runner = model.to_runner(
    max_batch_size=64,  # upper bound on the size of one batch
    max_latency_ms=300,  # latency budget for a batched call, in milliseconds
)

Advanced Usage

Multi-Model Service

import bentoml
from bentoml.io import JSON, NumpyNdarray

# One runner per pipeline stage, each created from its own stored model
preprocessor = bentoml.sklearn.get("preprocessor:latest").to_runner()
classifier = bentoml.sklearn.get("classifier:latest").to_runner()
postprocessor = bentoml.sklearn.get("postprocessor:latest").to_runner()

# A single service that registers all three runners
svc = bentoml.Service(
    "pipeline_service",
    runners=[preprocessor, classifier, postprocessor]
)

@svc.api(input=JSON(), output=JSON())
async def predict(input_data: dict) -> dict:
    # The stages run strictly in sequence: each one consumes the previous output.
    raw_features = input_data["features"]
    transformed = await preprocessor.transform.async_run(raw_features)
    labels = await classifier.predict.async_run(transformed)
    formatted = await postprocessor.format.async_run(labels)
    return {"result": formatted}

GPU Configuration

import bentoml

# Create the runner. `Model.to_runner()` does not accept
# `runnable_init_params` (that argument belongs to `bentoml.Runner(...)` for
# custom runnables), so a GPU is requested through runner resources instead.
# (`model` stands for a stored model reference, e.g. `bentoml.pytorch.get(...)`.)
runner = model.to_runner(name="gpu_runner")

# In the BentoML configuration file (the file BENTOML_CONFIG points to)
# runners:
#   gpu_runner:
#     resources:
#       nvidia.com/gpu: 1

# In bentofile.yaml
# docker:
#   cuda_version: "11.8"

Custom Runner

import bentoml

class MyCustomRunnable(bentoml.Runnable):
    """Runnable that serves a PyTorch model loaded from ``model.pt``."""

    # Resource kinds this runnable declares it can run on.
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        import torch

        # Use the GPU when this worker has one; otherwise stay on CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # map_location lets a checkpoint saved on a GPU load on a CPU-only worker.
        # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
        self.model = torch.load("model.pt", map_location=self.device)
        self.model.eval()

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, input_tensor):
        """Run the model on ``input_tensor`` without tracking gradients.

        The input is moved to the model's device first; a tensor result is
        moved back to CPU before being returned, any other result is returned
        as-is.
        """
        import torch

        with torch.no_grad():
            output = self.model(input_tensor.to(self.device))
        # Return CPU tensors so the caller does not need a GPU to read them.
        return output.cpu() if isinstance(output, torch.Tensor) else output

# Build a runner directly from the custom Runnable class
custom_runner = bentoml.Runner(
    MyCustomRunnable,
    name="custom_runner",
    max_batch_size=32,  # upper bound on the size of one batch
)

# Register the runner with the service
svc = bentoml.Service("custom_service", runners=[custom_runner])

Monitoring and Metrics

# BentoML exposes Prometheus metrics at /metrics
# Key metrics:
# - bentoml_api_server_request_duration_seconds
# - bentoml_api_server_request_total
# - bentoml_runner_request_duration_seconds
# - bentoml_runner_request_total
# - bentoml_runner_adaptive_batch_size

# Access via: curl http://localhost:3000/metrics

Troubleshooting

| Issue | Solution |
|---|---|
| Model not found | Check `bentoml models list`. Verify model name and version tag |
| Build fails | Check `bentofile.yaml` syntax. Verify all included files exist |
| Container build fails | Check Docker daemon is running. Verify base image compatibility |
| Slow inference | Enable adaptive batching. Check runner resource allocation |
| GPU not detected | Verify CUDA drivers installed. Set `cuda_version` in `bentofile.yaml` |
| Import errors in container | Add missing packages to `bentofile.yaml` `python.packages` list |
| Port already in use | Change port with `--port` flag. Kill existing process on the port |
| Runner timeout | Increase timeout configuration. Check model loading time |
| Batch dimension mismatch | Verify `batch_dim` in signatures matches input array shape |
| Memory leak in service | Check for retained references in service code. Monitor with `/metrics` |