Ir al contenido

Comandos de ONNX Runtime

ONNX Runtime es un motor de inferencia multiplataforma para ejecutar modelos ONNX con optimización de alto rendimiento para CPU, GPU, NPU y dispositivos edge. Soporta cuantización, optimización de grafos y múltiples proveedores de ejecución.

Instalación

# CPU-only
pip install onnxruntime

# GPU (CUDA)
pip install onnxruntime-gpu

# Install ONNX for model export
pip install onnx onnxscript

# Install optimization tools
# NOTE: onnxruntime-tools is a legacy package; the transformer optimizer is
# also shipped inside onnxruntime itself as onnxruntime.transformers.optimizer
pip install onnxruntime-tools

# Install for model conversion from PyTorch
# (quoted so shells such as zsh do not treat the brackets as a glob pattern)
pip install "optimum[onnxruntime-gpu]"

# Verify: print the installed version and the providers this build exposes
python -c "import onnxruntime as ort; print(ort.__version__); print(ort.get_available_providers())"

Inferencia básica

import onnxruntime as ort
import numpy as np

# Create inference session; providers are listed in priority order, so the
# CPU provider acts as the fallback when CUDA cannot be used
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Inspect model inputs and outputs
for inp in session.get_inputs():
    print(f"Input: {inp.name}, Shape: {inp.shape}, Type: {inp.type}")

for out in session.get_outputs():
    print(f"Output: {out.name}, Shape: {out.shape}, Type: {out.type}")

# Run inference
# Take the feed key from the model itself instead of assuming it is "input",
# so the snippet also works for models exported with a different input name
input_name = session.get_inputs()[0].name
# Example shape for an image model; adjust to the shape printed above
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
results = session.run(
    None,  # None = all outputs
    {input_name: input_data},
)

# Results come back as a list, in the order of session.get_outputs()
output = results[0]
print(f"Output shape: {output.shape}")

Proveedores de ejecución

import onnxruntime as ort

# Show which providers this onnxruntime build exposes
print(ort.get_available_providers())

# CUDA GPU
cuda_only = ["CUDAExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=cuda_only)

# CUDA pinned to a specific device, with the CPU provider listed after it
cuda_device_0 = ("CUDAExecutionProvider", {"device_id": 0})
session = ort.InferenceSession(
    "model.onnx",
    providers=[cuda_device_0, "CPUExecutionProvider"],
)

# TensorRT (optimized for NVIDIA GPUs), followed by CUDA and then CPU
tensorrt_options = {
    "trt_max_workspace_size": 2147483648,
    "trt_fp16_enable": True,
}
session = ort.InferenceSession(
    "model.onnx",
    providers=[
        ("TensorrtExecutionProvider", tensorrt_options),
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)

# DirectML (Windows GPU)
directml_only = ["DmlExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=directml_only)

# CoreML (macOS/iOS)
coreml_only = ["CoreMLExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=coreml_only)

# OpenVINO (Intel hardware)
openvino_only = ["OpenVINOExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=openvino_only)

Opciones de sesión

import onnxruntime as ort

# Build the options object that the session below is created with
options = ort.SessionOptions()

# Graph optimization: apply every level and write the optimized graph to disk
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
options.optimized_model_filepath = "model_optimized.onnx"

# Threading: 4 threads within an operator, 2 across operators, parallel mode
options.intra_op_num_threads = 4
options.inter_op_num_threads = 2
options.execution_mode = ort.ExecutionMode.ORT_PARALLEL

# Memory: keep the memory-pattern optimization and the CPU arena switched on
options.enable_mem_pattern = True
options.enable_cpu_mem_arena = True

# Profiling
options.enable_profiling = True

session = ort.InferenceSession(
    "model.onnx",
    sess_options=options,
    providers=["CPUExecutionProvider"],
)

Exportar modelo PyTorch a ONNX

import onnx
import torch
import torch.onnx

# Define or load your PyTorch model (MyModel is a placeholder for your class)
model = MyModel()
model.eval()

# Create dummy input matching model's expected input
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    # Mark axis 0 of both tensors as a named dynamic dimension
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
    opset_version=17,
    do_constant_folding=True,
)

# Verify the exported model
# Loaded under its own name so `model` keeps referring to the PyTorch module
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
print("Model exported and verified successfully")

Exportar modelos HuggingFace con Optimum

# Export transformer model to ONNX
optimum-cli export onnx \
  --model bert-base-uncased \
  --task text-classification \
  ./bert-onnx/

# Export with specific opset
optimum-cli export onnx \
  --model meta-llama/Llama-3.1-8B \
  --task text-generation \
  --opset 17 \
  ./llama-onnx/

# Export with FP16
optimum-cli export onnx \
  --model bert-base-uncased \
  --task text-classification \
  --fp16 \
  ./bert-onnx-fp16/
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Python: load the exported directory with Optimum and classify one sentence
onnx_dir = "./bert-onnx"

tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
model = ORTModelForSequenceClassification.from_pretrained(
    onnx_dir,
    provider="CUDAExecutionProvider",
)

# Tokenize to NumPy tensors, run the model, and print the raw logits
encoded = tokenizer("This is great!", return_tensors="np")
outputs = model(**encoded)
print(outputs.logits)

Optimización de modelos

from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,
    QuantType,
    quantize_dynamic,
    quantize_static,
)

# Dynamic quantization (no calibration data needed)
quantize_dynamic(
    model_input="model.onnx",
    model_output="model_int8.onnx",
    weight_type=QuantType.QInt8,
)


# Static quantization (requires calibration data)
class MyCalibrationDataReader(CalibrationDataReader):
    """Hand pre-built calibration samples to the quantizer one at a time."""

    def __init__(self, calibration_data):
        # presumably each sample is a dict of input name -> array; confirm
        # against the model being quantized
        self.data = iter(calibration_data)

    def get_next(self):
        """Return the next sample, or None once the samples are exhausted."""
        return next(self.data, None)


# `calibration_samples` is a placeholder: supply your own iterable of samples
calibration_reader = MyCalibrationDataReader(calibration_samples)

quantize_static(
    model_input="model.onnx",
    model_output="model_static_int8.onnx",
    calibration_data_reader=calibration_reader,
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
)

Optimización de grafos

# Optimize model with onnxruntime tools
python -m onnxruntime.transformers.optimizer \
  --input model.onnx \
  --output model_optimized.onnx \
  --model_type bert \
  --num_heads 12 \
  --hidden_size 768 \
  --opt_level 2 \
  --use_gpu
# Programmatic graph optimization (Python)
import onnxruntime as ort

options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
options.optimized_model_filepath = "model_optimized.onnx"

# Creating the session runs the optimizations and saves the optimized model.
# The provider is named explicitly: onnxruntime builds that ship more than the
# CPU provider can reject a session created without a providers argument.
session = ort.InferenceSession(
    "model.onnx",
    options,
    providers=["CPUExecutionProvider"],
)

Benchmarking

import onnxruntime as ort
import numpy as np
import time

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])

# One fixed random input is reused for every run
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

# Warmup: 10 untimed runs before measuring
warmup_runs = 10
for _ in range(warmup_runs):
    session.run(None, {"input": input_data})

# Benchmark: time 100 back-to-back runs
num_iterations = 100
start = time.perf_counter()
for _ in range(num_iterations):
    session.run(None, {"input": input_data})
end = time.perf_counter()

# Mean seconds per run, converted to milliseconds
avg_latency = (end - start) / num_iterations * 1000
print(f"Average latency: {avg_latency:.2f} ms")
print(f"Throughput: {1000 / avg_latency:.1f} inferences/sec")

IO Binding (GPU sin copia)

import onnxruntime as ort
import numpy as np

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])

# Create an IO binding so inputs and outputs can be bound to device memory
io_binding = session.io_binding()

# Build an OrtValue on CUDA device 0 from the NumPy array and bind it as "input"
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
input_tensor = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)
io_binding.bind_ortvalue_input("input", input_tensor)

# Bind the "output" tensor to the CUDA device
io_binding.bind_output("output", "cuda")

# Run using the bound input and output instead of a feed dict
session.run_with_iobinding(io_binding)

# Fetch the first bound output as a NumPy array
# NOTE(review): .numpy() presumably copies the result from device to host - confirm
output = io_binding.get_outputs()[0].numpy()

Comparación de proveedores de ejecución

| Provider | Hardware | Best For |
| --- | --- | --- |
| CPUExecutionProvider | Any CPU | Universal fallback |
| CUDAExecutionProvider | NVIDIA GPU | GPU inference |
| TensorrtExecutionProvider | NVIDIA GPU | Maximum GPU throughput |
| DmlExecutionProvider | Windows GPU | Windows GPU (any vendor) |
| CoreMLExecutionProvider | Apple Silicon | macOS/iOS |
| OpenVINOExecutionProvider | Intel CPU/GPU/VPU | Intel hardware |
| QNNExecutionProvider | Qualcomm NPU | Mobile/edge |

Patrones comunes

| Task | Approach |
| --- | --- |
| Export from PyTorch | `torch.onnx.export()` |
| Export from HuggingFace | `optimum-cli export onnx` |
| Dynamic quantization | `quantize_dynamic()` (no calibration) |
| Static quantization | `quantize_static()` (with calibration) |
| GPU inference | `providers=["CUDAExecutionProvider"]` |
| Graph optimization | `ORT_ENABLE_ALL` optimization level |
| Zero-copy GPU | Use IOBinding |
| Profiling | `options.enable_profiling = True` |