ONNX Runtime est un moteur d’inférence multiplateforme qui exécute des modèles ONNX avec une optimisation haute performance pour CPU, GPU, NPU et appareils edge. Il prend en charge la quantification, l’optimisation de graphes et plusieurs fournisseurs d’exécution.
Installation
# Install only ONE runtime package per environment: onnxruntime (CPU) or onnxruntime-gpu (CUDA)
# CPU-only
pip install onnxruntime
# GPU (CUDA)
pip install onnxruntime-gpu
# Install ONNX for model export
pip install onnx onnxscript
# Install optimization tools
pip install onnxruntime-tools
# Install for model conversion from PyTorch
# (quoted so shells such as zsh do not treat the brackets as a glob pattern)
pip install "optimum[onnxruntime-gpu]"
# Verify: print the installed version and the execution providers this build exposes
python -c "import onnxruntime as ort; print(ort.__version__); print(ort.get_available_providers())"
Inférence de base
import onnxruntime as ort
import numpy as np

# Create inference session; providers are listed in order of preference
# (CUDA first, CPU after it)
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Inspect model inputs and outputs
for inp in session.get_inputs():
    print(f"Input: {inp.name}, Shape: {inp.shape}, Type: {inp.type}")
for out in session.get_outputs():
    print(f"Output: {out.name}, Shape: {out.shape}, Type: {out.type}")

# Run inference on a random float32 tensor of shape (1, 3, 224, 224)
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
results = session.run(
    None,  # None = all outputs
    {"input": input_data},  # key must match an input name printed above
)

# session.run returns one entry per requested output; take the first
output = results[0]
print(f"Output shape: {output.shape}")
Fournisseurs d’exécution
import onnxruntime as ort

# Show which providers are available
print(ort.get_available_providers())

# CUDA GPU
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"],
)

# CUDA with specific device, CPU listed after it
cuda_device_0 = ("CUDAExecutionProvider", {"device_id": 0})
session = ort.InferenceSession(
    "model.onnx",
    providers=[cuda_device_0, "CPUExecutionProvider"],
)

# TensorRT (optimized for NVIDIA GPUs), followed by CUDA and CPU
tensorrt_options = {
    "trt_max_workspace_size": 2147483648,
    "trt_fp16_enable": True,
}
session = ort.InferenceSession(
    "model.onnx",
    providers=[
        ("TensorrtExecutionProvider", tensorrt_options),
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)

# DirectML (Windows GPU)
session = ort.InferenceSession(
    "model.onnx",
    providers=["DmlExecutionProvider"],
)

# CoreML (macOS/iOS)
session = ort.InferenceSession(
    "model.onnx",
    providers=["CoreMLExecutionProvider"],
)

# OpenVINO (Intel hardware)
session = ort.InferenceSession(
    "model.onnx",
    providers=["OpenVINOExecutionProvider"],
)
Options de session
import onnxruntime as ort

# Build the session configuration object
options = ort.SessionOptions()

# Graph optimization: highest level, and save the optimized model to disk
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
options.optimized_model_filepath = "model_optimized.onnx"

# Execution mode and thread settings
options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
options.intra_op_num_threads = 4
options.inter_op_num_threads = 2

# Memory settings
options.enable_cpu_mem_arena = True
options.enable_mem_pattern = True

# Profiling
options.enable_profiling = True

# Create the session with the configured options on the CPU provider
session = ort.InferenceSession(
    "model.onnx",
    sess_options=options,
    providers=["CPUExecutionProvider"],
)
Exporter un modèle PyTorch en ONNX
import torch
import torch.onnx

# Define or load your PyTorch model and switch it to eval mode
model = MyModel()
model.eval()

# Dummy input matching the model's expected input
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX; axis 0 of "input" and "output" is named "batch_size"
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={name: {0: "batch_size"} for name in ("input", "output")},
    do_constant_folding=True,
    opset_version=17,
)

# Verify the exported model
# (note: this rebinds `model` from the PyTorch module to the loaded ONNX model)
import onnx

model = onnx.load("model.onnx")
onnx.checker.check_model(model)
print("Model exported and verified successfully")
Exporter des modèles HuggingFace avec Optimum
# Export transformer model to ONNX
optimum-cli export onnx \
    --model bert-base-uncased \
    --task text-classification \
    ./bert-onnx/
# Export with specific opset
optimum-cli export onnx \
    --model meta-llama/Llama-3.1-8B \
    --task text-generation \
    --opset 17 \
    ./llama-onnx/
# Export with FP16 (half-precision export must run on a GPU, hence --device cuda)
optimum-cli export onnx \
    --model bert-base-uncased \
    --task text-classification \
    --device cuda \
    --fp16 \
    ./bert-onnx-fp16/
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification

# Load the tokenizer and the ONNX model with Optimum from the same export directory
model_dir = "./bert-onnx"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = ORTModelForSequenceClassification.from_pretrained(
    model_dir,
    provider="CUDAExecutionProvider",
)

# Tokenize to NumPy tensors, run the model, and print the logits
inputs = tokenizer("This is great!", return_tensors="np")
outputs = model(**inputs)
print(outputs.logits)
Optimisation de modèles
from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,  # needed by quantize_static below; was missing from the import
    QuantType,
    quantize_dynamic,
    quantize_static,
)

# Dynamic quantization (no calibration data needed)
quantize_dynamic(
    model_input="model.onnx",
    model_output="model_int8.onnx",
    weight_type=QuantType.QInt8,
)


# Static quantization (requires calibration data)
class MyCalibrationDataReader(CalibrationDataReader):
    """Hand out calibration samples one at a time from an iterable."""

    def __init__(self, calibration_data):
        # Presumably each sample is a dict mapping input name to a NumPy
        # array, as passed to session.run -- confirm against your model.
        self.data = iter(calibration_data)

    def get_next(self):
        """Return the next sample, or None once the data is exhausted."""
        return next(self.data, None)


# `calibration_samples` is not defined in this snippet; supply your own iterable
calibration_reader = MyCalibrationDataReader(calibration_samples)
quantize_static(
    model_input="model.onnx",
    model_output="model_static_int8.onnx",
    calibration_data_reader=calibration_reader,
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
)
Optimisation de graphes
# Run the optimizer module bundled with onnxruntime on a BERT-type model
python -m onnxruntime.transformers.optimizer \
    --input model.onnx \
    --output model_optimized.onnx \
    --model_type bert \
    --num_heads 12 \
    --hidden_size 768 \
    --opt_level 2 \
    --use_gpu
# Graph optimization from Python
import onnxruntime as ort

options = ort.SessionOptions()
options.optimized_model_filepath = "model_optimized.onnx"
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Creating the session creates and saves the optimized model
session = ort.InferenceSession("model.onnx", sess_options=options)
Benchmarking
import onnxruntime as ort
import numpy as np
import time

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

# Warmup: untimed runs before the measurement starts
for _ in range(10):
    session.run(None, {"input": input_data})

# Benchmark: time num_iterations back-to-back runs
num_iterations = 100
start = time.perf_counter()
for _ in range(num_iterations):
    session.run(None, {"input": input_data})
end = time.perf_counter()

# Mean wall-clock time per run, converted from seconds to milliseconds
avg_latency = (end - start) / num_iterations * 1000
print(f"Average latency: {avg_latency:.2f} ms")
print(f"Throughput: {1000 / avg_latency:.1f} inferences/sec")
IO Binding (GPU sans copie)
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"],
)

# IO binding object for zero-copy GPU inference
io_binding = session.io_binding()

# Input: wrap the NumPy array in an OrtValue on CUDA device 0 and bind it
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
input_tensor = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)
io_binding.bind_ortvalue_input("input", input_tensor)

# Output: bind on the GPU as well
io_binding.bind_output("output", "cuda")

# Run (data stays on GPU)
session.run_with_iobinding(io_binding)

# Fetch the first bound output as a NumPy array
bound_outputs = io_binding.get_outputs()
output = bound_outputs[0].numpy()
Comparaison des fournisseurs d’exécution
| Provider | Hardware | Best For |
|---|---|---|
| CPUExecutionProvider | Any CPU | Universal fallback |
| CUDAExecutionProvider | NVIDIA GPU | GPU inference |
| TensorrtExecutionProvider | NVIDIA GPU | Maximum GPU throughput |
| DmlExecutionProvider | Windows GPU | Windows GPU (any vendor) |
| CoreMLExecutionProvider | Apple Silicon | macOS/iOS |
| OpenVINOExecutionProvider | Intel CPU/GPU/VPU | Intel hardware |
| QNNExecutionProvider | Qualcomm NPU | Mobile/edge |
Patterns courants
| Task | Approach |
|---|---|
| Export from PyTorch | torch.onnx.export() |
| Export from HuggingFace | optimum-cli export onnx |
| Dynamic quantization | quantize_dynamic() (no calibration) |
| Static quantization | quantize_static() (with calibration) |
| GPU inference | providers=["CUDAExecutionProvider"] |
| Graph optimization | ORT_ENABLE_ALL optimization level |
| Zero-copy GPU | Use IOBinding |
| Profiling | options.enable_profiling = True |