ONNX Runtime هو محرك استدلال متعدد المنصات لتشغيل نماذج ONNX مع تحسينات خاصة بالعتاد. يدعم موفري تنفيذ لـ CUDA و TensorRT و DirectML و CoreML والمزيد، مما يتيح النشر الفعال عبر المعالج ووحدة الرسومات والأجهزة الطرفية.
التثبيت
# CPU-only
pip install onnxruntime
# GPU (CUDA)
pip install onnxruntime-gpu
# Install ONNX for model export
pip install onnx onnxscript
# Install optimization tools
pip install onnxruntime-tools
# Install for model conversion from PyTorch
# (quoted so shells such as zsh do not treat the [...] extras as a glob pattern)
pip install "optimum[onnxruntime-gpu]"
# Verify
python -c "import onnxruntime as ort; print(ort.__version__); print(ort.get_available_providers())"
Basic Inference
import onnxruntime as ort
import numpy as np

# Create inference session; providers are listed in priority order
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# Inspect model inputs and outputs
for inp in session.get_inputs():
    print(f"Input: {inp.name}, Shape: {inp.shape}, Type: {inp.type}")
for out in session.get_outputs():
    print(f"Output: {out.name}, Shape: {out.shape}, Type: {out.type}")

# Run inference
# The feed key must match one of the input names printed above ("input" here)
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
results = session.run(
    None,  # None = all outputs
    {"input": input_data},
)
output = results[0]
print(f"Output shape: {output.shape}")
Execution Providers
import onnxruntime as ort

# Show which execution providers this build exposes
print(ort.get_available_providers())

# NVIDIA GPU through CUDA
cuda_only = ["CUDAExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=cuda_only)

# CUDA pinned to a specific device, with the CPU provider listed after it
cuda_on_device = [
    ("CUDAExecutionProvider", {"device_id": 0}),
    "CPUExecutionProvider",
]
session = ort.InferenceSession("model.onnx", providers=cuda_on_device)

# TensorRT (optimized for NVIDIA GPUs), followed by CUDA and CPU
tensorrt_options = {
    "trt_max_workspace_size": 2 * 1024**3,  # 2147483648
    "trt_fp16_enable": True,
}
tensorrt_chain = [
    ("TensorrtExecutionProvider", tensorrt_options),
    "CUDAExecutionProvider",
    "CPUExecutionProvider",
]
session = ort.InferenceSession("model.onnx", providers=tensorrt_chain)

# DirectML (Windows GPU)
directml_only = ["DmlExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=directml_only)

# CoreML (macOS/iOS)
coreml_only = ["CoreMLExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=coreml_only)

# OpenVINO (Intel hardware)
openvino_only = ["OpenVINOExecutionProvider"]
session = ort.InferenceSession("model.onnx", providers=openvino_only)
خيارات الجلسة (Session Options)
import onnxruntime as ort

# Start from default session options and override the settings below
options = ort.SessionOptions()

# Parallel execution mode
options.execution_mode = ort.ExecutionMode.ORT_PARALLEL

# Intra-op and inter-op thread counts
options.inter_op_num_threads = 2
options.intra_op_num_threads = 4

# Apply every graph optimization level and write the optimized graph to disk
options.optimized_model_filepath = "model_optimized.onnx"
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

# Memory settings
options.enable_cpu_mem_arena = True
options.enable_mem_pattern = True

# Turn on profiling
options.enable_profiling = True

session = ort.InferenceSession(
    "model.onnx",
    sess_options=options,
    providers=["CPUExecutionProvider"],
)
Export PyTorch Model to ONNX
import onnx
import torch
import torch.onnx

# Define or load your PyTorch model (MyModel is a placeholder for your own class)
model = MyModel()
model.eval()

# Create dummy input matching model's expected input
dummy_input = torch.randn(1, 3, 224, 224)

# Export to ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    # Mark dimension 0 of both tensors as a variable-size batch axis
    dynamic_axes={
        "input": {0: "batch_size"},
        "output": {0: "batch_size"},
    },
    opset_version=17,
    do_constant_folding=True,
)

# Verify the exported model.
# Loaded under a separate name so the PyTorch `model` above is not overwritten.
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
print("Model exported and verified successfully")
Export HuggingFace Models with Optimum
# Export transformer model to ONNX
optimum-cli export onnx \
    --model bert-base-uncased \
    --task text-classification \
    ./bert-onnx/
# Export with specific opset
optimum-cli export onnx \
    --model meta-llama/Llama-3.1-8B \
    --task text-generation \
    --opset 17 \
    ./llama-onnx/
# Export with FP16 (half-precision export must run on GPU, so --device cuda is required)
optimum-cli export onnx \
    --model bert-base-uncased \
    --task text-classification \
    --device cuda \
    --fp16 \
    ./bert-onnx-fp16/
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Directory holding the exported ONNX model and its tokenizer files
model_dir = "./bert-onnx"

# Load the ONNX model with Optimum, running on the CUDA execution provider
model = ORTModelForSequenceClassification.from_pretrained(
    model_dir,
    provider="CUDAExecutionProvider",
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Tokenize to NumPy arrays, run the model, and print the raw logits
inputs = tokenizer("This is great!", return_tensors="np")
outputs = model(**inputs)
print(outputs.logits)
Model Optimization
from onnxruntime.quantization import (
    CalibrationDataReader,
    QuantFormat,  # needed for quant_format=QuantFormat.QDQ below
    QuantType,
    quantize_dynamic,
    quantize_static,
)

# Dynamic quantization (no calibration data needed)
quantize_dynamic(
    model_input="model.onnx",
    model_output="model_int8.onnx",
    weight_type=QuantType.QInt8,
)


# Static quantization (requires calibration data)
class MyCalibrationDataReader(CalibrationDataReader):
    """Hand calibration samples to quantize_static one at a time."""

    def __init__(self, calibration_data):
        # calibration_data: any iterable of samples.
        # NOTE(review): presumably each sample is a dict mapping model input
        # names to NumPy arrays — confirm against the model being quantized.
        self.data = iter(calibration_data)

    def get_next(self):
        """Return the next sample, or None once the data is exhausted."""
        return next(self.data, None)


# calibration_samples is a placeholder: supply your own representative inputs
calibration_reader = MyCalibrationDataReader(calibration_samples)
quantize_static(
    model_input="model.onnx",
    model_output="model_static_int8.onnx",
    calibration_data_reader=calibration_reader,
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8,
    activation_type=QuantType.QInt8,
)
Graph Optimization
# Run the onnxruntime.transformers optimizer on a BERT-style model
python -m onnxruntime.transformers.optimizer \
    --model_type bert \
    --num_heads 12 \
    --hidden_size 768 \
    --input model.onnx \
    --output model_optimized.onnx \
    --opt_level 2 \
    --use_gpu
# Programmatic graph optimization
import onnxruntime as ort

options = ort.SessionOptions()
options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
options.optimized_model_filepath = "model_optimized.onnx"

# Creating the session runs the optimizations and saves the optimized model.
# Providers are passed explicitly: GPU-enabled builds (ORT 1.9+) raise an
# error when InferenceSession is created without a providers list.
session = ort.InferenceSession(
    "model.onnx",
    options,
    providers=["CPUExecutionProvider"],
)
قياس الأداء
import onnxruntime as ort
import numpy as np
import time

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

# Warmup: run a few untimed iterations before measuring
for _ in range(10):
    session.run(None, {"input": input_data})

# Benchmark
num_iterations = 100
start = time.perf_counter()
for _ in range(num_iterations):
    session.run(None, {"input": input_data})
end = time.perf_counter()

# Seconds per run converted to milliseconds
avg_latency = (end - start) / num_iterations * 1000
print(f"Average latency: {avg_latency:.2f} ms")
# avg_latency is in ms, so 1000 / avg_latency is runs per second
print(f"Throughput: {1000 / avg_latency:.1f} inferences/sec")
IO Binding (Zero-Copy GPU)
import onnxruntime as ort
import numpy as np

session = ort.InferenceSession("model.onnx", providers=["CUDAExecutionProvider"])

# Build the input on the host, then wrap it in an OrtValue on CUDA device 0
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
input_tensor = ort.OrtValue.ortvalue_from_numpy(input_data, "cuda", 0)

# Create the IO binding and attach the input and output to it
io_binding = session.io_binding()
io_binding.bind_ortvalue_input("input", input_tensor)
io_binding.bind_output("output", "cuda")

# Run using the bound input and output
session.run_with_iobinding(io_binding)

# Fetch the first bound output as a NumPy array
bound_outputs = io_binding.get_outputs()
output = bound_outputs[0].numpy()
Execution Providers Comparison
| Provider | Hardware | Best For |
|---|---|---|
| CPUExecutionProvider | Any CPU | Universal fallback |
| CUDAExecutionProvider | NVIDIA GPU | GPU inference |
| TensorrtExecutionProvider | NVIDIA GPU | Maximum GPU throughput |
| DmlExecutionProvider | Windows GPU | Windows GPU (any vendor) |
| CoreMLExecutionProvider | Apple Silicon | macOS/iOS |
| OpenVINOExecutionProvider | Intel CPU/GPU/VPU | Intel hardware |
| QNNExecutionProvider | Qualcomm NPU | Mobile/edge |
الأنماط الشائعة
| Task | Approach |
|---|---|
| Export from PyTorch | torch.onnx.export() |
| Export from HuggingFace | optimum-cli export onnx |
| Dynamic quantization | quantize_dynamic() (no calibration) |
| Static quantization | quantize_static() (with calibration) |
| GPU inference | providers=["CUDAExecutionProvider"] |
| Graph optimization | ORT_ENABLE_ALL optimization level |
| Zero-copy GPU | Use IOBinding |
| Profiling | options.enable_profiling = True |