DeepSpeed est une bibliothèque d’optimisation d’apprentissage profond de Microsoft qui permet l’entraînement distribué et l’inférence efficace pour les grands modèles. Elle fournit l’optimisation mémoire ZeRO, le parallélisme de pipeline, l’entraînement en précision mixte et le déchargement CPU/NVMe.
Installation
# Install from PyPI
pip install deepspeed
# Install with all compatible ops pre-built (instead of JIT-compiling them on first use)
DS_BUILD_OPS=1 pip install deepspeed
# Install with CPU Adam optimizer
DS_BUILD_CPU_ADAM=1 pip install deepspeed
# Install with sparse attention
DS_BUILD_SPARSE_ATTN=1 pip install deepspeed
# Verify installation and check available ops
ds_report
# Check version
python -c "import deepspeed; print(deepspeed.__version__)"
Lanceur DeepSpeed
# Single node, all GPUs
deepspeed train.py --deepspeed ds_config.json
# Single node, specific GPU count
deepspeed --num_gpus=4 train.py --deepspeed ds_config.json
# Single node, specific GPUs
deepspeed --include localhost:0,1,2,3 train.py --deepspeed ds_config.json
# Multi-node training
deepspeed --num_nodes=2 \
--hostfile hostfile.txt \
train.py --deepspeed ds_config.json
# With HuggingFace Accelerate
accelerate launch --use_deepspeed \
--deepspeed_config_file ds_config.json \
train.py
Configuration ZeRO étape 1
{
"train_batch_size": 32,
"gradient_accumulation_steps": 4,
"optimizer": {
"type": "AdamW",
"params": {
"lr": 2e-5,
"betas": [0.9, 0.999],
"eps": 1e-8,
"weight_decay": 0.01
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 2e-5,
"warmup_num_steps": 100,
"total_num_steps": 1000
}
},
"zero_optimization": {
"stage": 1
},
"bf16": {
"enabled": true
}
}
Configuration ZeRO étape 2
{
"train_batch_size": 32,
"gradient_accumulation_steps": 4,
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"bf16": {
"enabled": true
}
}
Configuration ZeRO étape 3
{
"train_batch_size": 32,
"gradient_accumulation_steps": 8,
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"bf16": {
"enabled": true
}
}
ZeRO étape 3 avec déchargement NVMe
{
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 4
},
"offload_param": {
"device": "nvme",
"nvme_path": "/local_nvme",
"pin_memory": true,
"buffer_count": 5,
"max_in_cpu": 1e9
}
}
}
Intégration Python
import deepspeed
import torch

# MyModel is a placeholder for your own torch.nn.Module subclass.
model = MyModel()

# Initialize DeepSpeed engine
# initialize() is unpacked as a 4-tuple; only the engine and the optimizer
# are kept here, the last two values are discarded.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)

# Training loop
# `dataloader` is a placeholder for your own iterable of batches; the model
# is assumed to return the loss directly from its forward call.
for batch in dataloader:
    loss = model_engine(batch)
    # backward() and step() are called on the engine, not on the loss/optimizer.
    model_engine.backward(loss)
    model_engine.step()

# Save checkpoint
model_engine.save_checkpoint("./checkpoints", tag="step_1000")

# Load checkpoint
model_engine.load_checkpoint("./checkpoints", tag="step_1000")
Intégration HuggingFace Trainer
from transformers import TrainingArguments, Trainer

# Hugging Face training arguments; the `deepspeed` entry hands the Trainer
# the path of the DeepSpeed JSON config.
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    bf16=True,
    deepspeed="ds_config.json",  # Just point to config
)

# `model` and `dataset` are placeholders that must be defined beforehand.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
trainer.train()
Parallélisme de pipeline
import deepspeed
import torch  # required: the layer specs below reference torch.nn
from deepspeed.pipe import PipelineModule, LayerSpec


# Define pipeline-parallel model
class PipeModel(PipelineModule):
    """Embedding -> transformer blocks -> output projection, expressed as pipeline layers.

    ``vocab_size``, ``hidden_size``, ``num_layers`` and ``TransformerBlock`` are
    placeholders that must be defined by the surrounding code.
    """

    def __init__(self, num_stages):
        """Build the layer list and hand it to PipelineModule with ``num_stages`` stages."""
        layers = [
            LayerSpec(torch.nn.Embedding, vocab_size, hidden_size),
            *[LayerSpec(TransformerBlock, hidden_size) for _ in range(num_layers)],
            LayerSpec(torch.nn.Linear, hidden_size, vocab_size),
        ]
        super().__init__(layers=layers, num_stages=num_stages)


model = PipeModel(num_stages=4)

# Only the engine is kept from the 4-tuple returned by initialize().
engine, _, _, _ = deepspeed.initialize(
    model=model,
    config={
        "train_batch_size": 32,
        "train_micro_batch_size_per_gpu": 4,
        # NOTE(review): the number of micro-batches per step is normally
        # derived from gradient_accumulation_steps; confirm that
        # `pipeline.micro_batches` is honored by your DeepSpeed version.
        "pipeline": {"micro_batches": 8},
    },
)
Inférence DeepSpeed
import deepspeed
import torch  # required: torch.float16 is passed as the inference dtype below
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Initialize for inference with tensor parallelism
ds_engine = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": 4},
    dtype=torch.float16,
    replace_with_kernel_inject=True,
)

# Use the model
# `input_ids` is a placeholder for already-tokenized input defined elsewhere.
output = ds_engine.module.generate(input_ids, max_new_tokens=100)
# hostfile.txt for multi-node training
worker-0 slots=8
worker-1 slots=8
worker-2 slots=4
Options de précision mixte
{
"bf16": {
"enabled": true
}
}
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
}
}
Comparaison des étapes ZeRO
| Fonctionnalité | Stage 1 | Stage 2 | Stage 3 |
|---|---|---|---|
| Optimizer state partitioning | Yes | Yes | Yes |
| Gradient partitioning | No | Yes | Yes |
| Parameter partitioning | No | No | Yes |
| CPU offloading | Optimizer | Optimizer | Optimizer + Params |
| NVMe offloading | No | No | Yes |
| Memory savings | ~4x | ~8x | ~Nx (linear) |
Commandes courantes
| Tâche | Commande |
|---|---|
| Launch training | deepspeed train.py --deepspeed ds_config.json |
| Multi-GPU (4) | deepspeed --num_gpus=4 train.py --deepspeed ds_config.json |
| Multi-node | deepspeed --hostfile hostfile.txt train.py --deepspeed ds_config.json |
| System report | ds_report |
| Convert ZeRO-3 checkpoint | python zero_to_fp32.py checkpoint_dir output.pt |