Aller au contenu

Commandes DeepSpeed

DeepSpeed est une bibliothèque d’optimisation d’apprentissage profond de Microsoft qui permet l’entraînement distribué et l’inférence efficace pour les grands modèles. Elle fournit l’optimisation mémoire ZeRO, le parallélisme de pipeline, l’entraînement en précision mixte et le déchargement CPU/NVMe.

Installation

# Install from PyPI
pip install deepspeed

# Install with all compatible C++/CUDA ops pre-built (instead of JIT-compiling them on first use)
DS_BUILD_OPS=1 pip install deepspeed

# Install with CPU Adam optimizer
DS_BUILD_CPU_ADAM=1 pip install deepspeed

# Install with sparse attention
DS_BUILD_SPARSE_ATTN=1 pip install deepspeed

# Verify installation and check available ops
ds_report

# Check version
python -c "import deepspeed; print(deepspeed.__version__)"

Lanceur DeepSpeed

# Single node, all GPUs
deepspeed train.py --deepspeed ds_config.json

# Single node, specific GPU count
deepspeed --num_gpus=4 train.py --deepspeed ds_config.json

# Single node, specific GPUs
deepspeed --include localhost:0,1,2,3 train.py --deepspeed ds_config.json

# Multi-node training
deepspeed --num_nodes=2 \
  --hostfile hostfile.txt \
  train.py --deepspeed ds_config.json

# With HuggingFace Accelerate
accelerate launch --use_deepspeed \
  --deepspeed_config_file ds_config.json \
  train.py

Configuration ZeRO étape 1

{
  "train_batch_size": 32,
  "gradient_accumulation_steps": 4,
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 2e-5,
      "betas": [0.9, 0.999],
      "eps": 1e-8,
      "weight_decay": 0.01
    }
  },
  "scheduler": {
    "type": "WarmupDecayLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 2e-5,
      "warmup_num_steps": 100,
      "total_num_steps": 1000
    }
  },
  "zero_optimization": {
    "stage": 1
  },
  "bf16": {
    "enabled": true
  }
}

Configuration ZeRO étape 2

{
  "train_batch_size": 32,
  "gradient_accumulation_steps": 4,
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true
  },
  "bf16": {
    "enabled": true
  }
}

Configuration ZeRO étape 3

{
  "train_batch_size": 32,
  "gradient_accumulation_steps": 8,
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
  }
}

ZeRO étape 3 avec déchargement NVMe

{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "nvme",
      "nvme_path": "/local_nvme",
      "pin_memory": true,
      "buffer_count": 4
    },
    "offload_param": {
      "device": "nvme",
      "nvme_path": "/local_nvme",
      "pin_memory": true,
      "buffer_count": 5,
      "max_in_cpu": 1e9
    }
  }
}

Intégration Python

import deepspeed
import torch

# MyModel is a placeholder: it is not defined in this snippet.
model = MyModel()

# Initialize DeepSpeed engine
# The call returns four values; only the first two are kept here.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",  # same config file name used by the launcher commands
)

# Training loop
# `dataloader` is not defined in this snippet and must be created beforehand.
for batch in dataloader:
    # assumes the model's forward pass returns the loss directly — TODO confirm for your model
    loss = model_engine(batch)
    # backward and step are called on the engine, not on the loss / optimizer objects
    model_engine.backward(loss)
    model_engine.step()

# Save checkpoint
model_engine.save_checkpoint("./checkpoints", tag="step_1000")

# Load checkpoint
# Uses the same directory and tag that were passed to save_checkpoint above.
model_engine.load_checkpoint("./checkpoints", tag="step_1000")

Intégration HuggingFace Trainer

from transformers import TrainingArguments, Trainer

# NOTE(review): batch size, gradient accumulation and bf16 are set here and may
# also be set in ds_config.json; keep both places consistent — confirm how the
# Trainer reconciles conflicting values.
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    bf16=True,
    deepspeed="ds_config.json",  # Just point to config
)

# `model` and `dataset` are not defined in this snippet; create them beforehand.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

Parallélisme de pipeline

import deepspeed
import torch  # required: the layer specs below reference torch.nn
from deepspeed.pipe import PipelineModule, LayerSpec

# Define pipeline-parallel model
# vocab_size, hidden_size, num_layers and TransformerBlock are placeholders
# that must be defined before this snippet is run.
class PipeModel(PipelineModule):
    """Embedding, `num_layers` transformer blocks and a linear output layer.

    The layers are declared as LayerSpec entries and handed to PipelineModule
    together with the requested number of pipeline stages.

    Args:
        num_stages: Number of pipeline stages, forwarded to PipelineModule.
    """

    def __init__(self, num_stages):
        layers = [
            LayerSpec(torch.nn.Embedding, vocab_size, hidden_size),
            *[LayerSpec(TransformerBlock, hidden_size) for _ in range(num_layers)],
            LayerSpec(torch.nn.Linear, hidden_size, vocab_size),
        ]
        super().__init__(layers=layers, num_stages=num_stages)

model = PipeModel(num_stages=4)
# The config is passed inline as a dict here instead of a JSON file path.
engine, _, _, _ = deepspeed.initialize(
    model=model,
    config={
        "train_batch_size": 32,
        "train_micro_batch_size_per_gpu": 4,
        # NOTE(review): verify against the DeepSpeed config reference that
        # "micro_batches" is a recognized key of the "pipeline" section — TODO confirm.
        "pipeline": {"micro_batches": 8},
    },
)

Inférence DeepSpeed

import deepspeed
import torch  # required: torch.float16 is used as the inference dtype below
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Initialize for inference with tensor parallelism
ds_engine = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": 4},
    dtype=torch.float16,
    replace_with_kernel_inject=True,
)

# Use the model
# `input_ids` is not defined in this snippet; tokenize a prompt beforehand.
output = ds_engine.module.generate(input_ids, max_new_tokens=100)

Format du fichier d’hôtes

# hostfile.txt for multi-node training
worker-0 slots=8
worker-1 slots=8
worker-2 slots=4

Options de précision mixte

{
  "bf16": {
    "enabled": true
  }
}
{
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  }
}

Comparaison des étapes ZeRO

Fonctionnalité | Stage 1 | Stage 2 | Stage 3
--- | --- | --- | ---
Optimizer state partitioning | Yes | Yes | Yes
Gradient partitioning | No | Yes | Yes
Parameter partitioning | No | No | Yes
CPU offloading | Optimizer | Optimizer | Optimizer + Params
NVMe offloading | No | No | Yes
Memory savings | ~4x | ~8x | ~Nx (linear)

Commandes courantes

Task | Commande
--- | ---
Launch training | deepspeed train.py --deepspeed ds_config.json
Multi-GPU (4) | deepspeed --num_gpus=4 train.py --deepspeed ds_config.json
Multi-node | deepspeed --hostfile hostfile.txt train.py --deepspeed ds_config.json
System report | ds_report
Convert ZeRO-3 checkpoint | python zero_to_fp32.py checkpoint_dir output.pt