Aller au contenu

Commandes Accelerate

Accelerate est une bibliothèque Hugging Face qui permet d’exécuter du code PyTorch sur n’importe quelle configuration distribuée avec un minimum de modifications du code. Elle prend en charge le multi-GPU, le multi-nœud, les TPU, la précision mixte ainsi que l’intégration avec DeepSpeed et FSDP.

Installation

# Install from PyPI
pip install accelerate

# Install with DeepSpeed support
# (extras are quoted so that shells such as zsh do not treat the brackets as a glob pattern)
pip install "accelerate[deepspeed]"

# Install with testing tools
pip install "accelerate[testing]"

# Verify installation
accelerate env

Configuration

# Interactive configuration wizard (answers are saved to the default config file)
accelerate config

# Generate a default config without going through the prompts
accelerate config default

# Run the wizard and save the answers to a specific file instead of the default location
accelerate config --config_file multi_gpu.yaml

# Print environment details together with the current default config
accelerate env

# Estimate the memory needed to load a Hub model in the listed dtypes
accelerate estimate-memory meta-llama/Llama-3.1-8B --dtypes float16 int8

Exemples de fichiers de configuration

# multi_gpu.yaml - Multi-GPU on single machine
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
# One machine running four processes
num_machines: 1
num_processes: 4
mixed_precision: bf16
use_cpu: false
# deepspeed.yaml (separate file) - DeepSpeed ZeRO-3
compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
deepspeed_config:
  zero_stage: 3
  # Offload both optimizer state and parameters to CPU memory
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_save_16bit_model: true
num_machines: 1
num_processes: 4
mixed_precision: bf16
# fsdp.yaml (separate file) - Fully Sharded Data Parallel
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  # Current key name; the older `fsdp_backward_prefetch_policy` spelling is deprecated
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
num_machines: 1
num_processes: 4
mixed_precision: bf16

Commandes de lancement

# Launch with config file
accelerate launch --config_file multi_gpu.yaml train.py

# Launch with inline args (no config file)
accelerate launch --multi_gpu --num_processes=4 --mixed_precision=bf16 train.py

# Single GPU
accelerate launch --num_processes=1 train.py

# Multi-node: run the same command on every machine, changing only --machine_rank
# (0 on the main node). --num_processes is the total across all machines.
accelerate launch \
  --num_machines=2 \
  --num_processes=8 \
  --machine_rank=0 \
  --main_process_ip=10.0.0.1 \
  --main_process_port=29500 \
  train.py

# With DeepSpeed, configured through CLI flags
accelerate launch --use_deepspeed --zero_stage 3 train.py

# With DeepSpeed, configured through a DeepSpeed JSON file
# (the ZeRO stage is then taken from ds_config.json, so --zero_stage is not passed)
accelerate launch --use_deepspeed \
  --deepspeed_config_file ds_config.json \
  train.py

# TPU training
accelerate launch --tpu --num_processes=8 train.py

Classe Accelerator

from accelerate import Accelerator

accelerator = Accelerator(
    mixed_precision="bf16",
    gradient_accumulation_steps=4,
    log_with="wandb",
)

# Trackers requested through log_with must be initialized explicitly;
# without this call accelerator.log() has no tracker to write to.
accelerator.init_trackers("my_project")

# Prepare model, optimizer, dataloader, scheduler
model, optimizer, train_dataloader, scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, scheduler
)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # accumulate() decides when gradients are synchronized across processes
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Log metrics (only on main process)
        if accelerator.is_main_process:
            accelerator.log({"loss": loss.item()})

# Save model (handles distributed state).
# Pass the prepared model, not an unwrapped copy, so that sharded weights
# (DeepSpeed ZeRO-3 / FSDP) can be gathered before they are written.
accelerator.wait_for_everyone()
accelerator.save_model(model, "./output")

# Flush and close the trackers opened by init_trackers()
accelerator.end_training()

Accumulation de gradients

# Accumulate gradients over 8 batches
accelerator = Accelerator(gradient_accumulation_steps=8)

model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

for batch in dataloader:
    # Inside accumulate(), gradient synchronization and scaling are handled for us
    with accelerator.accumulate(model):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

Entraînement en précision mixte

# Via Accelerator: request bf16 mixed precision when the Accelerator is created
accelerator = Accelerator(mixed_precision="bf16")

# Custom autocast blocks: run an arbitrary forward pass under the accelerator's autocast context
with accelerator.autocast():
    output = model(input_data)

Points de contrôle

# Save full training state (model + optimizer + scheduler + RNG)
accelerator.save_state("./checkpoint-1000")

# Load training state to resume
accelerator.load_state("./checkpoint-1000")

# Save just the model weights.
# Pass the prepared model, not an unwrapped copy, so that sharded weights
# (DeepSpeed ZeRO-3 / FSDP) can be gathered before they are written.
accelerator.wait_for_everyone()
accelerator.save_model(model, "./model-final")

# Push to HuggingFace Hub from the main process only, so that ranks do not race on the upload
unwrapped = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped.push_to_hub("my-org/my-model")

Lanceur de notebooks

import torch
from accelerate import Accelerator, notebook_launcher


def training_function():
    """Training entry point passed to notebook_launcher.

    The Accelerator, model and optimizer are all created inside the function
    rather than at notebook top level.
    """
    accelerator = Accelerator()
    model = MyModel()  # placeholder: replace with your own torch.nn.Module
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model, optimizer = accelerator.prepare(model, optimizer)
    # ... training loop ...

# Launch multi-GPU training from a Jupyter notebook
notebook_launcher(training_function, num_processes=4)

Utilitaires distribués

from accelerate import Accelerator

accelerator = Accelerator()

# Check process info
print(accelerator.process_index)     # Current process rank
print(accelerator.num_processes)     # Total number of processes
print(accelerator.is_main_process)   # True on rank 0
print(accelerator.device)            # Device for this process

# Gather tensors from all processes
# NOTE(review): local_tensor is a placeholder for a tensor that exists on every process
gathered = accelerator.gather(local_tensor)

# Run only on main process (plain print guarded by an explicit check)
if accelerator.is_main_process:
    print("Training complete!")

# Synchronize all processes
accelerator.wait_for_everyone()

# Print only on main process (same effect as the guarded print above, without the explicit check)
accelerator.print("This prints once across all processes")

Chargement de grands modèles

import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

# Read only the architecture description; no weights are loaded at this point.
# Assumes ./model-dir contains the model's config.json next to the checkpoint files.
config = AutoConfig.from_pretrained("./model-dir")

# Load a model that doesn't fit in single GPU memory
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Dispatch across GPUs automatically
model = load_checkpoint_and_dispatch(
    model,
    checkpoint="./model-dir",
    device_map="auto",
    # Keep each decoder layer on a single device rather than splitting it
    no_split_module_classes=["LlamaDecoderLayer"],
    dtype=torch.float16,
)

Options courantes

| Option | Description |
| --- | --- |
| `--multi_gpu` | Multi-GPU on single machine |
| `--num_processes N` | Number of GPUs/processes |
| `--mixed_precision bf16` | BF16 mixed precision |
| `--use_deepspeed` | Enable DeepSpeed |
| `--use_fsdp` | Enable FSDP |
| `--tpu` | TPU training |
| `--num_machines N` | Multi-node count |
| `--gradient_accumulation_steps N` | Gradient accumulation |