Comandos do Accelerate
Accelerate é uma biblioteca da Hugging Face que permite executar código PyTorch em qualquer configuração distribuída (multi-GPU, TPU, precisão mista) com alterações mínimas no código. Ela gerencia automaticamente o posicionamento nos dispositivos, a sincronização de gradientes e a precisão mista.
Instalação
# Install from PyPI
pip install accelerate
# Install with DeepSpeed support
pip install accelerate[deepspeed]
# Install with testing tools
pip install accelerate[testing]
# Verify installation
accelerate env
Configuração
# Interactive configuration wizard
accelerate config
# Generate default config
accelerate config default
# Config for multi-GPU
accelerate config --config_file multi_gpu.yaml
# View current config
accelerate env
# Estimate memory requirements
accelerate estimate-memory meta-llama/Llama-3.1-8B --dtypes float16 int8
Exemplos de arquivos de configuração
# multi_gpu.yaml - Multi-GPU on single machine
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
num_machines: 1
num_processes: 4
mixed_precision: bf16
use_cpu: false
# deepspeed.yaml - DeepSpeed ZeRO-3
compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
deepspeed_config:
  zero_stage: 3
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_save_16bit_model: true
num_machines: 1
num_processes: 4
mixed_precision: bf16
# fsdp.yaml - Fully Sharded Data Parallel
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
num_machines: 1
num_processes: 4
mixed_precision: bf16
Comandos de lançamento
# Launch with config file
accelerate launch --config_file multi_gpu.yaml train.py
# Launch with inline args (no config file)
accelerate launch --multi_gpu --num_processes=4 --mixed_precision=bf16 train.py
# Single GPU
accelerate launch --num_processes=1 train.py
# Multi-node
accelerate launch \
--num_machines=2 \
--num_processes=8 \
--machine_rank=0 \
--main_process_ip=10.0.0.1 \
--main_process_port=29500 \
train.py
# With DeepSpeed
accelerate launch --use_deepspeed \
--deepspeed_config_file ds_config.json \
--zero_stage 3 \
train.py
# TPU training
accelerate launch --tpu --num_processes=8 train.py
Accelerator Class
from accelerate import Accelerator
accelerator = Accelerator(
mixed_precision="bf16",
gradient_accumulation_steps=4,
log_with="wandb",
)
# Prepare model, optimizer, dataloader, scheduler
model, optimizer, train_dataloader, scheduler = accelerator.prepare(
model, optimizer, train_dataloader, scheduler
)
# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        # Log metrics (only on main process)
        if accelerator.is_main_process:
            accelerator.log({"loss": loss.item()})
# Save model (handles distributed state)
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
accelerator.save_model(unwrapped_model, "./output")
Gradient Accumulation
accelerator = Accelerator(gradient_accumulation_steps=8)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
for batch in dataloader:
    # accumulate() handles gradient sync and scaling
    with accelerator.accumulate(model):
        loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
Mixed Precision Training
# Via Accelerator
accelerator = Accelerator(mixed_precision="bf16")
# Custom autocast blocks
with accelerator.autocast():
    output = model(input_data)
Checkpointing
# Save full training state (model + optimizer + scheduler + RNG)
accelerator.save_state("./checkpoint-1000")
# Load training state to resume
accelerator.load_state("./checkpoint-1000")
# Save just the model weights
accelerator.wait_for_everyone()
unwrapped = accelerator.unwrap_model(model)
accelerator.save_model(unwrapped, "./model-final")
# Push to HuggingFace Hub
unwrapped.push_to_hub("my-org/my-model")
Notebook Launcher
from accelerate import notebook_launcher
def training_function():
    accelerator = Accelerator()
    model = MyModel()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model, optimizer = accelerator.prepare(model, optimizer)
    # ... training loop ...
# Launch multi-GPU training from a Jupyter notebook
notebook_launcher(training_function, num_processes=4)
Distributed Utilities
from accelerate import Accelerator
accelerator = Accelerator()
# Check process info
print(accelerator.process_index) # Current process rank
print(accelerator.num_processes) # Total number of processes
print(accelerator.is_main_process) # True on rank 0
print(accelerator.device) # Device for this process
# Gather tensors from all processes
gathered = accelerator.gather(local_tensor)
# Run only on main process
if accelerator.is_main_process:
    print("Training complete!")
# Synchronize all processes
accelerator.wait_for_everyone()
# Print only on main process
accelerator.print("This prints once across all processes")
Big Model Loading
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
# Load a model that doesn't fit in single GPU memory
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
# Dispatch across GPUs automatically
model = load_checkpoint_and_dispatch(
model,
checkpoint="./model-dir",
device_map="auto",
no_split_module_classes=["LlamaDecoderLayer"],
dtype=torch.float16,
)
Opções comuns
| Flag | Descrição |
|---|---|
| `--multi_gpu` | Multi-GPU on single machine |
| `--num_processes N` | Number of GPUs/processes |
| `--mixed_precision bf16` | BF16 mixed precision |
| `--use_deepspeed` | Enable DeepSpeed |
| `--use_fsdp` | Enable FSDP |
| `--tpu` | TPU training |
| `--num_machines N` | Multi-node count |
| `--gradient_accumulation_steps N` | Gradient accumulation |