Comandi TRL
TRL (Transformer Reinforcement Learning) è una libreria full-stack per il post-training di LLM tramite fine-tuning supervisionato (SFT), apprendimento per rinforzo dal feedback umano (RLHF) e metodi di allineamento diretto come DPO. Si integra con Hugging Face Transformers e PEFT.
Installazione
# Install the latest release from PyPI
pip install trl
# Install with the PEFT and DeepSpeed extras (quoted so that shells such as
# zsh do not treat the square brackets as a glob pattern)
pip install "trl[peft,deepspeed]"
# Install the development version from source
pip install git+https://github.com/huggingface/trl.git
# Verify the installation by printing the installed version
python -c "import trl; print(trl.__version__)"
Utilizzo CLI
# Supervised fine-tuning from the command line
trl sft \
    --model_name_or_path meta-llama/Llama-3.1-8B \
    --dataset_name trl-lib/Capybara \
    --output_dir ./sft-output \
    --per_device_train_batch_size 2 \
    --num_train_epochs 1

# Preference optimization (DPO), starting from the SFT checkpoint above
trl dpo \
    --model_name_or_path ./sft-output \
    --dataset_name trl-lib/ultrafeedback_binarized \
    --output_dir ./dpo-output \
    --beta 0.1

# Open an interactive chat session with the trained model
trl chat --model_name_or_path ./sft-output
SFTTrainer
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Load the base model, its tokenizer, and the training split.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
dataset = load_dataset("trl-lib/Capybara", split="train")

# Configure training
training_args = SFTConfig(
    output_dir="./sft-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step on each device
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=True,
    logging_steps=10,
    max_length=2048,  # Max tokenized length (named `max_seq_length` in older TRL releases)
    packing=True,  # Pack multiple samples into one sequence
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,  # Replaces the `tokenizer=` argument of older TRL releases
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
trainer.save_model("./sft-final")
DPOTrainer
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Start from the SFT checkpoint produced earlier.
model = AutoModelForCausalLM.from_pretrained("./sft-output")
tokenizer = AutoTokenizer.from_pretrained("./sft-output")

# Dataset needs: prompt, chosen, rejected columns
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(
    output_dir="./dpo-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-7,
    beta=0.1,  # KL penalty coefficient
    bf16=True,
    logging_steps=10,
    max_length=1024,
    max_prompt_length=512,
)

trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,  # Replaces the `tokenizer=` argument of older TRL releases
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
PPOTrainer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer
# NOTE(review): this snippet follows the legacy PPOTrainer interface
# (`config=`, `trainer.generate`, `trainer.step`). Recent TRL releases changed
# the PPOTrainer API - confirm against the installed version before use.

# PPO requires a value head on the model
model = AutoModelForCausalLMWithValueHead.from_pretrained("./sft-output")
tokenizer = AutoTokenizer.from_pretrained("./sft-output")
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token

ppo_config = PPOConfig(
    learning_rate=1e-5,
    batch_size=16,
    mini_batch_size=4,
    ppo_epochs=4,
    log_with="wandb",
)

trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
)

# PPO training loop
# `dataloader` and `reward_model` are placeholders the caller must provide.
for batch in dataloader:
    # NOTE(review): tokenizing a batch of differently sized queries into one
    # tensor presumably needs padding - confirm.
    query_tensors = tokenizer(batch["query"], return_tensors="pt").input_ids
    response_tensors = trainer.generate(query_tensors, max_new_tokens=128)
    rewards = reward_model(query_tensors, response_tensors)  # Your reward model
    stats = trainer.step(query_tensors, response_tensors, rewards)
GRPOTrainer
from trl import GRPOTrainer, GRPOConfig
# Group Relative Policy Optimization - no separate value (critic) model needed;
# completions sampled for the same prompt are scored and compared as a group.
training_args = GRPOConfig(
    output_dir="./grpo-output",
    # The effective batch size must be divisible by num_generations, so a
    # batch of 4 holds the 4 completions of one prompt on a single device.
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-6,
    num_generations=4,  # Number of completions per prompt
    bf16=True,
)
def reward_fn(completions, prompts, **kwargs):
    """Score each completion by its length (toy example: longer is better).

    Args:
        completions: Generated completions. Each item is either a plain
            string or a list of chat messages (dicts with a "content" key).
        prompts: Prompts that produced the completions; unused here.
        **kwargs: Extra keyword arguments supplied by the caller (for
            example additional dataset columns). They are accepted and
            ignored so that the call does not fail on unexpected names.

    Returns:
        A list with one float per completion: its character count / 100.
    """
    rewards = []
    for completion in completions:
        if isinstance(completion, str):
            text = completion
        else:
            # Conversational format: join the text of every message.
            text = "".join(message["content"] for message in completion)
        rewards.append(len(text) / 100)  # Example: reward longer responses
    return rewards
# Build the GRPO trainer: the policy is given as a checkpoint path and each
# completion is scored by `reward_fn`.
trainer = GRPOTrainer(
    model="./sft-output",
    args=training_args,
    train_dataset=dataset,
    reward_funcs=reward_fn,
)
trainer.train()
ORPOTrainer
from trl import ORPOTrainer, ORPOConfig
# Odds Ratio Preference Optimization - no reference model needed
training_args = ORPOConfig(
    output_dir="./orpo-output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    beta=0.1,
    bf16=True,
)

trainer = ORPOTrainer(
    model=model,
    processing_class=tokenizer,  # Replaces the `tokenizer=` argument of older TRL releases
    train_dataset=dataset,  # Needs prompt, chosen, rejected
    args=training_args,
)
trainer.train()
RewardTrainer
from trl import RewardTrainer, RewardConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Train a reward model for RLHF
# num_labels=1 gives the classification head a single output per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    "./sft-output", num_labels=1
)
tokenizer = AutoTokenizer.from_pretrained("./sft-output")

training_args = RewardConfig(
    output_dir="./reward-model",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-5,
    max_length=512,
)

# Dataset needs: chosen and rejected columns
trainer = RewardTrainer(
    model=model,
    processing_class=tokenizer,  # Replaces the `tokenizer=` argument of older TRL releases
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
PEFT Integration
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# LoRA adapter settings; pass them to a TRL trainer through `peft_config`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
)

# The base model is given by name and the LoRA settings are handed to the trainer.
trainer = SFTTrainer(
    model="meta-llama/Llama-3.1-8B",
    args=SFTConfig(output_dir="./lora-sft", bf16=True),
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
Dataset Formatting
from datasets import Dataset
# Conversational format (preferred): one "messages" list per example,
# each message carrying a role and its content.
conversations = [
    {
        "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi! How can I help?"},
        ],
    },
]
dataset = Dataset.from_list(conversations)

# Preference format for DPO/ORPO: a prompt with a preferred ("chosen")
# and a dispreferred ("rejected") answer.
preferences = [
    {
        "prompt": "Explain gravity.",
        "chosen": "Gravity is a fundamental force...",
        "rejected": "Gravity is magic...",
    },
]
dataset = Dataset.from_list(preferences)
Model Merging
# NOTE(review): `ModelMerger` could not be confirmed as part of TRL's
# documented public API. TRL's documented merging support appears to live in
# `trl.mergekit_utils` (`MergeConfig`) and `MergeModelCallback` - verify this
# import against the installed TRL version before relying on the snippet.
from trl import ModelMerger
# Merge multiple fine-tuned models
merger = ModelMerger(
base_model="meta-llama/Llama-3.1-8B",
models=["./sft-output", "./dpo-output"],
merge_method="linear", # linear, slerp, ties, dare
# One weight per entry in `models`, in the same order - TODO confirm
weights=[0.5, 0.5],
)
merged_model = merger.merge()
# Write the merged result to ./merged-model
merged_model.save_pretrained("./merged-model")
Training Methods Reference
| Trainer | Use Case | Required Data |
|---|---|---|
| SFTTrainer | Supervised fine-tuning | instruction/response pairs |
| DPOTrainer | Direct preference optimization | prompt + chosen + rejected |
| PPOTrainer | RLHF with reward model | prompts + reward model |
| GRPOTrainer | Group relative policy optimization | prompts + reward function |
| ORPOTrainer | Odds ratio preference optimization | prompt + chosen + rejected |
| RewardTrainer | Train reward models | chosen + rejected pairs |
| KTOTrainer | Kahneman-Tversky optimization | prompt + completion + label |
| CPOTrainer | Contrastive preference optimization | prompt + chosen + rejected |