أوامر TRL
TRL (Transformer Reinforcement Learning) هي مكتبة شاملة لمرحلة ما بعد التدريب لنماذج اللغة الكبيرة باستخدام الضبط الدقيق الخاضع للإشراف (SFT) والتعلم المعزز من التغذية الراجعة البشرية (RLHF) وطرق المحاذاة المباشرة. تتكامل بسلاسة مع Hugging Face Transformers و PEFT.
التثبيت
# Install the latest release from PyPI
pip install trl
# Install with optional extras (PEFT and DeepSpeed).
# The requirement is quoted so shells such as zsh do not treat [...] as a glob pattern.
pip install "trl[peft,deepspeed]"
# Install the development version straight from the Git repository
pip install git+https://github.com/huggingface/trl.git
# Verify the installation by printing the installed version
python -c "import trl; print(trl.__version__)"
استخدام واجهة سطر الأوامر (CLI)
# Supervised fine-tuning from the command line
trl sft \
    --model_name_or_path meta-llama/Llama-3.1-8B \
    --dataset_name trl-lib/Capybara \
    --output_dir ./sft-output \
    --per_device_train_batch_size 2 \
    --num_train_epochs 1

# Preference optimization (DPO) starting from the SFT output
trl dpo \
    --model_name_or_path ./sft-output \
    --dataset_name trl-lib/ultrafeedback_binarized \
    --output_dir ./dpo-output \
    --beta 0.1

# Open an interactive chat session with the trained model
trl chat --model_name_or_path ./sft-output
SFTTrainer
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Load the base model, its tokenizer, and the training split.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
dataset = load_dataset("trl-lib/Capybara", split="train")

# Configure training
training_args = SFTConfig(
    output_dir="./sft-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=True,
    logging_steps=10,
    # NOTE(review): recent TRL releases name this option `max_length`;
    # confirm against the installed version.
    max_seq_length=2048,
    packing=True,  # Pack multiple samples into one sequence
)

trainer = SFTTrainer(
    model=model,
    # TRL >= 0.12 takes the tokenizer through `processing_class`; the old
    # `tokenizer=` keyword is deprecated and rejected by later releases.
    processing_class=tokenizer,
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
# NOTE(review): the final weights are written to "./sft-final", while the
# later snippets load "./sft-output" -- confirm which path is intended.
trainer.save_model("./sft-final")
DPOTrainer
from trl import DPOTrainer, DPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# Start from the supervised fine-tuned checkpoint.
model = AutoModelForCausalLM.from_pretrained("./sft-output")
tokenizer = AutoTokenizer.from_pretrained("./sft-output")

# Dataset needs: prompt, chosen, rejected columns
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(
    output_dir="./dpo-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-7,
    beta=0.1,  # KL penalty coefficient
    bf16=True,
    logging_steps=10,
    max_length=1024,
    max_prompt_length=512,
)

trainer = DPOTrainer(
    model=model,
    # TRL >= 0.12 takes the tokenizer through `processing_class`; the old
    # `tokenizer=` keyword is deprecated and rejected by later releases.
    processing_class=tokenizer,
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
PPOTrainer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer
# NOTE(review): this snippet follows the step()-based PPOTrainer API; newer
# TRL releases changed PPOTrainer's interface -- confirm against the
# installed version before using it.

# PPO requires a value head on the model
model = AutoModelForCausalLMWithValueHead.from_pretrained("./sft-output")
tokenizer = AutoTokenizer.from_pretrained("./sft-output")
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

ppo_config = PPOConfig(
    learning_rate=1e-5,
    batch_size=16,
    mini_batch_size=4,
    ppo_epochs=4,
    log_with="wandb",
)

trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
)

# PPO training loop
# `dataloader` and `reward_model` are placeholders that this snippet does not
# define; supply your own batches of queries and your own scoring function.
for batch in dataloader:
    # NOTE(review): no padding is requested here, so this presumably expects
    # the queries in a batch to tokenize to equal lengths -- verify.
    query_tensors = tokenizer(batch["query"], return_tensors="pt").input_ids
    response_tensors = trainer.generate(query_tensors, max_new_tokens=128)
    rewards = reward_model(query_tensors, response_tensors)  # Your reward model
    stats = trainer.step(query_tensors, response_tensors, rewards)
GRPOTrainer
from trl import GRPOTrainer, GRPOConfig
# Group Relative Policy Optimization: rewards come from a plain function,
# so no separate reward model is needed.
training_args = GRPOConfig(
    output_dir="./grpo-output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=1e-6,
    num_generations=4,  # Number of completions per prompt
    bf16=True,
)
def reward_fn(completions, prompts=None, **kwargs):
    """Score each completion by its length (toy example: longer is better).

    Args:
        completions: The generated completions to score, one per sample.
            Assumes plain-string completions; for conversational data each
            completion would be a list of messages and ``len`` would count
            messages instead of characters -- TODO confirm for your dataset.
        prompts: The prompts that produced the completions. Unused here.
        **kwargs: Any additional keyword arguments the trainer passes to
            reward functions. They are accepted and ignored so the call does
            not fail with ``TypeError``.

    Returns:
        A list of float rewards, one per completion, in the same order.
    """
    # Example: reward longer responses.
    return [len(completion) / 100 for completion in completions]
# Build the GRPO trainer around the SFT checkpoint and the custom reward
# function, then run training.
trainer = GRPOTrainer(
    model="./sft-output",
    reward_funcs=reward_fn,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
ORPOTrainer
from trl import ORPOTrainer, ORPOConfig
# Odds Ratio Preference Optimization - no reference model needed
training_args = ORPOConfig(
    output_dir="./orpo-output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    learning_rate=5e-6,
    beta=0.1,
    bf16=True,
)

# `model`, `tokenizer` and `dataset` are not defined in this snippet; they are
# expected to exist already (for example from the earlier examples).
trainer = ORPOTrainer(
    model=model,
    # TRL >= 0.12 takes the tokenizer through `processing_class`; the old
    # `tokenizer=` keyword is deprecated and rejected by later releases.
    processing_class=tokenizer,
    train_dataset=dataset,  # Needs prompt, chosen, rejected
    args=training_args,
)
trainer.train()
RewardTrainer
from trl import RewardTrainer, RewardConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Train a reward model for RLHF
# num_labels=1 gives the classifier a single output, used as the scalar reward.
model = AutoModelForSequenceClassification.from_pretrained(
    "./sft-output", num_labels=1
)
tokenizer = AutoTokenizer.from_pretrained("./sft-output")

training_args = RewardConfig(
    output_dir="./reward-model",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-5,
    max_length=512,
)

# Dataset needs: chosen and rejected columns
# (`dataset` is not defined in this snippet; load a preference dataset first.)
trainer = RewardTrainer(
    model=model,
    # TRL >= 0.12 takes the tokenizer through `processing_class`; the old
    # `tokenizer=` keyword is deprecated and rejected by later releases.
    processing_class=tokenizer,
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
التكامل مع PEFT
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# LoRA adapter settings; the same `peft_config` can be handed to any TRL trainer.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

# Passing `peft_config` makes the trainer fine-tune with the LoRA settings above.
trainer = SFTTrainer(
    model="meta-llama/Llama-3.1-8B",
    train_dataset=dataset,
    args=SFTConfig(output_dir="./lora-sft", bf16=True),
    peft_config=peft_config,
)
trainer.train()
Dataset Formatting
from datasets import Dataset
# Conversational format (preferred): each row carries a "messages" list of
# role/content dictionaries.
conversations = [
    {
        "messages": [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi! How can I help?"},
        ],
    },
]
dataset = Dataset.from_list(conversations)

# Preference format for DPO/ORPO: a prompt plus a chosen and a rejected answer.
# Note that `dataset` is rebound here, replacing the conversational one above.
preferences = [
    {
        "prompt": "Explain gravity.",
        "chosen": "Gravity is a fundamental force...",
        "rejected": "Gravity is magic...",
    },
]
dataset = Dataset.from_list(preferences)
Model Merging
# NOTE(review): `ModelMerger` could not be confirmed as a public TRL export --
# verify this import against the installed version before relying on the
# snippet. TRL's documented merging support appears to go through mergekit
# (`MergeModelCallback` with `trl.mergekit_utils.MergeConfig`) -- TODO confirm.
from trl import ModelMerger
# Merge multiple fine-tuned models
# The two checkpoints below are combined with equal weights (0.5 each).
merger = ModelMerger(
base_model="meta-llama/Llama-3.1-8B",
models=["./sft-output", "./dpo-output"],
merge_method="linear", # linear, slerp, ties, dare
weights=[0.5, 0.5],
)
# Run the merge and write the result to disk.
merged_model = merger.merge()
merged_model.save_pretrained("./merged-model")
Training Methods Reference
| Trainer | Use Case | Required Data |
|---|---|---|
| SFTTrainer | Supervised fine-tuning | instruction/response pairs |
| DPOTrainer | Direct preference optimization | prompt + chosen + rejected |
| PPOTrainer | RLHF with reward model | prompts + reward model |
| GRPOTrainer | Group relative policy optimization | prompts + reward function |
| ORPOTrainer | Odds ratio preference optimization | prompt + chosen + rejected |
| RewardTrainer | Train reward models | chosen + rejected pairs |
| KTOTrainer | Kahneman-Tversky optimization | prompt + completion + label |
| CPOTrainer | Contrastive preference optimization | prompt + chosen + rejected |