PEFT (Parameter-Efficient Fine-Tuning) è una libreria HuggingFace che permette di adattare grandi modelli pre-addestrati allenando solo un piccolo numero di parametri aggiuntivi, riducendo drasticamente l’utilizzo di memoria e il tempo di addestramento mantenendo le prestazioni.
Installazione
# Install the latest released version from PyPI
pip install peft
# Install together with bitsandbytes (needed for the 4-bit quantized loading used by QLoRA)
pip install peft bitsandbytes
# Install the current development version straight from the GitHub repository
pip install git+https://github.com/huggingface/peft.git
# Verify the installation by printing the installed PEFT version
python -c "import peft; print(peft.__version__)"
LoRA (Low-Rank Adaptation)
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

# Load the base model that the LoRA adapters will be attached to.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Configure LoRA
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor (the update is scaled by lora_alpha / r)
    lora_dropout=0.05,  # Dropout for LoRA layers
    target_modules=[  # Modules to apply LoRA: attention and MLP projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type=TaskType.CAUSAL_LM,
    bias="none",  # none, all, or lora_only
)

# Wrap model with LoRA adapters; only the adapter weights remain trainable
model = get_peft_model(model, lora_config)

# Print trainable parameter count
model.print_trainable_parameters()
# Example output for this configuration (r=16 on all seven projections):
# trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
QLoRA (Quantized LoRA)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
import torch

# Load model in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Dtype used for matmuls at runtime
    bnb_4bit_use_double_quant=True,  # Also quantize the quantization constants
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare quantized model for training
model = prepare_model_for_kbit_training(model)

# Apply LoRA on top
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules="all-linear",  # Apply to all linear layers
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,  # TaskType must be imported from peft (see above)
)
model = get_peft_model(model, lora_config)
AdaLoRA (Adaptive LoRA)
from peft import AdaLoraConfig, TaskType, get_peft_model

# AdaLoRA dynamically adjusts rank per layer
config = AdaLoraConfig(
    init_r=12,  # Initial rank for all layers
    target_r=4,  # Target average rank after pruning
    beta1=0.85,  # EMA coefficient for sensitivity smoothing
    beta2=0.85,  # EMA coefficient for uncertainty quantification
    tinit=200,  # Warm-up steps before rank pruning starts
    tfinal=1000,  # Final fine-tuning steps after pruning has stopped
    deltaT=10,  # Steps between two rank-budget re-allocations
    # Total number of training steps. Recent PEFT versions require it and it
    # must be larger than tinit + tfinal; set it to your real step count.
    total_step=3000,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, config)
# NOTE: in a custom training loop the rank budget is only updated when
# model.base_model.update_and_allocate(global_step) is called after each
# optimizer step.
IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations)
from peft import IA3Config, TaskType, get_peft_model

# IA3 learns rescaling vectors (even fewer params than LoRA)
config = IA3Config(
    target_modules=["k_proj", "v_proj", "down_proj"],
    # Must be a subset of target_modules; these are treated as feed-forward
    # layers rather than attention projections.
    feedforward_modules=["down_proj"],
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
# Typically < 0.01% trainable parameters
Prefix Tuning
from peft import PrefixTuningConfig, TaskType, get_peft_model

# Prefix tuning learns virtual-token prefixes instead of changing model weights
config = PrefixTuningConfig(
    num_virtual_tokens=20,  # Length of the learned prefix
    task_type=TaskType.CAUSAL_LM,
    prefix_projection=True,  # Use MLP to project prefix
    encoder_hidden_size=1024,  # Hidden size of that projection MLP
)
model = get_peft_model(model, config)
Prompt Tuning
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

# Prompt tuning learns a small set of virtual token embeddings
config = PromptTuningConfig(
    num_virtual_tokens=20,  # Number of learned prompt embeddings
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,  # Initialize from real text
    prompt_tuning_init_text="Classify the following text:",
    # Tokenizer used to tokenize the initialization text
    tokenizer_name_or_path="meta-llama/Llama-3.1-8B",
)
model = get_peft_model(model, config)
Salvare e caricare adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Save only the adapter weights (small file); `model` is the PEFT-wrapped model
model.save_pretrained("./my-lora-adapter")

# Load adapter onto base model
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
model = PeftModel.from_pretrained(base_model, "./my-lora-adapter")

# Push adapter to HuggingFace Hub
model.push_to_hub("my-org/llama3-lora-adapter")

# Load from Hub (alternative to the local path above)
# NOTE: PeftModel.from_pretrained injects adapter layers into the model it is
# given, so in real code pass a freshly loaded base model here.
model = PeftModel.from_pretrained(base_model, "my-org/llama3-lora-adapter")
Unione adapter
from peft import PeftModel

# `base_model` is the plain base model loaded with
# AutoModelForCausalLM.from_pretrained(...)
model = PeftModel.from_pretrained(base_model, "./my-lora-adapter")

# Reversible merge: fold the LoRA weights into the base weights in place while
# keeping the PEFT wrapper, so the merge can be undone. These calls must come
# before merge_and_unload(), which removes the adapter layers.
model.merge_adapter()  # Merge into base weights
model.unmerge_adapter()  # Unmerge (revert to adapter mode)

# Merge LoRA weights into base model permanently
merged_model = model.merge_and_unload()

# Save the merged model (full size, no adapter dependency)
merged_model.save_pretrained("./merged-model")
Inferenza multi-adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Load first adapter
model = PeftModel.from_pretrained(base_model, "./adapter-coding", adapter_name="coding")

# Load additional adapters
model.load_adapter("./adapter-writing", adapter_name="writing")
model.load_adapter("./adapter-math", adapter_name="math")

# Switch between adapters at inference time
# NOTE: `inputs` is assumed to be a tokenized batch prepared by the caller,
# e.g. tokenizer(prompt, return_tensors="pt").
model.set_adapter("coding")
output_code = model.generate(**inputs)
model.set_adapter("writing")
output_text = model.generate(**inputs)

# Combine adapters with weighted merge
model.add_weighted_adapter(
    adapters=["coding", "writing"],
    weights=[0.7, 0.3],  # One weight per adapter, in the same order
    adapter_name="combined",  # Name under which the new adapter is registered
)
model.set_adapter("combined")
Training con HuggingFace Trainer
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written and how often
    output_dir="./output",
    save_steps=200,
    logging_steps=10,
    # Optimisation schedule
    learning_rate=2e-4,
    num_train_epochs=3,
    # 4 samples per device x 4 accumulation steps = 16 samples per optimizer step
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Numerics and data handling
    bf16=True,
    remove_unused_columns=False,
)

# `dataset` and `data_collator` are supplied by the caller.
trainer = Trainer(
    args=training_args,
    model=model,  # PEFT-wrapped model
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
Confronto metodi PEFT
| Method | Trainable Params | Memory | Speed | Quality |
|---|---|---|---|---|
| LoRA | ~0.1-1% | Low | Fast | High |
| QLoRA | ~0.1-1% | Very Low | Medium | High |
| AdaLoRA | ~0.1-1% | Low | Medium | High |
| IA3 | ~0.01% | Very Low | Fast | Good |
| Prefix Tuning | ~0.1% | Low | Fast | Good |
| Prompt Tuning | ~0.01% | Very Low | Fast | Moderate |
| Full Fine-Tune | 100% | Very High | Slow | Highest |
Opzioni di configurazione comuni
| Parameter | Description | Typical Value |
|---|---|---|
| `r` | LoRA rank | 8, 16, 32, 64 |
| `lora_alpha` | Scaling factor (usually 2x rank) | 16, 32, 64 |
| `lora_dropout` | Dropout probability | 0.0 - 0.1 |
| `target_modules` | Which layers to adapt | "all-linear" or list |
| `bias` | Train bias params | "none", "all", "lora_only" |
| `task_type` | Task type enum | CAUSAL_LM, SEQ_CLS, SEQ_2_SEQ_LM |