Unsloth Cheat Sheet
Overview
Unsloth dramatically accelerates LLM fine-tuning using hand-written Triton GPU kernels, replacing slow HuggingFace attention and linear layers with optimized implementations. Key benefits: 2x faster training, 80% less VRAM, and no accuracy loss compared to standard PEFT/QLoRA workflows.
Unsloth integrates with HuggingFace Transformers/PEFT/TRL — you replace a few import lines and the rest of your training code stays the same. It supports 4-bit quantization (QLoRA), LoRA fine-tuning, supervised fine-tuning (SFT) via TRL, and GGUF export for Ollama/llama.cpp deployment.
Minimum requirements: NVIDIA GPU (Ampere+ recommended), CUDA 11.8+, Python 3.10+.
Installation
# Option A (recommended): install from source for your CUDA version.
# The example below targets CUDA 12.1.
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps stops pip from pulling in (and possibly replacing) transitive
# dependencies such as the already-installed torch build.
pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Option B: stable release from PyPI
pip install unsloth

# Option C: Conda (includes CUDA dependencies)
conda create --name unsloth_env python=3.11 pytorch-cuda=12.1 \
    pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
conda activate unsloth_env
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Keep the same TRL pin as Option A: the SFTTrainer examples in this sheet pass
# tokenizer=, dataset_text_field= and max_seq_length= directly to SFTTrainer,
# which only the older TRL API accepts.
pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

# Verify installation
python -c "import unsloth; print(unsloth.__version__)"
Configuration
from unsloth import FastLanguageModel
import torch
# Key parameters
MAX_SEQ_LENGTH = 2048  # can be longer with RoPE scaling
DTYPE = None  # None = auto-detect (float16 on older GPUs, bfloat16 on Ampere+)
LOAD_IN_4BIT = True  # 4-bit quantization (QLoRA) — reduces VRAM ~4x

# Supported models (auto-detected by HF model ID).
# Every entry must be a valid Hub ID of the form "<namespace>/<repo>" —
# exactly one slash — otherwise from_pretrained() cannot resolve it.
SUPPORTED_MODELS = [
    "unsloth/Meta-Llama-3.1-8B",
    "unsloth/Meta-Llama-3.1-8B-Instruct",
    "unsloth/Meta-Llama-3.1-70B",
    "unsloth/mistral-7b-v0.3",
    "unsloth/Qwen2.5-7B",
    "unsloth/gemma-2-9b",
    "unsloth/Phi-3-mini-4k-instruct",
    # Also works with standard HF IDs like "meta-llama/Meta-Llama-3.1-8B"
]
Core API Reference
| API | Description |
|---|---|
| `FastLanguageModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit)` | Load base model with Unsloth optimizations |
| `FastLanguageModel.get_peft_model(model, r, target_modules, lora_alpha, ...)` | Add LoRA adapters to model |
| `FastLanguageModel.for_inference(model)` | Switch model to optimized inference mode |
| `FastLanguageModel.for_training(model)` | Switch model back to training mode |
| `model.save_pretrained(path)` | Save LoRA adapter weights |
| `model.save_pretrained_merged(path, tokenizer, save_method)` | Save merged 16-bit or 4-bit model (use `model.save_pretrained_gguf(path, tokenizer, quantization_method)` for GGUF) |
| `model.push_to_hub(repo, token)` | Upload LoRA adapter to HuggingFace Hub |
| `model.push_to_hub_merged(repo, tokenizer, save_method, token)` | Upload merged model to Hub |
| `tokenizer.apply_chat_template(messages, tokenize, add_generation_prompt)` | Format chat messages |
| `get_chat_template(tokenizer, chat_template)` | Apply chat template to tokenizer |
Advanced Usage
Load Model and Add LoRA
from unsloth import FastLanguageModel
import torch
# Load the 4-bit base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length=2048,
    dtype=None,  # auto-detect
    load_in_4bit=True,  # QLoRA
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank — higher = more params, slower
    target_modules=[  # which layers to apply LoRA to
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0,  # 0 is optimized; non-zero is slower
    bias="none",  # "none" is optimized
    use_gradient_checkpointing="unsloth",  # saves 30% more VRAM
    random_state=42,
    use_rslora=False,  # rank-stabilized LoRA
    loftq_config=None,  # LoftQ initialization
)

# print_trainable_parameters() writes the summary to stdout itself and returns
# None, so it must not be wrapped in print() (that would emit a stray "None").
model.print_trainable_parameters()
# trainable params: 41,943,040 || all params: 8,072,192,000 || trainable%: 0.5197
Dataset Formatting
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
from datasets import load_dataset
# Apply chat template to tokenizer.
# `tokenizer` is rebound to whatever get_chat_template() returns, so every
# later apply_chat_template() call in this section uses the selected template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",  # "llama-3", "mistral", "chatml", "phi-3", "gemma", ...
)
# Format function for ShareGPT-style data
def formatting_prompts_func(examples):
    """Render every conversation in a batch into one training string.

    Args:
        examples: Batch mapping whose "conversations" entry holds one
            conversation (a list of chat messages) per example.

    Returns:
        A dict with a single "text" key: one rendered string per
        conversation, produced by the module-level ``tokenizer``'s chat
        template without tokenizing and without a generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}
# Load and format dataset
dataset = load_dataset("mlabonne/FineTome-100k", split="train")
# NOTE(review): presumably normalizes ShareGPT-style records into the
# role/content message format the chat template expects — confirm in Unsloth docs.
dataset = standardize_sharegpt(dataset)
# batched=True: formatting_prompts_func is called with a batch of examples and
# returns the "text" values for that whole batch.
dataset = dataset.map(formatting_prompts_func, batched=True)
# Alpaca-style formatting (instruction + input + output)
# The three "{}" slots are filled positionally: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task.
### Instruction:
{}
### Input:
{}
### Response:
{}"""


def format_alpaca(examples):
    """Fill the Alpaca template for every row of a batch.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a "text" key holding one filled-in prompt per row, each
        terminated with the module-level ``tokenizer``'s EOS token.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + tokenizer.eos_token
            for instruction, context, response in rows
        ]
    }
Training with TRL SFTTrainer
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
# Pick the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimizer / schedule / logging settings handed to the trainer.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,  # set epochs or max_steps
    # max_steps=60,  # for quick tests
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",  # 8-bit optimizer saves VRAM
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir="outputs",
    report_to="none",  # or "wandb"
)

# Supervised fine-tuning over the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,  # packing=True is faster for short sequences
    args=training_args,
)
def _to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimal places."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Show current memory usage before training
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _to_gib(torch.cuda.max_memory_reserved())
max_memory = _to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()

# Show memory and training time stats
used_memory = _to_gib(torch.cuda.max_memory_reserved())
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Training time: {round(trainer_stats.metrics['train_runtime']/60, 2)} minutes")
Inference
from unsloth import FastLanguageModel
# Switch to inference mode (2x faster)
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Explain quantum entanglement simply."},
]
# Render the conversation to text and append the assistant header so the model
# continues with its own reply.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# The chat template already inserts the special tokens it needs (e.g. BOS), so
# turn off add_special_tokens here to avoid duplicating them.
inputs = tokenizer(
    [input_text],
    return_tensors="pt",
    add_special_tokens=False,
).to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,  # required: temperature/top_p are ignored under greedy decoding
    temperature=0.7,
    top_p=0.9,
    use_cache=True,
)
# Slice off the prompt tokens so only the newly generated text is decoded, and
# drop special tokens (such as the end-of-turn marker) from the printed output.
decoded = tokenizer.batch_decode(
    outputs[:, inputs.input_ids.shape[1]:],
    skip_special_tokens=True,
)
print(decoded[0])
Saving and Exporting Models
# --- Local saves -----------------------------------------------------------
# Save LoRA adapter only (small, ~100MB)
# The tokenizer is written to the same directory as the adapter.
model.save_pretrained("my-lora-adapter")
tokenizer.save_pretrained("my-lora-adapter")
# Save merged 16-bit model (large, requires 2x VRAM temporarily)
model.save_pretrained_merged(
    "my-merged-model",
    tokenizer,
    save_method="merged_16bit",
)
# Save merged 4-bit quantized model (smallest)
model.save_pretrained_merged(
    "my-merged-4bit",
    tokenizer,
    save_method="merged_4bit_forced",
)
# Export to GGUF for Ollama/llama.cpp
model.save_pretrained_gguf(
    "my-gguf-model",
    tokenizer,
    quantization_method="q4_k_m",  # q4_k_m | q8_0 | f16 | q5_k_m
)
# --- Uploads ---------------------------------------------------------------
# Push to HuggingFace Hub
# NOTE(review): "hf_..." is a placeholder; supply a real access token, ideally
# read from the environment rather than hard-coded in the script.
model.push_to_hub("username/my-lora-adapter", token="hf_...")
model.push_to_hub_merged(
    "username/my-merged-model",
    tokenizer,
    save_method="merged_16bit",
    token="hf_...",
)
model.push_to_hub_gguf(
    "username/my-gguf-model",
    tokenizer,
    quantization_method="q4_k_m",
    token="hf_...",
)
Common Workflows
Workflow 1: Fine-Tune on Custom Instruction Dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import pandas as pd
# 1. Load model
# Attention and MLP projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Qwen2.5-7B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)
# 2. Prepare data
tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")
data = pd.read_csv("my_training_data.csv")  # columns: instruction, output
dataset = Dataset.from_pandas(data)


def format_fn(examples):
    """Turn a batch of instruction/output rows into chat-formatted text.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            sequences.

    Returns:
        A dict with a "text" key: one string per row, where the instruction
        is the user turn and the output is the assistant turn, rendered with
        the module-level ``tokenizer``'s chat template.
    """
    texts = []
    for inst, out in zip(examples["instruction"], examples["output"]):
        conversation = [
            {"role": "user", "content": inst},
            {"role": "assistant", "content": out},
        ]
        texts.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": texts}


dataset = dataset.map(format_fn, batched=True)
# 3. Train
# Imported here so this step is self-contained within the workflow snippet.
from unsloth import is_bfloat16_supported

# Choose mixed precision from the hardware instead of hard-coding bf16=True,
# which fails on GPUs without bfloat16 support (pre-Ampere): use bf16 when
# available and fall back to fp16 otherwise.
use_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        output_dir="outputs",
        optim="adamw_8bit",
    ),
)
trainer.train()
# 4. Export GGUF for local use with Ollama
# Writes the export to "my-model-gguf" using the q4_k_m quantization method.
model.save_pretrained_gguf("my-model-gguf", tokenizer, quantization_method="q4_k_m")
Workflow 2: Continued Pre-Training
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
# Projection layers that receive LoRA adapters for continued pre-training.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Base (non-instruct) model loaded in 4-bit with an 8192-token context.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B",
    max_seq_length=8192,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=128,
    lora_alpha=32,
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
)

# Raw-text corpus; the trainer reads its "text" column.
dataset = load_dataset("your-domain/text-corpus", split="train")
# Imported here so this step is self-contained within the workflow snippet.
from unsloth import is_bfloat16_supported

# Choose mixed precision from the hardware instead of hard-coding bf16=True,
# which fails on GPUs without bfloat16 support (pre-Ampere): use bf16 when
# available and fall back to fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# packing=True concatenates examples to fill each 8192-token sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=8192,
    packing=True,
    args=TrainingArguments(
        max_steps=1000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=5e-5,
        fp16=not use_bf16,
        bf16=use_bf16,
        output_dir="cpt-outputs",
        optim="adamw_8bit",
    ),
)
trainer.train()
Tips and Best Practices
- `use_gradient_checkpointing="unsloth"` saves an extra 30% VRAM beyond standard gradient checkpointing with no speed cost.
- LoRA rank `r=16` is a good default. Use `r=32` or `r=64` for harder tasks; `r=8` if VRAM is critically constrained.
- `lora_dropout=0` is fastest with Unsloth’s kernels; non-zero dropout disables some optimizations.
- `optim="adamw_8bit"` from bitsandbytes reduces optimizer state VRAM by ~50% with minimal quality loss.
- Packing (`packing=True` in SFTTrainer) dramatically increases throughput for short sequences by concatenating examples.
- Start with 1 epoch and evaluate before running more — overfitting on instruction data is common.
- GGUF `q4_k_m` is the recommended quantization for Ollama deployment — good quality/size balance.
- Check `model.print_trainable_parameters()` after adding LoRA to verify the expected parameter count.
- Monitor GPU memory with `nvidia-smi dmon` or `nvitop` during training; OOM kills are silent on some systems.
- Unsloth Pro (paid) adds multi-GPU support, 4x speed on H100s, and long-context training up to 1M tokens.