Transformers-Befehle

HuggingFace Transformers bietet Tausende von vortrainierten Modellen für NLP, Computer Vision, Audio und multimodale Aufgaben. Es stellt eine einheitliche API für das Laden von Modellen, Tokenisierung, Training und Inferenz über PyTorch, TensorFlow und JAX bereit.

Installation

# Install with PyTorch backend
pip install transformers[torch]

# Install with all optional dependencies
pip install transformers[torch,sentencepiece,tokenizers,vision,audio]

# Install from source
pip install git+https://github.com/huggingface/transformers.git

# Install specific version
pip install transformers==4.46.0

# Verify
python -c "import transformers; print(transformers.__version__)"

Pipeline-API (Schnelle Inferenz)

from transformers import pipeline

# Text generation
generator = pipeline("text-generation", model="meta-llama/Llama-3.1-8B-Instruct")
result = generator("Explain quantum computing:", max_new_tokens=200)

# Sentiment analysis
classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(long_text, max_length=130, min_length=30)

# Question answering
qa = pipeline("question-answering")
result = qa(question="What is BERT?", context="BERT is a transformer model...")

# Zero-shot classification
classifier = pipeline("zero-shot-classification")
result = classifier("I need to pay my bill", candidate_labels=["billing", "support", "sales"])

# Image classification
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
result = classifier("photo.jpg")

# Automatic speech recognition
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
result = asr("audio.mp3")

AutoModel and AutoTokenizer

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Tokenize input
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# Decode
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Chat-Vorlage

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain transformers in ML."},
]

# Apply chat template
input_ids = tokenizer.apply_chat_template(
    messages, return_tensors="pt", add_generation_prompt=True
).to(model.device)

outputs = model.generate(input_ids, max_new_tokens=256)
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

Quantisierung mit BitsAndBytes

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 4-bit quantization (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)

# 8-bit quantization
model_8bit = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    load_in_8bit=True,
    device_map="auto",
)

Trainer-API

from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
)
from datasets import load_dataset

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tatsu-lab/alpaca", split="train")

def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=2048)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    warmup_steps=100,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    report_to="wandb",
    gradient_checkpointing=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
trainer.save_model("./final-model")

Zum Hub hochladen

from huggingface_hub import login

# Authenticate
login(token="hf_YOUR_TOKEN")

# Push model and tokenizer
model.push_to_hub("my-org/my-model")
tokenizer.push_to_hub("my-org/my-model")

# Push with Trainer
training_args = TrainingArguments(
    output_dir="./output",
    push_to_hub=True,
    hub_model_id="my-org/my-model",
)

Modell-Parallelismus

# Automatic device placement across GPUs
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    device_map="auto",        # Automatic placement
    torch_dtype=torch.float16,
)

# Custom device map
device_map = {
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 0,
    "model.layers.2": 1,
    "model.layers.3": 1,
    "model.norm": 1,
    "lm_head": 1,
}

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map=device_map,
)

Generierungskonfiguration

from transformers import GenerationConfig

gen_config = GenerationConfig(
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    do_sample=True,
    num_beams=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

outputs = model.generate(**inputs, generation_config=gen_config)

Häufige Auto-Klassen

Class	Use Case
`AutoModel`	Base model (no head)
`AutoModelForCausalLM`	Text generation (GPT, Llama)
`AutoModelForSeq2SeqLM`	Seq2seq (T5, BART)
`AutoModelForSequenceClassification`	Text classification
`AutoModelForTokenClassification`	NER, POS tagging
`AutoModelForQuestionAnswering`	Extractive QA
`AutoModelForImageClassification`	Image classification
`AutoModelForSpeechSeq2Seq`	Speech-to-text
`AutoTokenizer`	Auto-detect tokenizer
`AutoProcessor`	Multi-modal processing

CLI-Befehle

# Download model
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct

# Upload model
huggingface-cli upload my-org/my-model ./model-dir

# Login
huggingface-cli login

# Check cache
huggingface-cli scan-cache

# Delete cached models
huggingface-cli delete-cache