Comandos do Transformers
HuggingFace Transformers fornece milhares de modelos pré-treinados para NLP, visão computacional, áudio e tarefas multimodais. Oferece uma API unificada para carregamento de modelos, tokenização, treinamento e inferência em PyTorch, TensorFlow e JAX.
Instalação
# Install with the PyTorch backend
# (the extras spec is quoted so shells such as zsh do not treat the brackets as a glob pattern)
pip install "transformers[torch]"
# Install with additional optional extras
pip install "transformers[torch,sentencepiece,tokenizers,vision,audio]"
# Install from source
pip install git+https://github.com/huggingface/transformers.git
# Install a specific version
pip install transformers==4.46.0
# Verify the installation by printing the installed version
python -c "import transformers; print(transformers.__version__)"
Pipeline API (Quick Inference)
from transformers import pipeline
# Text generation
generator = pipeline("text-generation", model="meta-llama/Llama-3.1-8B-Instruct")
result = generator("Explain quantum computing:", max_new_tokens=200)
# Sentiment analysis (checkpoint pinned explicitly instead of relying on the task default)
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)
result = classifier("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.9998}]
# Summarization
# Placeholder input so the snippet runs on its own; replace with the real document.
long_text = (
    "Replace this placeholder with the full text of the document "
    "you want to summarize."
)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(long_text, max_length=130, min_length=30)
# Question answering (checkpoint pinned explicitly)
qa = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)
result = qa(question="What is BERT?", context="BERT is a transformer model...")
# Zero-shot classification (checkpoint pinned explicitly)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier("I need to pay my bill", candidate_labels=["billing", "support", "sales"])
# Image classification
classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
result = classifier("photo.jpg")
# Automatic speech recognition
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
result = asr("audio.mp3")
AutoModel and AutoTokenizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint shared by the tokenizer and the model
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights in bfloat16 with device_map="auto"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
# Encode the prompt as PyTorch tensors, then move them to the model's device
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
inputs = inputs.to(model.device)
# Sample up to 100 new tokens
outputs = model.generate(**inputs, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=100)
# Turn the first returned sequence back into text, dropping special tokens
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
Chat Template
import torch  # needed for torch.bfloat16 below
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Conversation as a list of {"role", "content"} messages
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Explain transformers in ML."},
]
# Apply chat template and move the resulting token ids to the model's device
input_ids = tokenizer.apply_chat_template(
    messages, return_tensors="pt", add_generation_prompt=True
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=256)
# Decode only the tokens that come after the prompt
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
Quantization with BitsAndBytes
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# 4-bit quantization (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
)
# 8-bit quantization
# The flag goes through BitsAndBytesConfig: passing `load_in_8bit=True`
# directly to from_pretrained is deprecated.
model_8bit = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
Trainer API
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
)
from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
# Use the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset("tatsu-lab/alpaca", split="train")
def tokenize(example):
    """Tokenize the "text" field of a batch, truncating to 2048 tokens."""
    return tokenizer(example["text"], truncation=True, max_length=2048)
# Tokenize in batches and drop the original columns so only tokenizer outputs remain
tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
training_args = TrainingArguments(
    output_dir="./output",
    # 2 sequences per device x 8 accumulation steps = 16 sequences per device per optimizer step
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    warmup_steps=100,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    report_to="wandb",
    gradient_checkpointing=True,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    # mlm=False selects causal (not masked) language modeling
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
trainer.save_model("./final-model")
Push to Hub
from huggingface_hub import login
from transformers import TrainingArguments  # used by the Trainer variant below
# Authenticate
# "hf_YOUR_TOKEN" is a placeholder: do not commit a real token to source control;
# running `huggingface-cli login` once is an alternative to embedding it in code.
login(token="hf_YOUR_TOKEN")
# Push model and tokenizer
# NOTE(review): assumes `model` and `tokenizer` were created earlier — confirm
model.push_to_hub("my-org/my-model")
tokenizer.push_to_hub("my-org/my-model")
# Push with Trainer
training_args = TrainingArguments(
    output_dir="./output",
    push_to_hub=True,
    hub_model_id="my-org/my-model",
)
Model Parallelism
import torch
from transformers import AutoModelForCausalLM
# Automatic device placement across GPUs
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-70B-Instruct",
    device_map="auto",  # Automatic placement
    torch_dtype=torch.float16,
)
# Custom device map: module name -> GPU index
# NOTE(review): only layers 0-3 are listed here; confirm the map covers every
# module of the checkpoint being loaded before using it as-is.
device_map = {
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 0,
    "model.layers.2": 1,
    "model.layers.3": 1,
    "model.norm": 1,
    "lm_head": 1,
}
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map=device_map,
)
Generation Config
from transformers import GenerationConfig
# Collect the decoding settings first, then build the config from them.
# NOTE(review): assumes `tokenizer`, `model` and `inputs` were created earlier — confirm
generation_settings = {
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
    "do_sample": True,
    "num_beams": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
gen_config = GenerationConfig(**generation_settings)
# Generate with the reusable config instead of per-call keyword arguments
outputs = model.generate(**inputs, generation_config=gen_config)
Common Auto Classes
| Class | Use Case |
|---|---|
| AutoModel | Base model (no head) |
| AutoModelForCausalLM | Text generation (GPT, Llama) |
| AutoModelForSeq2SeqLM | Seq2seq (T5, BART) |
| AutoModelForSequenceClassification | Text classification |
| AutoModelForTokenClassification | NER, POS tagging |
| AutoModelForQuestionAnswering | Extractive QA |
| AutoModelForImageClassification | Image classification |
| AutoModelForSpeechSeq2Seq | Speech-to-text |
| AutoTokenizer | Auto-detect tokenizer |
| AutoProcessor | Multi-modal processing |
Comandos da CLI
# Download a model repository by its repo id
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct
# Upload the local ./model-dir directory to the my-org/my-model repository
huggingface-cli upload my-org/my-model ./model-dir
# Authenticate with the Hub
huggingface-cli login
# Inspect the local cache
huggingface-cli scan-cache
# Remove cached models from the local cache
huggingface-cli delete-cache