Overview
CLIP (Contrastive Language-Image Pre-training) is a neural network model developed by OpenAI that learns visual concepts from natural language supervision. Trained on 400 million image-text pairs from the internet, CLIP can perform zero-shot image classification, compute similarity between images and text, and serve as a powerful feature extractor for downstream tasks without task-specific training data.
CLIP encodes images and text into a shared embedding space where semantically related content clusters together. This enables applications like image search using natural language queries, content moderation, image captioning, and multi-modal RAG systems. CLIP’s zero-shot capabilities often match or exceed supervised models trained on specific datasets.
Installation
# Install whichever of the three packages below you plan to use.
# OpenAI CLIP (installed directly from the GitHub repository)
pip install git+https://github.com/openai/CLIP.git
pip install torch torchvision
# OpenCLIP (community, more models)
pip install open_clip_torch
# HuggingFace Transformers (easiest)
pip install transformers torch torchvision pillow
Core Usage
OpenAI CLIP
import clip
import torch
from PIL import Image
# Load the model on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Candidate labels, kept in one place so tokenization and printing agree
labels = ["a dog", "a cat", "a car"]

# Preprocess the image (adds a batch dimension) and tokenize the labels
image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(labels).to(device)

# Inference only: no_grad skips autograd bookkeeping
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # L2-normalize so the dot product below is a cosine similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Cosine similarities are confined to [-1, 1]; without the 100x scale the
    # softmax output is almost uniform across labels.
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probabilities:")
for label, prob in zip(labels, similarity[0]):
    print(f" {label}: {prob.item():.4f}")
# HuggingFace Transformers
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")
# A single processor call prepares both the text prompts and the image
inputs = processor(
    text=["a photo of a dog", "a photo of a cat", "a photo of a car"],
    images=image,
    return_tensors="pt",
    padding=True,
)

# Inference only: no_grad avoids building an autograd graph
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over dim=1 turns the per-image logits into probabilities
# across the three text prompts
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(f"Probabilities: {probs}")
OpenCLIP
import open_clip
import torch
from PIL import Image
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
# OpenCLIP returns the model in train mode; switch to eval for inference
# (matters for models with BatchNorm or stochastic depth).
model.eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Preprocess the image (adds a batch dimension) and tokenize the prompts
image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
text = tokenizer(["a dog", "a cat", "a bird"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # L2-normalize so the matrix product is a cosine similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (image_features @ text_features.T)

print(f"Similarity: {similarity}")
Available Models
OpenAI CLIP Models
| Model | Params | Image Size | Embedding Dim |
|---|---|---|---|
| RN50 | 102M | 224x224 | 1024 |
| RN101 | 120M | 224x224 | 512 |
| ViT-B/32 | 151M | 224x224 | 512 |
| ViT-B/16 | 150M | 224x224 | 512 |
| ViT-L/14 | 428M | 224x224 | 768 |
| ViT-L/14@336px | 428M | 336x336 | 768 |
OpenCLIP Models
| Model | Training Data | Performance |
|---|---|---|
| ViT-B-32 (LAION-2B) | 2B image-text pairs | Good baseline |
| ViT-L-14 (LAION-2B) | 2B image-text pairs | Better accuracy |
| ViT-H-14 (LAION-2B) | 2B image-text pairs | Best accuracy |
| ViT-G-14 (LAION-2B) | 2B image-text pairs | Highest accuracy |
Common Applications
Zero-Shot Image Classification
import clip
import torch
from PIL import Image
model, preprocess = clip.load("ViT-L/14", device="cuda")
image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to("cuda")

# Define categories
categories = [
    "a photo of a landscape",
    "a photo of a person",
    "a photo of food",
    "a photo of an animal",
    "a screenshot of a website",
    "a photo of a building",
]
text = clip.tokenize(categories).to("cuda")

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Normalize first: the 100x temperature below is meant for cosine
    # similarities, not for raw (unnormalized) dot products.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# similarity[0] holds one probability per category for the single image
for cat, score in zip(categories, similarity[0]):
    print(f"{cat}: {score.item():.2%}")
Image Search Engine
import clip
import torch
from PIL import Image
import numpy as np
import os
model, preprocess = clip.load("ViT-B/32", device="cuda")

# Index images: build one L2-normalized embedding row per image file
image_dir = "./photos/"
embeddings = []
filenames = []
# Sorted so the index order is reproducible between runs
for fname in sorted(os.listdir(image_dir)):
    # Case-insensitive match so files such as "IMG_001.JPG" are indexed too
    if not fname.lower().endswith((".jpg", ".png")):
        continue
    # os.path.join avoids the doubled slash of f"{image_dir}/{fname}";
    # the context manager closes the file handle once the image is preprocessed
    with Image.open(os.path.join(image_dir, fname)) as pil_image:
        img = preprocess(pil_image).unsqueeze(0).to("cuda")
    with torch.no_grad():
        feat = model.encode_image(img)
        feat /= feat.norm(dim=-1, keepdim=True)
    embeddings.append(feat.cpu().numpy())
    filenames.append(fname)

# np.vstack fails with an unhelpful message on an empty list; report the cause
if not embeddings:
    raise ValueError(f"No .jpg or .png images found in {image_dir}")
embeddings = np.vstack(embeddings)
# Search with text query
def search(query, top_k=5):
    """Return the indexed images that best match a natural-language query.

    Relies on the module-level ``model``, ``embeddings`` (one L2-normalized
    row per image) and ``filenames`` built by the indexing step.

    Args:
        query: Text describing the image to look for.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``(filename, score)`` tuples ordered from best to worst
        match, where ``score`` is the cosine similarity between the query
        and the image embedding.
    """
    # Guard: with top_k == 0 the slice `[-top_k:]` would select every image
    if top_k <= 0:
        return []
    text = clip.tokenize([query]).to("cuda")
    with torch.no_grad():
        text_feat = model.encode_text(text)
        text_feat /= text_feat.norm(dim=-1, keepdim=True)
    # Both sides are unit-length, so the dot product is the cosine similarity
    scores = (text_feat.cpu().numpy() @ embeddings.T)[0]
    # argsort is ascending: reverse it, then keep the first top_k entries
    top_indices = scores.argsort()[::-1][:top_k]
    return [(filenames[i], scores[i]) for i in top_indices]
# Run an example query and print each hit with its similarity score
results = search("sunset over the ocean")
for hit in results:
    name, score = hit
    print(f"{name}: {score:.4f}")
Image-Text Similarity for RAG
import clip
import torch
from PIL import Image
model, preprocess = clip.load("ViT-B/32", device="cuda")


def get_image_embedding(image_path):
    """Encode the image at ``image_path`` into a unit-length CLIP embedding.

    Uses the module-level ``model`` and ``preprocess``. Returns a 1-D numpy
    array (the single row of the one-image batch).
    """
    pixels = preprocess(Image.open(image_path))
    batch = pixels.unsqueeze(0).to("cuda")
    with torch.no_grad():
        encoded = model.encode_image(batch)
        # Divide by the L2 norm so dot products become cosine similarities
        unit = encoded / encoded.norm(dim=-1, keepdim=True)
    return unit[0].cpu().numpy()
def get_text_embedding(text):
    """Encode a single string into a unit-length CLIP text embedding.

    Uses the module-level ``model``. Returns a 1-D numpy array (the single
    row of the one-item batch).
    """
    token_batch = clip.tokenize([text]).to("cuda")
    with torch.no_grad():
        encoded = model.encode_text(token_batch)
        # Divide by the L2 norm so dot products become cosine similarities
        unit = encoded / encoded.norm(dim=-1, keepdim=True)
    return unit[0].cpu().numpy()
# Store in vector database: these unit-length vectors are what gets indexed
img_emb = get_image_embedding("diagram.png")
txt_emb = get_text_embedding("system architecture diagram")
# Cosine similarity: both embeddings are already normalized, so a plain dot
# product is enough. The `@` operator is used because this snippet never
# imports numpy under the name `np`.
similarity = float(img_emb @ txt_emb)
print(f"Similarity: {similarity:.4f}")
Configuration
Batch Processing
import clip
import torch
from PIL import Image
model, preprocess = clip.load("ViT-B/32", device="cuda")

# Paths of the images to encode (placeholder names; replace with your own files)
image_paths = ["photo1.jpg", "photo2.jpg", "photo3.jpg"]

# Batch encode images: preprocess each file, then stack into one batch tensor
images = [preprocess(Image.open(f)) for f in image_paths]
image_batch = torch.stack(images).to("cuda")
with torch.no_grad():
    image_features = model.encode_image(image_batch)
    image_features /= image_features.norm(dim=-1, keepdim=True)

# Batch encode text: tokenize accepts a list and returns one row per string
texts = clip.tokenize(["dog", "cat", "bird", "fish"]).to("cuda")
with torch.no_grad():
    text_features = model.encode_text(texts)
    text_features /= text_features.norm(dim=-1, keepdim=True)
Model Selection Guide
| Use Case | Recommended Model | Notes |
|---|---|---|
| Fast prototyping | ViT-B/32 | Fastest, good enough |
| Production accuracy | ViT-L/14 | Best quality/speed ratio |
| Maximum accuracy | ViT-L/14@336px | Highest quality |
| Edge/mobile | RN50 | Smallest model |
Advanced Usage
Fine-Tuning CLIP
import clip
import torch
from torch.utils.data import DataLoader
model, preprocess = clip.load("ViT-B/32", device="cuda", jit=False)
# clip.load keeps half-precision (fp16) weights on CUDA. Cast to fp32 before
# training, otherwise the AdamW updates tend to overflow/underflow into NaNs.
model.float()

# Unfreeze model for fine-tuning
for param in model.parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=0.01)

# NOTE: `dataloader` is not defined in this snippet and must be supplied.
# The loop expects each batch to be a pair (images, texts): a tensor of
# already-preprocessed images and the matching captions in a form that
# clip.tokenize accepts.

# Training loop
for epoch in range(10):
    for images, texts in dataloader:
        images = images.to("cuda")
        texts = clip.tokenize(texts).to("cuda")

        image_features = model.encode_image(images)
        text_features = model.encode_text(texts)

        # Normalize (out of place, so autograd can still use the raw features)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Contrastive loss: the i-th image matches the i-th text, so the
        # target for row i (and for column i) is index i.
        logit_scale = model.logit_scale.exp()
        logits = logit_scale * image_features @ text_features.T
        labels = torch.arange(len(images), device="cuda")
        loss_i = torch.nn.functional.cross_entropy(logits, labels)
        loss_t = torch.nn.functional.cross_entropy(logits.T, labels)
        loss = (loss_i + loss_t) / 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Content Moderation
def check_content_safety(image_path, threshold=0.3):
    """Score an image against four content labels and flag whether it is safe.

    Uses the module-level ``model`` and ``preprocess``.

    Args:
        image_path: Path of the image file to check.
        threshold: The image counts as safe when the probability assigned to
            "safe content" is strictly greater than this value.

    Returns:
        A tuple ``(is_safe, results)`` where ``results`` maps each label to
        its softmax probability.
    """
    safety_labels = [
        "safe content",
        "violent content",
        "explicit content",
        "hateful content",
    ]
    pixels = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    tokens = clip.tokenize(safety_labels).to("cuda")

    with torch.no_grad():
        img_feat = model.encode_image(pixels)
        txt_feat = model.encode_text(tokens)
        # Unit-length features, then a 100x-scaled softmax over the labels
        img_unit = img_feat / img_feat.norm(dim=-1, keepdim=True)
        txt_unit = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
        probs = (100.0 * img_unit @ txt_unit.T).softmax(dim=-1)[0]

    results = dict(zip(safety_labels, probs.tolist()))
    is_safe = results["safe content"] > threshold
    return is_safe, results
Troubleshooting
| Issue | Solution |
|---|---|
| CUDA out of memory | Use smaller model (ViT-B/32), reduce batch size |
| Slow inference | Use GPU, batch inputs, use ViT-B/32 |
| Poor classification | Use more descriptive prompts ("a photo of a {class}") |
| Token length exceeded | CLIP max is 77 tokens; shorten the text or pass truncate=True to clip.tokenize |
| Model download fails | Set TORCH_HOME or download manually |
| Low similarity scores | Normalize embeddings before computing similarity |
| Import errors | Install correct package: clip vs open_clip_torch |
| Image preprocessing | Always use model’s preprocess function |
# Check available models
python -c "import clip; print(clip.available_models())"
# Check GPU memory (the device-properties attribute is total_memory, in bytes;
# total_mem does not exist and raises AttributeError)
python -c "import torch; print(f'GPU: {torch.cuda.get_device_name(0)}, Memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB')"