CLIP Cheat Sheet

Overview

CLIP (Contrastive Language-Image Pre-training) is a neural network model developed by OpenAI that learns visual concepts from natural language supervision. Trained on 400 million image-text pairs from the internet, CLIP can perform zero-shot image classification, compute similarity between images and text, and serve as a powerful feature extractor for downstream tasks without task-specific training data.

CLIP encodes images and text into a shared embedding space where semantically related content clusters together. This enables applications like image search using natural language queries, content moderation, image captioning, and multi-modal RAG systems. CLIP’s zero-shot capabilities often match or exceed supervised models trained on specific datasets.

Installation

# OpenAI CLIP (installed straight from the GitHub repository)
pip install git+https://github.com/openai/CLIP.git
pip install torch torchvision

# OpenCLIP (community, more models)
# The pip package is `open_clip_torch`; the module you import is `open_clip`.
pip install open_clip_torch

# HuggingFace Transformers (easiest)
pip install transformers torch torchvision pillow

Core Usage

OpenAI CLIP

import clip
import torch
from PIL import Image

# Load model (falls back to CPU when no GPU is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Candidate labels, defined once so encoding and printing cannot drift apart
labels = ["a dog", "a cat", "a car"]

# Prepare inputs
image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(labels).to(device)

# Inference only: no_grad avoids building the autograd graph
with torch.no_grad():
    # Encode image
    image_features = model.encode_image(image)

    # Encode text
    text_features = model.encode_text(text)

    # Compute similarity.
    # L2-normalize so the dot product is a cosine similarity, then scale by
    # 100 before the softmax. Raw cosine values lie in [-1, 1]; without the
    # scale the softmax output is almost uniform and useless as a probability.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probabilities:")
for label, prob in zip(labels, similarity[0]):
    print(f"  {label}: {prob.item():.4f}")

HuggingFace Transformers

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the pretrained model and its matching processor (tokenizer + image transforms)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")
inputs = processor(
    text=["a photo of a dog", "a photo of a cat", "a photo of a car"],
    images=image,
    return_tensors="pt",
    padding=True,  # pad the prompts to a common length so they form one batch
)

# Inference only: skip autograd bookkeeping to save memory and time
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over dim=1 normalizes across the text prompts for each image
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
print(f"Probabilities: {probs}")

OpenCLIP

import open_clip
import torch
from PIL import Image

# The second return value (the training-time transform) is not needed for inference
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
# Put the model in eval mode before inference; the OpenCLIP README notes the
# model is returned in train mode, which affects architectures that use
# BatchNorm or stochastic depth.
model.eval()
tokenizer = open_clip.get_tokenizer("ViT-B-32")

image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
text = tokenizer(["a dog", "a cat", "a bird"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # L2-normalize so the dot product below is a cosine similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Raw cosine similarities, one per text prompt (not probabilities)
    similarity = (image_features @ text_features.T)
    print(f"Similarity: {similarity}")

Available Models

OpenAI CLIP Models

Model	Params	Image Size	Embedding Dim
RN50	102M	224x224	1024
RN101	120M	224x224	512
ViT-B/32	151M	224x224	512
ViT-B/16	150M	224x224	512
ViT-L/14	428M	224x224	768
ViT-L/14@336px	428M	336x336	768

OpenCLIP Models

Model	Training Data	Performance
ViT-B-32 (LAION-2B)	2B image-text pairs	Good baseline
ViT-L-14 (LAION-2B)	2B image-text pairs	Better accuracy
ViT-H-14 (LAION-2B)	2B image-text pairs	Higher accuracy, larger model
ViT-G-14 (LAION-2B)	2B image-text pairs	Highest accuracy

Common Applications

Zero-Shot Image Classification

import clip
import torch
from PIL import Image

model, preprocess = clip.load("ViT-L/14", device="cuda")

image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to("cuda")

# Define categories
categories = [
    "a photo of a landscape",
    "a photo of a person",
    "a photo of food",
    "a photo of an animal",
    "a screenshot of a website",
    "a photo of a building",
]

text = clip.tokenize(categories).to("cuda")

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # L2-normalize first: the 100x scale assumes cosine similarities in
    # [-1, 1]. On raw, un-normalized features the scaled logits are far too
    # large and the softmax collapses onto a single class.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# One probability per category for the single input image
for cat, score in zip(categories, similarity[0]):
    print(f"{cat}: {score.item():.2%}")

Image Search Engine

import clip
import torch
from PIL import Image
import numpy as np
import os

model, preprocess = clip.load("ViT-B/32", device="cuda")

# Index images
image_dir = "./photos/"
embeddings = []
filenames = []

# sorted() makes the index order reproducible across runs and platforms
for fname in sorted(os.listdir(image_dir)):
    # Case-insensitive extension check so files like "IMG_001.JPG" are indexed
    if not fname.lower().endswith((".jpg", ".png")):
        continue

    # os.path.join avoids the doubled separator produced by "./photos/" + "/";
    # the context manager closes the file handle once the tensor is built.
    with Image.open(os.path.join(image_dir, fname)) as pil_image:
        img = preprocess(pil_image).unsqueeze(0).to("cuda")

    with torch.no_grad():
        feat = model.encode_image(img)
        # Unit-length embeddings let search use a plain dot product
        feat /= feat.norm(dim=-1, keepdim=True)

    embeddings.append(feat.cpu().numpy())
    filenames.append(fname)

# Fail with a clear message instead of np.vstack's "need at least one array"
if not embeddings:
    raise FileNotFoundError(f"No .jpg/.png images found in {image_dir}")

# One row per indexed image, aligned with `filenames`
embeddings = np.vstack(embeddings)

# Search with text query
def search(query, top_k=5):
    """Return the `top_k` indexed images that best match a text query.

    Args:
        query: Natural-language description of the desired image.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of `(filename, score)` tuples ordered from best to worst match,
        where `score` is the dot product between the normalized text embedding
        and the stored image embedding.
    """
    # Guard explicitly: with top_k == 0 the slice [-0:] would select *every*
    # element, and negative values would select an arbitrary tail.
    if top_k <= 0:
        return []

    text = clip.tokenize([query]).to("cuda")
    with torch.no_grad():
        text_feat = model.encode_text(text)
        text_feat /= text_feat.norm(dim=-1, keepdim=True)

    scores = (text_feat.cpu().numpy() @ embeddings.T)[0]
    # Descending order first, then truncate to the requested count
    top_indices = scores.argsort()[::-1][:top_k]
    return [(filenames[i], scores[i]) for i in top_indices]

results = search("sunset over the ocean")
for name, score in results:
    print(f"{name}: {score:.4f}")

Image-Text Similarity for RAG

import clip
import numpy as np  # required by np.dot below
import torch
from PIL import Image

model, preprocess = clip.load("ViT-B/32", device="cuda")

def get_image_embedding(image_path):
    """Return the L2-normalized CLIP embedding of an image file as a 1-D NumPy array."""
    image = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    with torch.no_grad():
        features = model.encode_image(image)
        features /= features.norm(dim=-1, keepdim=True)
    # Drop the batch dimension: a single image was encoded
    return features.cpu().numpy()[0]

def get_text_embedding(text):
    """Return the L2-normalized CLIP embedding of a text string as a 1-D NumPy array."""
    tokens = clip.tokenize([text]).to("cuda")
    with torch.no_grad():
        features = model.encode_text(tokens)
        features /= features.norm(dim=-1, keepdim=True)
    # Drop the batch dimension: a single string was encoded
    return features.cpu().numpy()[0]

# Store in vector database
img_emb = get_image_embedding("diagram.png")
txt_emb = get_text_embedding("system architecture diagram")

# Cosine similarity (both vectors are already unit length, so a dot product suffices)
similarity = np.dot(img_emb, txt_emb)
print(f"Similarity: {similarity:.4f}")

Configuration

Batch Processing

import clip
import torch
from PIL import Image

model, preprocess = clip.load("ViT-B/32", device="cuda")

# Example inputs; replace with your own list of image files
image_paths = ["photo1.jpg", "photo2.jpg"]

# Batch encode images
images = []
for path in image_paths:
    # Close each file as soon as its tensor has been produced
    with Image.open(path) as pil_image:
        images.append(preprocess(pil_image))
# Stack the per-image tensors into one batch along a new leading dimension
image_batch = torch.stack(images).to("cuda")

with torch.no_grad():
    image_features = model.encode_image(image_batch)
    image_features /= image_features.norm(dim=-1, keepdim=True)

# Batch encode text
texts = clip.tokenize(["dog", "cat", "bird", "fish"]).to("cuda")
with torch.no_grad():
    text_features = model.encode_text(texts)
    text_features /= text_features.norm(dim=-1, keepdim=True)

Model Selection Guide

Use Case	Recommended Model	Notes
Fast prototyping	ViT-B/32	Fastest, good enough
Production accuracy	ViT-L/14	Best quality/speed ratio
Maximum accuracy	ViT-L/14@336px	Highest quality
Edge/mobile	RN50	Smallest model

Advanced Usage

Fine-Tuning CLIP

import clip
import torch
from torch.utils.data import DataLoader

model, preprocess = clip.load("ViT-B/32", device="cuda", jit=False)

# clip.load converts the weights to fp16 on CUDA. Optimizing fp16 parameters
# directly with AdamW is numerically unstable (NaN losses are common), so cast
# the model to fp32 before training.
model.float()
# clip.load returns the model in eval mode; switch to train mode for fine-tuning
model.train()

# Unfreeze model for fine-tuning
for param in model.parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=0.01)

# NOTE: `dataloader` must be defined by you. Each batch is expected to be a
# pair (images, texts): a tensor of preprocessed images and a list of caption
# strings, where images[i] and texts[i] describe the same sample.

# Training loop
for epoch in range(10):
    for images, texts in dataloader:
        images = images.to("cuda")
        texts = clip.tokenize(texts).to("cuda")

        image_features = model.encode_image(images)
        text_features = model.encode_text(texts)

        # Normalize (out-of-place, so autograd can still use the originals)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Contrastive loss: the i-th image should match the i-th text, so the
        # target for row i of the similarity matrix is column i.
        logit_scale = model.logit_scale.exp()
        logits = logit_scale * image_features @ text_features.T
        labels = torch.arange(len(images), device="cuda")

        # Symmetric cross-entropy over image->text and text->image directions
        loss_i = torch.nn.functional.cross_entropy(logits, labels)
        loss_t = torch.nn.functional.cross_entropy(logits.T, labels)
        loss = (loss_i + loss_t) / 2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Content Moderation

def check_content_safety(image_path, threshold=0.3):
    """Score an image against a fixed set of safety labels with CLIP.

    Args:
        image_path: Path of the image file to check.
        threshold: Probability the "safe content" label must exceed for the
            image to be reported as safe.

    Returns:
        A tuple `(is_safe, results)` where `results` maps each safety label to
        its softmax probability.
    """
    pixels = preprocess(Image.open(image_path)).unsqueeze(0).to("cuda")
    safety_labels = [
        "safe content",
        "violent content",
        "explicit content",
        "hateful content",
    ]
    tokens = clip.tokenize(safety_labels).to("cuda")

    with torch.no_grad():
        image_vec = model.encode_image(pixels)
        label_vecs = model.encode_text(tokens)
        # Unit-normalize both sides so the product below is cosine similarity
        image_vec /= image_vec.norm(dim=-1, keepdim=True)
        label_vecs /= label_vecs.norm(dim=-1, keepdim=True)
        scaled_logits = 100.0 * image_vec @ label_vecs.T
        probs = scaled_logits.softmax(dim=-1)[0]

    results = {}
    for label, prob in zip(safety_labels, probs):
        results[label] = prob.item()

    # NOTE(review): only the "safe content" probability is compared with the
    # threshold, so an image can be reported safe while an unsafe label scores
    # higher - confirm this is the intended policy.
    is_safe = results["safe content"] > threshold
    return is_safe, results

Troubleshooting

Issue	Solution
CUDA out of memory	Use smaller model (ViT-B/32), reduce batch size
Slow inference	Use GPU, batch inputs, use ViT-B/32
Poor classification	Use more descriptive prompts (“a photo of a {class}”)
Token length exceeded	CLIP's context length is 77 tokens; shorten the text or pass `truncate=True` to `clip.tokenize`
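Model download fails	Pass `download_root` to `clip.load` (OpenAI CLIP caches in `~/.cache/clip` by default), or download the checkpoint manually and pass its file path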
Low similarity scores	Normalize embeddings before computing similarity
Import errors	Install correct package: `clip` vs `open_clip_torch`
Image preprocessing	Always use model’s preprocess function

# Check available models
python -c "import clip; print(clip.available_models())"

# Check GPU memory (the device-properties attribute is `total_memory`, in bytes)
python -c "import torch; print(f'GPU: {torch.cuda.get_device_name(0)}, Memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB')"