Installation
# Python SDK
pip install replicate
# Node.js SDK
npm install replicate
# CLI (separate tool from the Python package; installed via Homebrew)
brew tap replicate/tap
brew install replicate
# Set API token
export REPLICATE_API_TOKEN=r8_your_token_here
# Verify auth
replicate whoami
Configuration
# Environment variable (recommended)
export REPLICATE_API_TOKEN=r8_xxxxxxxxxxxx
# .env file
echo "REPLICATE_API_TOKEN=r8_xxxxxxxxxxxx" >> .env
# Python: configure client explicitly
import replicate
client = replicate.Client(api_token="r8_xxxxxxxxxxxx")
# Node.js: configure client
import Replicate from "replicate";
const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN });
# Python — global client (uses env var automatically)
import replicate
# Run a model (blocking)
output = replicate.run(
"stability-ai/stable-diffusion:ac732df...",
input={"prompt": "a photo of an astronaut riding a horse"}
)
print(output)
Core Commands / API
| Operation | Python | Description |
|---|---|---|
| replicate.run() | replicate.run("owner/model:version", input={}) | Run model, wait for result |
| replicate.predictions.create() | replicate.predictions.create(version="abc", input={}) | Create async prediction |
| replicate.predictions.get() | replicate.predictions.get("prediction_id") | Fetch prediction status |
| replicate.predictions.list() | replicate.predictions.list() | List all predictions |
| replicate.models.get() | replicate.models.get("owner/model") | Get model metadata |
| replicate.models.list() | replicate.models.list() | List available models |
| replicate.deployments.run() | replicate.deployments.run("owner/name", input={}) | Run a deployment |
| replicate.trainings.create() | replicate.trainings.create(...) | Fine-tune a model |
| prediction.cancel() | prediction.cancel() | Cancel running prediction |
| prediction.reload() | prediction.reload() | Refresh prediction state |
| CLI Command | Description |
|---|---|
| replicate run owner/model | Run a model interactively |
| replicate predict -i prompt="..." | Run with named inputs |
| replicate whoami | Show authenticated user |
| replicate models list | Browse models |
Advanced Usage
Async Predictions with Polling
import replicate
import time
# Create prediction without waiting
prediction = replicate.predictions.create(
    version="ac732df83cea7fff18b8472768c88ad041fa750ff7682a21affe81863cbe77e4",
    input={
        "prompt": "a surrealist painting of a city underwater",
        "width": 768,
        "height": 768,
        "num_outputs": 4,
    },
)
print(f"Prediction ID: {prediction.id}")
print(f"Status: {prediction.status}")

# Poll until the prediction reaches a terminal state
TERMINAL_STATUSES = {"succeeded", "failed", "canceled"}
while prediction.status not in TERMINAL_STATUSES:
    time.sleep(1)
    prediction.reload()  # refresh prediction state so .status is current
    print(f"Status: {prediction.status}")

if prediction.status == "succeeded":
    print("Output URLs:", prediction.output)
else:
    # Covers both "failed" and "canceled"
    print("Error:", prediction.error)
Streaming Output
import replicate
# Stream tokens from a language model
for event in replicate.stream(
    "meta/llama-3-70b-instruct",
    input={
        "prompt": "Explain quantum entanglement in simple terms.",
        "max_tokens": 512,
        "temperature": 0.7,
    },
):
    # Print each chunk as it arrives, without a trailing newline
    print(str(event), end="", flush=True)
print()  # newline at end
Webhooks
import replicate
# Create prediction with webhook callback
prediction = replicate.predictions.create(
    version="ac732df83cea7fff18b8472768c88ad041fa750ff7682a21affe81863cbe77e4",
    input={"prompt": "a robot made of flowers"},
    webhook="https://your-app.com/webhooks/replicate",
    webhook_events_filter=["completed"],  # or ["start", "output", "logs", "completed"]
)

# Webhook payload received at your endpoint (POST):
# {
#   "id": "xyz123",
#   "status": "succeeded",
#   "output": ["https://replicate.delivery/..."],
#   "metrics": { "predict_time": 3.14 }
# }

# Validate webhook signatures (security)
# `replicate.webhooks` is an attribute of the default client (no separate
# import is needed), and `secret()` is a method call that fetches the
# signing secret for your account.
webhook_secret = replicate.webhooks.default.secret()
# Use the secret to verify the HMAC-SHA256 signature sent in the
# `webhook-id`, `webhook-timestamp` and `webhook-signature` request headers.
# NOTE(review): the client also ships a helper for this; confirm the exact
# signature against the client docs before relying on it:
#   replicate.webhooks.validate(headers=request_headers, body=raw_body, secret=webhook_secret)
Fine-tuning (Training)
import replicate
import time

# Fine-tune SDXL on custom images
training = replicate.trainings.create(
    version="stability-ai/sdxl:39ed52f2319f9a8d...",
    input={
        "input_images": "https://example.com/training-images.zip",
        "trigger_word": "TOK",
        "steps": 1000,
        "lora_lr": 1e-4,
    },
    destination="your-username/my-custom-model",
)
print(f"Training ID: {training.id}")

# Poll with reload() until the training reaches a terminal state.
# NOTE(review): training objects expose reload()/cancel(); a blocking wait()
# does not appear to be available on them — confirm against the client docs.
while training.status not in {"succeeded", "failed", "canceled"}:
    time.sleep(5)
    training.reload()

if training.status == "succeeded":
    print("Training complete:", training.output)
else:
    print(f"Training {training.status}:", training.error)
Deployments (Persistent Endpoints)
import replicate
# Run via a named deployment (always-on, no cold start)
# Predictions on a deployment are created through the deployment object.
deployment = replicate.deployments.get("your-username/my-deployment")
prediction = deployment.predictions.create(
    input={"prompt": "fast inference query"},
)
prediction.wait()  # block until the prediction finishes
output = prediction.output

# Create deployment programmatically
deployment = replicate.deployments.create(
    name="my-fast-endpoint",
    model="your-username/my-model",
    version="abc123...",
    hardware="gpu-a40-large",
    min_instances=1,
    max_instances=5,
)
Node.js SDK
import Replicate from "replicate";

const replicate = new Replicate();

// Basic run: resolves once the model has produced its output
const imageInput = { prompt: "a neon city at night" };
const output = await replicate.run(
  "stability-ai/stable-diffusion:ac732df83cea7fff...",
  { input: imageInput },
);
console.log(output);

// Streaming: write each event to stdout as it arrives
const haikuOptions = {
  input: { prompt: "Write a haiku about coding" },
};
const stream = await replicate.stream("meta/llama-3-70b-instruct", haikuOptions);
for await (const event of stream) {
  process.stdout.write(event.toString());
}

// Async with webhook: create the prediction and let Replicate call back
const webhookRequest = {
  version: "ac732df83cea7fff...",
  input: { prompt: "an oil painting of a sunset" },
  webhook: "https://your-app.com/webhook",
  webhook_events_filter: ["completed"],
};
const prediction = await replicate.predictions.create(webhookRequest);
import replicate
# File inputs (Python)
# Pass a local file as input
with open("image.jpg", "rb") as f:
    # Run inside the `with` block so the file handle is still open
    output = replicate.run(
        "andreasjansson/blip-2:4b32258c...",
        input={"image": f, "question": "What is in this image?"},
    )

# Pass a URL
output = replicate.run(
    "andreasjansson/blip-2:4b32258c...",
    input={
        "image": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png",
        "question": "What colors are in this image?",
    },
)
Common Workflows
Image Generation Pipeline
import replicate
import httpx
from pathlib import Path
def generate_and_save(prompt: str, output_path: str = "output.png"):
    """Generate one SDXL image for ``prompt`` and write it to ``output_path``.

    Args:
        prompt: Text prompt passed to the model.
        output_path: Destination file for the downloaded image.

    Raises:
        RuntimeError: If the model returns no outputs.
        Exception: Whatever ``response.raise_for_status()`` raises when the
            download does not succeed.
    """
    output = replicate.run(
        "stability-ai/sdxl:39ed52f2319f9a8d...",
        input={
            "prompt": prompt,
            "negative_prompt": "blurry, low quality, ugly",
            "width": 1024,
            "height": 1024,
            "num_inference_steps": 25,
            "guidance_scale": 7.5,
        },
    )
    if not output:
        raise RuntimeError("Model returned no output")

    # Download output image.
    # str() keeps plain URL strings unchanged; presumably newer client
    # versions return file-like objects whose string form is the URL — verify.
    image_url = str(output[0])
    response = httpx.get(image_url)
    response.raise_for_status()  # never write an HTTP error body to disk
    Path(output_path).write_bytes(response.content)
    print(f"Saved to {output_path}")
generate_and_save("a cyberpunk samurai at dusk", "samurai.png")
Batch Processing
import replicate
import asyncio
prompts = [
"a red apple on a wooden table",
"a blue ocean under stormy skies",
"a green forest in autumn",
]
async def run_batch(prompts):
    """Run one prediction per prompt concurrently.

    Args:
        prompts: Iterable of prompt strings.

    Returns:
        A list of model outputs, in the same order as ``prompts``.
    """
    tasks = [
        replicate.async_run(
            "stability-ai/stable-diffusion:ac732df...",
            input={"prompt": p},
        )
        for p in prompts
    ]
    # gather() awaits all calls together and preserves input order
    results = await asyncio.gather(*tasks)
    return results
outputs = asyncio.run(run_batch(prompts))
# Pair each prompt with its result; outputs are in prompt order
for prompt, output in zip(prompts, outputs):
    print(f"{prompt}: {output}")
LLM Chat with History
import replicate
history = []


def chat(user_message: str) -> str:
    """Send ``user_message`` to the model along with all prior turns.

    The whole conversation in ``history`` is flattened into a single prompt
    of ``ROLE: content`` lines, and the streamed reply is appended to
    ``history`` as the assistant turn.

    Returns:
        The full assistant reply as one string.

    Raises:
        Re-raises any error from streaming; in that case the user turn is
        removed again so ``history`` never holds an unanswered message.
    """
    history.append({"role": "user", "content": user_message})
    prompt = "\n".join(
        f"{m['role'].upper()}: {m['content']}" for m in history
    )
    try:
        response = "".join(
            str(token)
            for token in replicate.stream(
                "meta/llama-3-70b-instruct",
                input={"prompt": prompt, "max_tokens": 256},
            )
        )
    except Exception:
        history.pop()  # roll back the user turn added above
        raise
    history.append({"role": "assistant", "content": response})
    return response
print(chat("What is machine learning?"))
print(chat("Can you give me a simple example?"))
Cog — Package Your Own Model
# Install cog
sudo curl -o /usr/local/bin/cog -L \
https://github.com/replicate/cog/releases/latest/download/cog_$(uname -s)_$(uname -m)
sudo chmod +x /usr/local/bin/cog
# Initialize project
cog init
# Build image locally
cog build
# Run predictions locally
cog predict -i prompt="test"
# Push to Replicate
cog push r8.im/your-username/your-model
# cog.yaml — model definition
# Build-environment settings are nested under `build`; `predict` is top-level.
build:
  gpu: true
  python_version: "3.11"
  python_packages:
    - torch==2.1.0
    - diffusers==0.24.0
    - transformers==4.36.0
# "<file>:<class>" of the predictor class
predict: "predict.py:Predictor"
# predict.py — model interface
from cog import BasePredictor, Input, Path
import torch
class Predictor(BasePredictor):
    """Cog predictor that generates an output from a text prompt and saves it as a PNG file."""

    def setup(self) -> None:
        """Load model into memory once at startup."""
        # load_my_model is not defined in this snippet; replace it with your
        # own model-loading code.
        self.model = load_my_model()

    def predict(
        self,
        prompt: str = Input(description="Text prompt"),
        steps: int = Input(description="Inference steps", default=25, ge=1, le=100),
    ) -> Path:
        """Run a single prediction.

        Args:
            prompt: Text prompt forwarded to the model.
            steps: Number of inference steps (1-100, default 25).

        Returns:
            Path of the file the output was saved to.
        """
        output = self.model.generate(prompt, steps=steps)
        output_path = Path("/tmp/output.png")
        output.save(output_path)
        return output_path
Tips and Best Practices
Cost Optimization
# Use version hashes — avoid resolving "latest" on every call (saves time + ensures reproducibility)
# BAD:
replicate.run("stability-ai/stable-diffusion", input={...})
# GOOD:
replicate.run("stability-ai/stable-diffusion:ac732df83cea7fff18b8472768c88ad041fa750ff7682a21affe81863cbe77e4", input={...})
# Use webhooks instead of polling for long-running models
# Polling wastes compute on your server; webhooks push results to you
# Batch similar requests to maximize throughput
# Use deployments (min_instances > 0) for latency-sensitive workloads
Error Handling
import replicate
from replicate.exceptions import ReplicateError, ModelError
# Handlers run from most specific to most general
try:
    output = replicate.run(
        "owner/model:version",
        input={"prompt": "test"},
    )
except ModelError as e:
    print(f"Model error: {e}")  # model-specific failure
except ReplicateError as e:
    print(f"API error: {e}")  # auth, rate limit, network
except Exception as e:
    print(f"Unexpected error: {e}")
| Tip | Detail |
|---|---|
| Use version hashes | Avoid model resolution overhead on every call |
| Prefer webhooks | Eliminates polling overhead for async workloads |
| Use deployments | min_instances=1 eliminates cold starts |
| Cache outputs | Store URLs or download files to avoid re-running |
| Stream LLMs | Reduces perceived latency for text generation |
| Use async_run | Python asyncio support for concurrent predictions |
| Choose right hardware | Match GPU tier to model size (A40 vs T4) |
Environment Best Practices
# Never hardcode tokens
# Use environment variables or secrets managers
# For production apps:
export REPLICATE_API_TOKEN=$(vault kv get -field=token secret/replicate)
# Rate limits: ~600 predictions/minute on standard tier
# Use exponential backoff on 429 responses
# Monitor spend via dashboard or API:
# https://replicate.com/account/billing
Useful Model Categories
| Category | Popular Models |
|---|---|
| Image generation | stability-ai/sdxl, black-forest-labs/flux-schnell |
| Language models | meta/llama-3-70b-instruct, mistralai/mistral-7b-instruct-v0.2 |
| Image editing | stability-ai/stable-diffusion-inpainting |
| Video generation | anotherjesse/zeroscope-v2-xl |
| Speech / TTS | adirik/styletts2 |
| Image captioning | andreasjansson/blip-2 |
| Upscaling | nightmareai/real-esrgan |
| Background removal | cjwbw/rembg |