Aller au contenu

Commandes llama.cpp

llama.cpp est un moteur d’inférence de LLM haute performance, écrit en C/C++, qui exécute efficacement des modèles quantifiés sur du matériel grand public. Il prend en charge le format GGUF, la quantification de 2 à 8 bits, l’inférence sur CPU et sur GPU, et fournit un serveur d’API compatible OpenAI.

Installation

# Clone and build with CUDA support
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j$(nproc)

# Build with Metal (macOS Apple Silicon)
cmake -B build -DGGML_METAL=ON
cmake --build build --config Release -j$(sysctl -n hw.ncpu)

# Build with Vulkan (cross-platform GPU)
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release -j$(nproc)

# CPU-only build
cmake -B build
cmake --build build --config Release -j$(nproc)

# Install Python bindings
pip install llama-cpp-python

# Install with CUDA support for Python
CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python

Conversion de modèles

# Convert HuggingFace model to GGUF format
python convert_hf_to_gguf.py \
  /path/to/hf-model \
  --outfile model-f16.gguf \
  --outtype f16

# Convert with BF16
python convert_hf_to_gguf.py \
  /path/to/hf-model \
  --outfile model-bf16.gguf \
  --outtype bf16

Quantification

# Quantize to different precision levels
./build/bin/llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M
./build/bin/llama-quantize model-f16.gguf model-q5_k_m.gguf Q5_K_M
./build/bin/llama-quantize model-f16.gguf model-q8_0.gguf Q8_0
./build/bin/llama-quantize model-f16.gguf model-q3_k_m.gguf Q3_K_M
./build/bin/llama-quantize model-f16.gguf model-q2_k.gguf Q2_K

# Importance matrix quantization (better quality)
./build/bin/llama-imatrix \
  -m model-f16.gguf \
  -f calibration_data.txt \
  -o imatrix.dat

./build/bin/llama-quantize \
  --imatrix imatrix.dat \
  model-f16.gguf model-q4_k_m.gguf Q4_K_M

Niveaux de quantification

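| Type   | Bits | Size (7B) | Quality   | Speed   |
|--------|------|-----------|-----------|---------|
| Q2_K   | 2    | ~2.7 GB   | Low       | Fastest |
| Q3_K_M | 3    | ~3.3 GB   | Fair      | Fast    |
| Q4_K_M | 4    | ~4.1 GB   | Good      | Fast    |
| Q5_K_M | 5    | ~4.8 GB   | Very Good | Medium  |
| Q6_K   | 6    | ~5.5 GB   | Excellent | Medium  |
| Q8_0   | 8    | ~7.2 GB   | Near FP16 | Slower  |
| F16    | 16   | ~13.5 GB  | Baseline  | Slowest |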

Génération de texte (CLI)

# Basic generation
./build/bin/llama-cli \
  -m model-q4_k_m.gguf \
  -p "Explain machine learning:" \
  -n 256

# Interactive chat mode
./build/bin/llama-cli \
  -m model-q4_k_m.gguf \
  --interactive \
  --color \
  -n -1

# With sampling parameters
./build/bin/llama-cli \
  -m model-q4_k_m.gguf \
  -p "Write a poem about AI:" \
  -n 200 \
  --temp 0.7 \
  --top-p 0.9 \
  --top-k 40 \
  --repeat-penalty 1.1

# GPU offloading (put 35 layers on GPU)
./build/bin/llama-cli \
  -m model-q4_k_m.gguf \
  -p "Hello" \
  -ngl 35

# Full GPU offload
./build/bin/llama-cli \
  -m model-q4_k_m.gguf \
  -p "Hello" \
  -ngl 999

Mode serveur

# Start OpenAI-compatible API server
./build/bin/llama-server \
  -m model-q4_k_m.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -ngl 999

# Server with context size and parallel requests
./build/bin/llama-server \
  -m model-q4_k_m.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -c 4096 \
  -np 4 \
  -ngl 999

# Server with chat template
./build/bin/llama-server \
  -m model-q4_k_m.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  --chat-template llama3 \
  -ngl 999

Utilisation de l’API

# Chat completions (OpenAI-compatible)
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "local-model",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Explain transformers."}
    ],
    "temperature": 0.7,
    "max_tokens": 256
  }'

# Text completions
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "The meaning of life is",
    "temperature": 0.8,
    "n_predict": 128
  }'

# Embeddings
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"input": "Hello world"}'

# Health check
curl http://localhost:8080/health

Bindings Python

from llama_cpp import Llama

# Load model
llm = Llama(
    model_path="model-q4_k_m.gguf",
    n_ctx=4096,       # Context window
    n_gpu_layers=-1,  # Offload all layers to GPU
    n_threads=8,      # CPU threads
    verbose=False,
)

# Text completion
output = llm(
    "Explain quantum computing:",
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    echo=False,
)
print(output["choices"][0]["text"])

# Chat completion
output = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": "What is deep learning?"},
    ],
    max_tokens=256,
    temperature=0.7,
)
print(output["choices"][0]["message"]["content"])

Benchmarking

# Benchmark model performance
./build/bin/llama-bench \
  -m model-q4_k_m.gguf \
  -ngl 999 \
  -p 512 \
  -n 128

# Perplexity evaluation
./build/bin/llama-perplexity \
  -m model-q4_k_m.gguf \
  -f wikitext-2-raw/wiki.test.raw \
  -ngl 999

Options CLI courantes

| Option           | Description                           |
|------------------|---------------------------------------|
| -m               | Model file path                       |
| -p               | Prompt text                           |
| -n               | Max tokens to generate                |
| -c               | Context size (default 2048)           |
| -ngl             | Number of layers offloaded to GPU     |
| -t               | Number of CPU threads                 |
| --temp           | Temperature (0.0 = greedy)            |
| --top-p          | Top-p sampling                        |
| --top-k          | Top-k sampling                        |
| --repeat-penalty | Repetition penalty                    |
| -b               | Batch size for prompt processing      |
| -np              | Number of parallel sequences (server) |
| --interactive    | Interactive chat mode                 |
| --color          | Colorize output                       |