Aller au contenu

Commandes Whisper

Whisper est le modèle de reconnaissance vocale polyvalent d’OpenAI, entraîné sur 680 000 heures de données multilingues. Il prend en charge la transcription, la traduction, la détection de langue et l’horodatage au niveau du mot.

Installation

# Install OpenAI Whisper
pip install openai-whisper

# Install with PyTorch CUDA support
pip install openai-whisper torch torchvision torchaudio

# Install faster-whisper (CTranslate2 backend, 4x faster)
pip install faster-whisper

# Install WhisperX (with alignment and diarization)
pip install whisperx

# Install ffmpeg (required dependency)
# macOS
brew install ffmpeg
# Ubuntu
sudo apt install ffmpeg

# Verify
whisper --help

Tailles de modèles

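| Model | Parameters | English-only | VRAM Required | Relative Speed |
| --- | --- | --- | --- | --- |
| tiny | 39M | tiny.en | ~1 GB | ~10x |
| base | 74M | base.en | ~1 GB | ~7x |
| small | 244M | small.en | ~2 GB | ~4x |
| medium | 769M | medium.en | ~5 GB | ~2x |
| large-v3 | 1550M | No | ~10 GB | 1x |
| turbo | 809M | No | ~6 GB | ~8x |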

Utilisation CLI

# Basic transcription
whisper audio.mp3

# Specify model size
whisper audio.mp3 --model large-v3

# Specify language
whisper audio.mp3 --model large-v3 --language en

# Translate to English
whisper audio.mp3 --model large-v3 --task translate

# Specify output formats
whisper audio.mp3 --model large-v3 \
  --output_format all  # txt, vtt, srt, tsv, json

# Output to specific directory
whisper audio.mp3 --model large-v3 \
  --output_dir ./transcripts

# Multiple files
whisper audio1.mp3 audio2.wav audio3.m4a --model medium

# Word-level timestamps
whisper audio.mp3 --model large-v3 \
  --word_timestamps True

# Use specific device
whisper audio.mp3 --model large-v3 --device cuda

# CPU with specific threads
whisper audio.mp3 --model medium --device cpu --threads 8

API Python

import whisper

# Load model
model = whisper.load_model("large-v3")  # tiny, base, small, medium, large-v3, turbo

# Basic transcription
result = model.transcribe("audio.mp3")
print(result["text"])

# With options
result = model.transcribe(
    "audio.mp3",
    language="en",
    task="transcribe",       # or "translate" for English translation
    fp16=True,               # Use FP16 (GPU only)
    beam_size=5,
    best_of=5,
    temperature=0.0,
    word_timestamps=True,
    verbose=True,
)

# Access segments with timestamps
for segment in result["segments"]:
    print(f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}")

# Access word-level timestamps
for segment in result["segments"]:
    for word in segment.get("words", []):
        print(f"  [{word['start']:.2f} - {word['end']:.2f}] {word['word']}")

Détection de langue

import whisper

model = whisper.load_model("large-v3")

# Load and pad audio
audio = whisper.load_audio("audio.mp3")
audio = whisper.pad_or_trim(audio)

# Compute log-Mel spectrogram
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# Detect language
_, probs = model.detect_language(mel)
detected_lang = max(probs, key=probs.get)
print(f"Detected language: {detected_lang} ({probs[detected_lang]:.2%})")

Faster-Whisper

from faster_whisper import WhisperModel

# Load model (uses CTranslate2 - significantly faster)
model = WhisperModel(
    "large-v3",
    device="cuda",
    compute_type="float16",  # float16, int8_float16, int8
)

# Transcribe
segments, info = model.transcribe(
    "audio.mp3",
    language="en",
    beam_size=5,
    word_timestamps=True,
    vad_filter=True,  # Filter out silence
)

print(f"Detected language: {info.language} (prob: {info.language_probability:.2f})")

for segment in segments:
    print(f"[{segment.start:.2f} - {segment.end:.2f}] {segment.text}")

    if segment.words:
        for word in segment.words:
            print(f"  [{word.start:.2f} - {word.end:.2f}] {word.word}")
# Faster-Whisper CLI (shell command: run it in a terminal, not inside the Python script above)
faster-whisper audio.mp3 --model large-v3 --language en

WhisperX (alignement et diarisation)

import whisperx

# Load model
model = whisperx.load_model("large-v3", device="cuda", compute_type="float16")

# Transcribe
audio = whisperx.load_audio("audio.mp3")
result = model.transcribe(audio, batch_size=16)

# Align timestamps at word level
model_a, metadata = whisperx.load_align_model(language_code="en", device="cuda")
result = whisperx.align(
    result["segments"], model_a, metadata, audio, device="cuda"
)

# Speaker diarization (requires HuggingFace token)
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token="YOUR_HF_TOKEN", device="cuda"
)
diarize_segments = diarize_model(audio)

# Assign speakers to words
result = whisperx.assign_word_speakers(diarize_segments, result)

for segment in result["segments"]:
    speaker = segment.get("speaker", "UNKNOWN")
    print(f"[{speaker}] {segment['text']}")

Formats de sortie

# Generate all output formats
whisper audio.mp3 --model large-v3 --output_format all

Les fichiers générés comprennent :

| Format | Extension | Description |
| --- | --- | --- |
| Text | .txt | Plain text transcript |
| SRT | .srt | SubRip subtitle format |
| VTT | .vtt | WebVTT subtitle format |
| TSV | .tsv | Tab-separated with timestamps |
| JSON | .json | Full results with metadata |

Écriture de sortie programmatique

import whisper
from whisper.utils import get_writer

model = whisper.load_model("large-v3")
result = model.transcribe("audio.mp3")

# Write SRT file
writer = get_writer("srt", "./output")
writer(result, "audio.mp3")

# Write VTT file
writer = get_writer("vtt", "./output")
writer(result, "audio.mp3")

# Write JSON
writer = get_writer("json", "./output")
writer(result, "audio.mp3")

Traitement par lots

import os
import whisper

model = whisper.load_model("large-v3")

audio_dir = "./audio_files"
for filename in os.listdir(audio_dir):
    if filename.endswith((".mp3", ".wav", ".m4a", ".flac")):
        filepath = os.path.join(audio_dir, filename)
        result = model.transcribe(filepath)
        print(f"\n--- {filename} ---")
        print(result["text"])

Options courantes

| Option | Description | Par défaut |
| --- | --- | --- |
| `--model` | Model size | turbo |
| `--language` | Audio language code | Auto-detect |
| `--task` | transcribe or translate | transcribe |
| `--output_format` | txt, vtt, srt, tsv, json, all | all |
| `--output_dir` | Output directory | `.` |
| `--device` | cuda or cpu | Auto |
| `--word_timestamps` | Enable word-level timestamps | False |
| `--beam_size` | Beam search width | 5 |
| `--temperature` | Sampling temperature | 0 |
| `--fp16` | Use FP16 inference | True on GPU |