-
Notifications
You must be signed in to change notification settings - Fork 0
Python API
Behnam Ebrahimi edited this page Mar 29, 2026
·
1 revision
Vayu provides two Python interfaces: the simple LightningWhisperMLX class and the full transcribe() function.
The recommended interface for most users. Provides a simple wrapper around the full transcription pipeline.
class LightningWhisperMLX:
def __init__(
self,
model: str = "distil-large-v3",
batch_size: int = 12,
quant: Optional[str] = None,
)

| Parameter | Type | Default | Description |
|---|---|---|---|
model |
str |
"distil-large-v3" |
Model name (e.g., "tiny", "turbo") or HuggingFace repo path |
batch_size |
int |
12 |
Number of audio segments to process in parallel. Higher = faster but more memory |
quant |
Optional[str] |
None |
Quantization level: "4bit", "8bit", or None for full precision |
def transcribe(
self,
audio: Union[str, np.ndarray, mx.array],
language: Optional[str] = None,
task: str = "transcribe",
verbose: Optional[bool] = None,
word_timestamps: bool = False,
**kwargs,
) -> dict

| Parameter | Type | Default | Description |
|---|---|---|---|
audio |
str, np.ndarray, mx.array
|
— | Audio file path or waveform array |
language |
Optional[str] |
None |
Language code (e.g., "en", "fa"). Auto-detected if None |
task |
str |
"transcribe" |
"transcribe" or "translate" (translate to English) |
verbose |
Optional[bool] |
None |
Print progress. None = default, True = detailed, False = silent |
word_timestamps |
bool |
False |
Extract word-level timestamps |
**kwargs |
— | — | Additional parameters passed to the core transcribe() function |
from whisper_mlx import LightningWhisperMLX
# Standard usage
whisper = LightningWhisperMLX(model="distil-large-v3", batch_size=12)
result = whisper.transcribe("audio.mp3", language="en")
# With quantization for lower memory
whisper = LightningWhisperMLX(model="large-v3", quant="4bit", batch_size=8)
result = whisper.transcribe("lecture.wav", word_timestamps=True)
# Translation
result = whisper.transcribe("french_audio.mp3", task="translate")

The full transcription function with all available options.
def transcribe(
audio: Union[str, np.ndarray, mx.array],
*,
path_or_hf_repo: str = "mlx-community/whisper-turbo",
batch_size: int = 1,
verbose: Optional[bool] = None,
temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
compression_ratio_threshold: Optional[float] = 2.4,
logprob_threshold: Optional[float] = -1.0,
no_speech_threshold: Optional[float] = 0.6,
condition_on_previous_text: bool = True,
initial_prompt: Optional[str] = None,
word_timestamps: bool = False,
**decode_options,
) -> dict

| Parameter | Type | Default | Description |
|---|---|---|---|
audio |
str, np.ndarray, mx.array
|
— | Audio file path or waveform |
path_or_hf_repo |
str |
"mlx-community/whisper-turbo" |
Model path or HuggingFace repo |
batch_size |
int |
1 |
Segments processed per forward pass (set >1 for batched decoding) |
verbose |
Optional[bool] |
None |
Verbosity level |
temperature |
float or tuple |
(0.0, 0.2, ..., 1.0) |
Sampling temperature(s). Tuple enables fallback strategy |
compression_ratio_threshold |
Optional[float] |
2.4 |
Reject segments with compression ratio above this (hallucination filter) |
logprob_threshold |
Optional[float] |
-1.0 |
Reject segments with avg log probability below this |
no_speech_threshold |
Optional[float] |
0.6 |
Silence detection threshold |
condition_on_previous_text |
bool |
True |
Use previous segment text as prompt context |
initial_prompt |
Optional[str] |
None |
Initial text prompt for the decoder |
word_timestamps |
bool |
False |
Extract word-level timestamps via cross-attention + DTW |
**decode_options |
— | — | Additional options: beam_size, best_of, patience, fp16, etc. |
| Option | Type | Description |
|---|---|---|
beam_size |
int |
Beam search width (default: greedy) |
best_of |
int |
Number of candidates for best-of-N sampling |
patience |
float |
Beam search length penalty |
fp16 |
bool |
Use float16 for inference (default: True) |
language |
str |
Language code |
task |
str |
"transcribe" or "translate" |
clip_timestamps |
str |
Comma-separated timestamp ranges to process |
from whisper_mlx import load_audio, log_mel_spectrogram
# Load audio file (resampled to 16kHz mono)
waveform = load_audio("audio.mp3") # Returns np.ndarray
# Compute mel spectrogram
mel = log_mel_spectrogram(waveform, n_mels=80)

from whisper_mlx import load_model, Whisper, ModelDimensions
# Load a model directly
model = load_model("mlx-community/whisper-turbo")
# Access model properties
print(model.dims) # ModelDimensions
print(model.is_multilingual) # True/False

from whisper_mlx import get_tokenizer, LANGUAGES
# Get tokenizer for a language
tokenizer = get_tokenizer(multilingual=True, language="en", task="transcribe")
# Encode/decode
tokens = tokenizer.encode("Hello world")
text = tokenizer.decode(tokens)
# Available languages
print(LANGUAGES) # {"en": "english", "zh": "chinese", ...}

Vayu supports 99 languages. Use the two-letter ISO code:
| Code | Language | Code | Language | Code | Language |
|---|---|---|---|---|---|
en |
English | zh |
Chinese | de |
German |
es |
Spanish | ru |
Russian | ko |
Korean |
fr |
French | ja |
Japanese | fa |
Persian |
pt |
Portuguese | tr |
Turkish | ar |
Arabic |
it |
Italian | pl |
Polish | hi |
Hindi |
Pass language=None for automatic language detection.