Commit e7e3e6d

Voxtral (#20970)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
1 parent 4ffd963 commit e7e3e6d

File tree: 14 files changed (+913 lines, -47 lines)

examples/offline_inference/audio_language.py

Lines changed: 75 additions & 10 deletions

@@ -10,7 +10,7 @@
 
 import os
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import Any, NamedTuple, Optional
 
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,7 +30,9 @@
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
-    prompt: str
+    prompt: Optional[str] = None
+    prompt_token_ids: Optional[dict[str, list[int]]] = None
+    multi_modal_data: Optional[dict[str, Any]] = None
     stop_token_ids: Optional[list[int]] = None
     lora_requests: Optional[list[LoRARequest]] = None
 
@@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
 # Unless specified, these settings have been tested to work on a single L4.
 
 
+# Voxtral
+def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
+    from mistral_common.audio import Audio
+    from mistral_common.protocol.instruct.messages import (
+        AudioChunk,
+        RawAudio,
+        TextChunk,
+        UserMessage,
+    )
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+
+    model_name = "mistralai/Voxtral-Mini-3B-2507"
+    tokenizer = MistralTokenizer.from_hf_hub(model_name)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"audio": audio_count},
+        config_format="mistral",
+        load_format="mistral",
+        tokenizer_mode="mistral",
+        enforce_eager=True,
+        enable_chunked_prefill=False,
+    )
+
+    text_chunk = TextChunk(text=question)
+    audios = [
+        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
+        for i in range(audio_count)
+    ]
+    audio_chunks = [
+        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
+    ]
+
+    messages = [UserMessage(content=[*audio_chunks, text_chunk])]
+
+    req = ChatCompletionRequest(messages=messages, model=model_name)
+
+    tokens = tokenizer.encode_chat_completion(req)
+    prompt_ids, audios = tokens.tokens, tokens.audios
+
+    audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
+
+    multi_modal_data = {"audio": audios_and_sr}
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt_token_ids=prompt_ids,
+        multi_modal_data=multi_modal_data,
+    )
+
+
 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
     # NOTE - the setting in this example are somehat different than what is
@@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
 
 
 model_example_map = {
+    "voxtral": run_voxtral,
     "granite_speech": run_granite_speech,
     "minicpmo": run_minicpmo,
     "phi4_mm": run_phi4mm,
@@ -311,16 +368,24 @@ def main(args):
         temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
     )
 
-    mm_data = {}
-    if audio_count > 0:
-        mm_data = {
-            "audio": [
-                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
-            ]
-        }
+    mm_data = req_data.multi_modal_data
+    if not mm_data:
+        mm_data = {}
+        if audio_count > 0:
+            mm_data = {
+                "audio": [
+                    asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
+                ]
+            }
 
     assert args.num_prompts > 0
-    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
+    inputs = {"multi_modal_data": mm_data}
+
+    if req_data.prompt:
+        inputs["prompt"] = req_data.prompt
+    else:
+        inputs["prompt_token_ids"] = req_data.prompt_token_ids
+
     if args.num_prompts > 1:
         # Batch inference
         inputs = [inputs] * args.num_prompts
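
Note: to see how the new fields are consumed outside the example script, below is a minimal offline sketch of the Voxtral flow, with the request passed to vLLM as prompt_token_ids plus multi_modal_data instead of a text prompt. The LLM()/generate() calls, the local WAV path, and the sampling settings are illustrative assumptions, not part of this diff.

# Sketch only: offline Voxtral request built with mistral_common and handed to
# vLLM as pre-tokenized input. The audio file and generation settings are assumed.
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
                                                        TextChunk, UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

from vllm import LLM, SamplingParams

model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)

# Hypothetical local audio file; anything mistral_common can decode works here.
audio = Audio.from_file("sample.wav", strict=False)
request = ChatCompletionRequest(
    messages=[
        UserMessage(content=[
            AudioChunk(input_audio=RawAudio.from_audio(audio)),
            TextChunk(text="What can you tell me about this audio?"),
        ])
    ],
    model=model_name,
)
tokens = tokenizer.encode_chat_completion(request)

llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral",
          load_format="mistral", max_model_len=8192, enforce_eager=True)
outputs = llm.generate(
    {
        "prompt_token_ids": tokens.tokens,
        "multi_modal_data": {
            "audio": [(a.audio_array, a.sampling_rate) for a in tokens.audios]
        },
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)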

requirements/common.txt

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata; python_version < '3.10'
-mistral_common[opencv] >= 1.6.2
+mistral_common[opencv] >= 1.8.0
 opencv-python-headless >= 4.11.0 # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12

requirements/nightly_torch_test.txt

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ jiwer # required for audio tests
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.6.2 # required for pixtral test
+mistral_common[opencv] >= 1.8.0 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test

requirements/test.in

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ torchvision==0.22.0
 transformers_stream_generator # required for qwen-vl test
 mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
-mistral_common[opencv] >= 1.7.0 # required for pixtral test
+mistral_common[opencv] >= 1.8.0 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test

requirements/test.txt

Lines changed: 7 additions & 1 deletion

@@ -305,7 +305,7 @@ mbstrdecoder==1.1.3
     #   typepy
 mdurl==0.1.2
     # via markdown-it-py
-mistral-common==1.7.0
+mistral-common==1.8.0
     # via -r requirements/test.in
 more-itertools==10.5.0
     # via lm-eval
@@ -518,6 +518,8 @@ pyasn1-modules==0.4.2
     # via google-auth
 pybind11==2.13.6
     # via lm-eval
+pycountry==24.6.1
+    # via pydantic-extra-types
 pycparser==2.22
     # via cffi
 pycryptodomex==3.22.0
@@ -528,9 +530,12 @@ pydantic==2.11.5
     #   datamodel-code-generator
     #   mistral-common
     #   mteb
+    #   pydantic-extra-types
     #   ray
 pydantic-core==2.33.2
     # via pydantic
+pydantic-extra-types==2.10.5
+    # via mistral-common
 pygments==2.18.0
     # via rich
 pyparsing==3.2.0
@@ -835,6 +840,7 @@ typing-extensions==4.12.2
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   pydantic-extra-types
     #   torch
     #   typer
     #   typing-inspection

setup.py

Lines changed: 2 additions & 1 deletion

@@ -692,7 +692,8 @@ def _read_requirements(filename: str) -> list[str]:
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
-        "audio": ["librosa", "soundfile"],  # Required for audio processing
+        "audio": ["librosa", "soundfile",
+                  "mistral_common[audio]"],  # Required for audio processing
        "video": []  # Kept for backwards compatibility
    },
    cmdclass=cmdclass,
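
Note: with mistral_common[audio] added to the audio extra, a quick way to confirm an environment has the optional audio dependencies is a guarded import. This is a sketch only; it assumes an installation along the lines of pip install "vllm[audio]".

# Sketch: guarded import of the optional audio dependencies declared in the
# "audio" extra. Assumes something like `pip install "vllm[audio]"` was run.
try:
    import librosa  # noqa: F401  audio decoding
    import soundfile  # noqa: F401  audio file IO
    from mistral_common.audio import Audio  # noqa: F401  Voxtral audio support
except ImportError as exc:
    raise SystemExit(f"Audio extras missing ({exc}); try: pip install 'vllm[audio]'")
print("audio extras available")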

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 23 additions & 5 deletions

@@ -17,6 +17,11 @@
 
 from ...utils import RemoteOpenAIServer
 
+MISTRAL_FORMAT_ARGS = [
+    "--tokenizer_mode", "mistral", "--config_format", "mistral",
+    "--load_format", "mistral"
+]
+
 
 @pytest.fixture
 def mary_had_lamb():
@@ -33,9 +38,18 @@ def winning_call():
 
 
 @pytest.mark.asyncio
-async def test_basic_audio(mary_had_lamb):
-    model_name = "openai/whisper-large-v3-turbo"
+@pytest.mark.parametrize(
+    "model_name",
+    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
+async def test_basic_audio(mary_had_lamb, model_name):
     server_args = ["--enforce-eager"]
+
+    if model_name.startswith("mistralai"):
+        server_args += MISTRAL_FORMAT_ARGS
+
+        # TODO(PATRICK) - REMOVE AFTER RELEASE
+        return  # skip for now
+
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
@@ -65,10 +79,13 @@ async def test_bad_requests(mary_had_lamb):
 
 
 @pytest.mark.asyncio
-async def test_long_audio_request(mary_had_lamb):
-    model_name = "openai/whisper-large-v3-turbo"
+@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
+async def test_long_audio_request(mary_had_lamb, model_name):
     server_args = ["--enforce-eager"]
 
+    if model_name.startswith("openai"):
+        return
+
     mary_had_lamb.seek(0)
     audio, sr = librosa.load(mary_had_lamb)
     # Add small silence after each audio for repeatability in the split process
@@ -87,7 +104,8 @@ async def test_long_audio_request(mary_had_lamb):
             response_format="text",
             temperature=0.0)
         out = json.loads(transcription)['text']
-        assert out.count("Mary had a little lamb") == 10
+        counts = out.count("Mary had a little lamb")
+        assert counts == 10, counts
 
 
 @pytest.mark.asyncio
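
Note: the parametrized test drives the OpenAI-compatible transcription endpoint through RemoteOpenAIServer. For reference, a standalone client call against an already-running server would look roughly like the sketch below; the base URL, API key, audio filename, and the serve command in the comment are placeholders, not taken from this diff.

# Sketch: standalone transcription request against a running vLLM server.
# Assumes a server was started along the lines of:
#   vllm serve mistralai/Voxtral-Mini-3B-2507 --tokenizer_mode mistral \
#       --config_format mistral --load_format mistral --enforce-eager
import asyncio

from openai import AsyncOpenAI


async def transcribe() -> None:
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    with open("mary_had_lamb.ogg", "rb") as audio_file:  # placeholder file
        transcription = await client.audio.transcriptions.create(
            model="mistralai/Voxtral-Mini-3B-2507",
            file=audio_file,
            language="en",
            response_format="text",
            temperature=0.0,
        )
    print(transcription)


asyncio.run(transcribe())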

tests/models/registry.py

Lines changed: 2 additions & 1 deletion

@@ -440,6 +440,7 @@ def check_available_online(
                        tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                        trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
+    "VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", is_available_online=False, tokenizer_mode="mistral"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 
     # [Cross-encoder]
@@ -513,4 +514,4 @@ def find_hf_info(self, model_id: str) -> _HfExamplesInfo:
         raise ValueError(f"No example model defined for {model_id}")
 
 
-HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
+HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 1 addition & 0 deletions

@@ -112,6 +112,7 @@ async def _preprocess_speech_to_text(
            prompt = self.model_cls.get_generation_prompt(
                audio=chunk,
                stt_config=self.asr_config,
+                model_config=self.model_config,
                language=lang,
                task_type=self.task_type,
                request_prompt=request.prompt)

vllm/model_executor/models/interfaces.py

Lines changed: 2 additions & 1 deletion

@@ -722,7 +722,8 @@ class SupportsTranscription(Protocol):
 
    @classmethod
    def get_generation_prompt(cls, audio: np.ndarray,
-                              stt_config: SpeechToTextConfig, language: str,
+                              stt_config: SpeechToTextConfig,
+                              model_config: ModelConfig, language: str,
                              task_type: str,
                              request_prompt: str) -> PromptType:
        """Get the prompt for the ASR model.
