
Commit cfbcb9e

[Voxtral] Add more tests (#21010)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

Parent: 76ddeff

File tree: 4 files changed, +125 −8 lines

tests/conftest.py

Lines changed: 9 additions & 4 deletions
@@ -804,7 +804,7 @@
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor]],
+        prompts: Union[list[str], list[torch.Tensor], list[int]],
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
@@ -826,11 +826,16 @@ def get_inputs(
             if audios is not None and (audio := audios[i]) is not None:
                 multi_modal_data["audio"] = audio
 
-            text_prompt_kwargs = {
-                ("prompt" if isinstance(prompt, str) else "prompt_embeds"):
-                prompt,
+            text_prompt_kwargs: dict[str, Any] = {
                 "multi_modal_data": multi_modal_data or None
             }
+            if isinstance(prompt, str):
+                text_prompt_kwargs["prompt"] = prompt
+            elif isinstance(prompt, list):
+                text_prompt_kwargs["prompt_token_ids"] = prompt
+            else:
+                text_prompt_kwargs["prompt_embeds"] = prompt
+
             inputs.append(TextPrompt(**text_prompt_kwargs))
 
         return inputs
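
For illustration, here is a standalone sketch of the prompt-kwarg dispatch the hunk above introduces, outside the VllmRunner test harness. The helper name and the token ids are made up, but the branching mirrors the diff: strings map to "prompt", token-id lists to the new "prompt_token_ids", and anything else (e.g. an embeddings tensor) to "prompt_embeds".

# Hypothetical helper, not part of the diff; shows how the new branching
# chooses which TextPrompt keyword a given prompt ends up under.
from typing import Any, Union

def build_text_prompt_kwargs(prompt: Union[str, list[int], Any]) -> dict[str, Any]:
    kwargs: dict[str, Any] = {}
    if isinstance(prompt, str):
        kwargs["prompt"] = prompt            # plain text prompt
    elif isinstance(prompt, list):
        kwargs["prompt_token_ids"] = prompt  # pre-tokenized prompt (the new case)
    else:
        kwargs["prompt_embeds"] = prompt     # e.g. a torch.Tensor of embeddings
    return kwargs

assert build_text_prompt_kwargs("Describe the audio.") == {"prompt": "Describe the audio."}
assert build_text_prompt_kwargs([1, 17, 29]) == {"prompt_token_ids": [1, 17, 29]}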

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 0 additions & 3 deletions
@@ -47,9 +47,6 @@ async def test_basic_audio(mary_had_lamb, model_name):
     if model_name.startswith("mistralai"):
         server_args += MISTRAL_FORMAT_ARGS
 
-    # TODO(PATRICK) - REMOVE AFTER RELEASE
-    return  # skip for now
-
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
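
The re-enabled test above exercises the OpenAI-compatible transcription endpoint. As a rough sketch of the kind of request it issues, assuming a vLLM server is already running locally for the Voxtral model (the base URL and the audio file name below are placeholders; the test launches its own RemoteOpenAIServer instead):

import asyncio

import openai

async def transcribe() -> str:
    # Placeholder endpoint and credentials.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    with open("mary_had_lamb.wav", "rb") as f:  # placeholder audio file
        transcription = await client.audio.transcriptions.create(
            model="mistralai/Voxtral-Mini-3B-2507",
            file=f,
            language="en",
            temperature=0.0,
        )
    return transcription.text

if __name__ == "__main__":
    print(asyncio.run(transcribe()))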
New test file

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
                                                        TextChunk, UserMessage)

from vllm.transformers_utils.tokenizer import MistralTokenizer

from ....conftest import AudioTestAssets
from ....utils import RemoteOpenAIServer
from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test

MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode", "mistral", "--config_format", "mistral",
    "--load_format", "mistral"
]


@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
    args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": len(audio_assets)}),
    ] + MISTRAL_FORMAT_ARGS

    with RemoteOpenAIServer(MODEL_NAME,
                            args,
                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
                                      "30"}) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


def _get_prompt(audio_assets, question):
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)

    audios = [
        Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
        for i in range(len(audio_assets))
    ]
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
    ]

    text_chunk = TextChunk(text=question)
    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]

    return tokenizer.apply_chat_template(messages=messages)


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
                                     audio_assets: AudioTestAssets, dtype: str,
                                     max_tokens: int,
                                     num_logprobs: int) -> None:
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    run_multi_audio_test(
        vllm_runner,
        [(vllm_prompt, [audio.audio_and_sample_rate
                        for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )


@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with/without chunked prefill enabled."""

    def asset_to_chunk(asset):
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        audio_dict = AudioChunk.from_audio(audio).to_openai()
        return audio_dict

    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
    messages = [{
        "role":
        "user",
        "content": [
            *audio_chunks,
            {
                "type":
                "text",
                "text":
                f"What's happening in these {len(audio_assets)} audio clips?"
            },
        ],
    }]

    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10)

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
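
For context, test_online_serving sends standard OpenAI-style chat messages whose audio parts come from mistral_common's AudioChunk.to_openai(). A rough hand-built equivalent of that payload is sketched below; the file name is a placeholder and the exact dict emitted by to_openai() may differ in detail.

import base64

# "example.wav" is a placeholder; the test above uses vLLM's bundled audio
# assets and mistral_common instead of building the dict by hand.
with open("example.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

messages = [{
    "role": "user",
    "content": [
        # OpenAI-style audio content part (assumed shape).
        {
            "type": "input_audio",
            "input_audio": {"data": audio_b64, "format": "wav"},
        },
        {"type": "text", "text": "What's happening in this audio clip?"},
    ],
}]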

tests/models/registry.py

Lines changed: 1 addition & 1 deletion
@@ -440,7 +440,7 @@ def check_available_online(
                                          tokenizer="Isotr0py/Florence-2-tokenizer",  # noqa: E501
                                          trust_remote_code=True),  # noqa: E501
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
-    "VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", is_available_online=False, tokenizer_mode="mistral"),  # noqa: E501
+    "VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"),  # noqa: E501
     "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),  # noqa: E501
 
     # [Cross-encoder]
