@@ -10,6 +10,7 @@
 from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
 from ..utils import multi_gpu_test
 from .utils import check_logprobs_close
+from transformers import AutoModelForImageTextToText


 def check_implementation(
@@ -71,6 +72,27 @@ def test_models(
                          model_impl=model_impl)


+@pytest.mark.parametrize(
+    "model,model_impl",
+    [
+        ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"),  # dynamic image length and number of patches
+        ("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"),  # has col/row special tokens between patches
+        ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"),  # pixel values from the processor are not 4D or 5D arrays
+    ])  # no custom code support because custom models don't follow the standard yet!
+def test_models_multimodal(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: str,
+    model_impl: str,
+) -> None:
+    check_implementation(hf_runner,
+                         vllm_runner,
+                         example_prompts,
+                         model,
+                         model_impl=model_impl,
+                         kwargs_ref={"auto_cls": AutoModelForImageTextToText})
+
 def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
     prompts, _, _ = prep_prompts(4, (800, 801))
     kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
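
For context, a rough sketch of what the new `kwargs_ref={"auto_cls": AutoModelForImageTextToText}` argument presumably does inside `check_implementation`: it is forwarded to the HF reference runner so the HF side loads the image-text-to-text model class instead of a plain causal LM, and the two backends' greedy logprobs are then compared with `check_logprobs_close`. This is a sketch under assumptions, not the PR's actual helper; the function name, the runner method names, and the `max_tokens`/`num_logprobs` values are illustrative.

# Minimal sketch (assumed, not the PR's actual check_implementation).
from typing import Any, Optional

def check_implementation_sketch(
        runner_ref,        # reference runner, e.g. HfRunner
        runner_test,       # runner under test, e.g. VllmRunner
        example_prompts: list[str],
        model: str,
        kwargs_ref: Optional[dict[str, Any]] = None,
        **kwargs,          # e.g. model_impl="transformers" for the vLLM side
) -> None:
    max_tokens, num_logprobs = 32, 5  # assumed values

    # Backend under test: vLLM running the model via the Transformers backend.
    with runner_test(model, **kwargs) as vllm_model:
        outputs_test = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # Reference: HF runner; auto_cls selects the image-text-to-text class
    # instead of the default causal-LM class for these multimodal models.
    with runner_ref(model, **(kwargs_ref or {})) as hf_model:
        outputs_ref = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(outputs_0_lst=outputs_ref,
                         outputs_1_lst=outputs_test,
                         name_0="ref",
                         name_1="test")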