[Model] Support VLMs with transformers backend #13754

Closed
wants to merge 28 commits into from

28 commits
26a9f1b
tmp
zucchini-nlp Feb 19, 2025
a502988
dump
zucchini-nlp Feb 21, 2025
e0b534b
clean up
zucchini-nlp Feb 24, 2025
7e8f0d8
clean up 2
zucchini-nlp Feb 24, 2025
57c2d85
use arbitrary high resolution in dummy inputs
zucchini-nlp Feb 24, 2025
de54bbf
tmp
zucchini-nlp Mar 27, 2025
739216d
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp Apr 8, 2025
4b4f8b7
still ugly but works with latest processor update
zucchini-nlp Apr 9, 2025
c5aac3e
update
zucchini-nlp May 21, 2025
d26c81b
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp May 21, 2025
60300c4
fix issues
zucchini-nlp May 21, 2025
0c69ade
update
zucchini-nlp May 29, 2025
66a1a10
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp May 29, 2025
d36ab67
style
zucchini-nlp May 29, 2025
bf08a9e
need to update dummy builder after rebase
zucchini-nlp May 29, 2025
ba1143a
delet meta to device
zucchini-nlp May 29, 2025
267a57f
add tests
zucchini-nlp May 30, 2025
2c73f88
style
zucchini-nlp Jun 2, 2025
8c1f220
i dont get the style guidelines
zucchini-nlp Jun 2, 2025
8d5d67e
Update vllm/model_executor/models/transformers.py
zucchini-nlp Jun 3, 2025
be850dc
address some comments
zucchini-nlp Jun 3, 2025
e730323
forgot to add `@support_torch_compile` decorator
zucchini-nlp Jun 3, 2025
cfa1998
cant compile yet + clean up commented code
zucchini-nlp Jun 4, 2025
52bda05
fix param dtype
Isotr0py Jun 16, 2025
9aec5ac
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp Jun 17, 2025
6ef7b35
mention VLMs in the docs
zucchini-nlp Jun 17, 2025
d1e6d95
v0 backward compatibility
Isotr0py Jun 18, 2025
81fccb0
Merge remote-tracking branch upstream/main into vlm-transformers
zucchini-nlp Jul 2, 2025
22 changes: 20 additions & 2 deletions requirements/test.txt
@@ -27,6 +27,10 @@ argcomplete==3.5.1
# via datamodel-code-generator
arrow==1.3.0
# via isoduration
async-timeout==5.0.1
# via
# aiohttp
# redis
attrs==24.2.0
# via
# aiohttp
@@ -129,6 +133,11 @@ eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
exceptiongroup==1.3.0
# via
# anyio
# hypothesis
# pytest
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
@@ -640,7 +649,6 @@ setuptools==77.0.3
# via
# mamba-ssm
# pytablewriter
# torch
# triton
shellingham==1.5.4
# via typer
@@ -700,8 +708,13 @@ tokenizers==0.21.1
# via
# -r requirements/test.in
# transformers
toml==0.10.2
# via datamodel-code-generator
tomli==2.2.1
# via schemathesis
# via
# black
# pytest
# schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.7.0+cu128
@@ -775,13 +788,18 @@ types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2
# via
# anyio
# black
# exceptiongroup
# huggingface-hub
# librosa
# mistral-common
# mteb
# multidict
# pqdm
# pydantic
# pydantic-core
# rich
# torch
# typer
tzdata==2024.2
29 changes: 29 additions & 0 deletions tests/models/test_transformers.py
@@ -3,6 +3,7 @@
from typing import Any, Optional, Union

import pytest
from transformers import AutoModelForImageTextToText

from vllm.platforms import current_platform

@@ -71,6 +72,34 @@ def test_models(
model_impl=model_impl)


@pytest.mark.parametrize(
"model,model_impl",
[
("llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"transformers"), # dynamic image length and number of patches
("HuggingFaceTB/SmolVLM-256M-Instruct",
"transformers"), # has col/row special token between patches
("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"
), # pixel values from processor are not 4D or 5D arrays
]
) # no custom code support because custom models don't follow the standard yet!
def test_models_multimodal(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
example_prompts: list[str],
model: str,
model_impl: str,
) -> None:
check_implementation(
hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl,
kwargs_ref={"auto_cls": AutoModelForImageTextToText},
)


def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
prompts, _, _ = prep_prompts(4, (800, 801))
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
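Below is an illustrative usage sketch, not part of this PR's diff: it forces the Transformers fallback for one of the VLMs covered by the new test. It assumes vLLM's `model_impl="transformers"` engine option and the `multi_modal_data` input format behave as documented; the prompt string is only a rough approximation of SmolVLM's chat template.

# Illustrative only (not from this PR's diff): run a VLM via the Transformers backend.
# Assumptions: `model_impl="transformers"` and the `multi_modal_data` prompt format
# work as in current vLLM; the prompt below is a rough SmolVLM-style template.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="HuggingFaceTB/SmolVLM-256M-Instruct",
    model_impl="transformers",  # bypass any native vLLM implementation
    max_model_len=4096,
)

prompt = ("<|im_start|>User:<image>Describe this image briefly."
          "<end_of_utterance>\nAssistant:")
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": Image.open("example.jpg")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)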
6 changes: 3 additions & 3 deletions vllm/model_executor/model_loader/utils.py
@@ -164,7 +164,7 @@ def device_loading_context(module: torch.nn.Module,
def resolve_transformers_arch(model_config: ModelConfig,
architectures: list[str]):
for i, arch in enumerate(architectures):
if arch == "TransformersForCausalLM":
if arch in ["TransformersForCausalLM", "TransformersForMultimodalLM"]:
continue
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
@@ -200,7 +200,7 @@ def resolve_transformers_arch(model_config: ModelConfig,
raise ValueError(
f"The Transformers implementation of {arch} is not "
"compatible with vLLM.")
architectures[i] = "TransformersForCausalLM"
architectures[i] = "TransformersForMultimodalLM"
if model_config.model_impl == ModelImpl.AUTO:
if not model_module.is_backend_compatible():
raise ValueError(
@@ -211,7 +211,7 @@
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
architectures[i] = "TransformersForCausalLM"
architectures[i] = "TransformersForMultimodalLM"
return architectures


8 changes: 6 additions & 2 deletions vllm/model_executor/models/registry.py
@@ -185,6 +185,7 @@
"GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
# "InternVLForConditionalGeneration": ("internvl", "InternVLForConditionalGeneration"), # noqa: E501
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
@@ -230,6 +231,7 @@
}

_TRANSFORMERS_MODELS = {
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
}
# yapf: enable
@@ -455,15 +457,17 @@ def _normalize_archs(

# make sure Transformers backend is put at the last as a fallback
if len(normalized_arch) != len(architectures):
normalized_arch.append("TransformersForCausalLM")
# The order matters. If the CausalLM comes first, then checks for
# registered model in MultimodalRegistry fail
normalized_arch.extend(
["TransformersForMultimodalLM", "TransformersForCausalLM"])
return normalized_arch

def inspect_model_cls(
self,
architectures: Union[str, list[str]],
) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures)

for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
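For context, here is a self-contained toy sketch (not the actual vLLM code) of the fallback ordering this hunk introduces: architectures with no native vLLM implementation fall back to the Transformers backend, and TransformersForMultimodalLM has to be tried before TransformersForCausalLM so that the multimodal registry checks can succeed.

# Toy model of the fallback ordering added to _normalize_archs; the architecture
# names here are illustrative, not vLLM's real registry contents.
KNOWN_ARCHS = {"LlamaForCausalLM", "LlavaForConditionalGeneration"}

def normalize_archs(architectures: list[str]) -> list[str]:
    normalized = [arch for arch in architectures if arch in KNOWN_ARCHS]
    if len(normalized) != len(architectures):
        # Order matters: the multimodal wrapper is inspected first, and the
        # causal-LM wrapper is only used if that inspection fails.
        normalized.extend(
            ["TransformersForMultimodalLM", "TransformersForCausalLM"])
    return normalized

print(normalize_archs(["MyCustomVLMForConditionalGeneration"]))
# -> ['TransformersForMultimodalLM', 'TransformersForCausalLM']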