
[Model] Support VLMs with transformers backend #13754


Closed
wants to merge 28 commits into from

28 commits
26a9f1b
tmp
zucchini-nlp Feb 19, 2025
a502988
dump
zucchini-nlp Feb 21, 2025
e0b534b
clean up
zucchini-nlp Feb 24, 2025
7e8f0d8
clean up 2
zucchini-nlp Feb 24, 2025
57c2d85
use arbitrary high resolution in dummy inputs
zucchini-nlp Feb 24, 2025
de54bbf
tmp
zucchini-nlp Mar 27, 2025
739216d
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp Apr 8, 2025
4b4f8b7
still ugly but works with latest processor update
zucchini-nlp Apr 9, 2025
c5aac3e
update
zucchini-nlp May 21, 2025
d26c81b
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp May 21, 2025
60300c4
fix issues
zucchini-nlp May 21, 2025
0c69ade
update
zucchini-nlp May 29, 2025
66a1a10
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp May 29, 2025
d36ab67
style
zucchini-nlp May 29, 2025
bf08a9e
need to update dummy builder after rebase
zucchini-nlp May 29, 2025
ba1143a
delet meta to device
zucchini-nlp May 29, 2025
267a57f
add tests
zucchini-nlp May 30, 2025
2c73f88
style
zucchini-nlp Jun 2, 2025
8c1f220
i dont get the style guidelines
zucchini-nlp Jun 2, 2025
8d5d67e
Update vllm/model_executor/models/transformers.py
zucchini-nlp Jun 3, 2025
be850dc
address some comments
zucchini-nlp Jun 3, 2025
e730323
forgot to add `@support_torch_compile` decorator
zucchini-nlp Jun 3, 2025
cfa1998
cant compile yet + clean up commented code
zucchini-nlp Jun 4, 2025
52bda05
fix param dtype
Isotr0py Jun 16, 2025
9aec5ac
Merge remote-tracking branch 'upstream/main' into vlm-transformers
zucchini-nlp Jun 17, 2025
6ef7b35
mention VLMs in the docs
zucchini-nlp Jun 17, 2025
d1e6d95
v0 backward compatibility
Isotr0py Jun 18, 2025
81fccb0
Merge remote-tracking branch upstream/main into vlm-transformers
zucchini-nlp Jul 2, 2025
11 changes: 7 additions & 4 deletions docs/models/supported_models.md
@@ -21,7 +21,7 @@ These models are what we list in [supported-text-models][supported-text-models]

### Transformers

vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported!

To check if the modeling backend is Transformers, you can simply do this:

@@ -31,14 +31,17 @@ llm = LLM(model=..., task="generate") # Name or path of your model
llm.apply_model(lambda model: print(type(model)))
```

If it is `TransformersForCausalLM` then it means it's based on Transformers!
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!

!!! tip
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][serving-openai-compatible-server].
You can force the use of the Transformers backend by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][serving-openai-compatible-server].

!!! note
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.

!!! note
In the case of vision language models, if you load with `dtype="auto"`, vLLM loads the whole model using the config's `dtype` if it exists. In contrast, native Transformers respects the `dtype` attribute of each backbone in the model, which might cause a slight difference in performance.
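
For example, here is a minimal offline-inference sketch (the model name is just an illustrative choice) that forces the Transformers backend and pins the dtype explicitly instead of relying on `"auto"`:

```python
from vllm import LLM

llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # example VLM
    model_impl="transformers",  # force the Transformers backend
    dtype="bfloat16",           # pin the dtype instead of "auto"
)
llm.apply_model(lambda model: print(type(model)))  # TransformersForMultimodalLM
```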

#### Custom models

If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM!
@@ -102,7 +105,7 @@ Here is what happens in the background when this model is loaded:

1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.

That's it!
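
As a rough pre-flight check for a custom model, you can mirror steps 2 and 3 yourself (a sketch only; the repo id is a placeholder and the `auto_map` key may differ for your model):

```python
from transformers import AutoConfig
from transformers.dynamic_module_utils import get_class_from_dynamic_module

repo = "your-org/your-custom-model"  # placeholder repo id
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)

# Resolve the modeling class from the config's auto_map, then ask it whether
# it can run on vLLM's Transformers backend.
class_ref = config.auto_map["AutoModelForCausalLM"]  # key depends on the model
model_cls = get_class_from_dynamic_module(class_ref, repo)
print(model_cls.is_backend_compatible())
```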

22 changes: 20 additions & 2 deletions requirements/test.txt
@@ -31,6 +31,10 @@ argcomplete==3.5.1
# via datamodel-code-generator
arrow==1.3.0
# via isoduration
async-timeout==5.0.1
# via
# aiohttp
# redis
attrs==24.2.0
# via
# aiohttp
@@ -141,6 +145,11 @@ eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
exceptiongroup==1.3.0
# via
# anyio
# hypothesis
# pytest
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
@@ -690,7 +699,6 @@ setuptools==77.0.3
# via
# mamba-ssm
# pytablewriter
# torch
# triton
shellingham==1.5.4
# via typer
@@ -753,8 +761,13 @@ tokenizers==0.21.1
# via
# -r requirements/test.in
# transformers
toml==0.10.2
# via datamodel-code-generator
tomli==2.2.1
# via schemathesis
# via
# black
# pytest
# schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.7.0+cu128
@@ -828,13 +841,18 @@ types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2
# via
# anyio
# black
# exceptiongroup
# huggingface-hub
# librosa
# mistral-common
# mteb
# multidict
# pqdm
# pydantic
# pydantic-core
# rich
# torch
# typer
# typing-inspection
31 changes: 31 additions & 0 deletions tests/models/test_transformers.py
@@ -4,6 +4,7 @@
from typing import Any, Optional, Union

import pytest
from transformers import AutoModelForImageTextToText

from vllm.platforms import current_platform

@@ -72,6 +73,36 @@ def test_models(
model_impl=model_impl)


@pytest.mark.parametrize(
"model,model_impl",
[
# Dynamic image length and number of patches
("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"),
# Has col/row special token between patches
("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"),
# Pixel values from processor are not 4D or 5D arrays
("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"),
# Check "auto" with fallback to transformers
("BAAI/Emu3-Chat-hf", "auto"),
]
) # no custom code support because custom models don't follow the standard yet!
def test_models_multimodal(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
example_prompts: list[str],
model: str,
model_impl: str,
) -> None:
check_implementation(
hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl,
kwargs_ref={"auto_cls": AutoModelForImageTextToText},
)


def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
prompts, _, _ = prep_prompts(4, (800, 801))
kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
18 changes: 15 additions & 3 deletions vllm/model_executor/model_loader/utils.py
@@ -170,7 +170,7 @@ def device_loading_context(module: torch.nn.Module,
def resolve_transformers_arch(model_config: ModelConfig,
architectures: list[str]):
for i, arch in enumerate(architectures):
if arch == "TransformersForCausalLM":
if arch in ["TransformersForCausalLM", "TransformersForMultimodalLM"]:
continue
auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map",
None) or dict()
@@ -206,7 +206,13 @@ def resolve_transformers_arch(
raise ValueError(
f"The Transformers implementation of {arch} is not "
"compatible with vLLM.")
architectures[i] = "TransformersForCausalLM"
# Check if the text config is `self`. If not, it is most likely
# a composite config, i.e. multimodal
if model_config.hf_config.get_text_config(
) != model_config.hf_config:
architectures[i] = "TransformersForMultimodalLM"
else:
architectures[i] = "TransformersForCausalLM"
if model_config.model_impl == ModelImpl.AUTO:
if not model_module.is_backend_compatible():
raise ValueError(
@@ -217,7 +223,13 @@
"%s has no vLLM implementation, falling back to Transformers "
"implementation. Some features may not be supported and "
"performance may not be optimal.", arch)
architectures[i] = "TransformersForCausalLM"
# Check if the text config is `self`. If not, it is most likely
# a composite config, i.e. multimodal
if model_config.hf_config.get_text_config(
) != model_config.hf_config:
architectures[i] = "TransformersForMultimodalLM"
else:
architectures[i] = "TransformersForCausalLM"
return architectures


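For reference, a small illustration of the heuristic used above (model names are just examples): `get_text_config()` returns the config itself for a plain text model, but the nested text config for a composite, i.e. multimodal, one.

```python
from transformers import AutoConfig

text_cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
mm_cfg = AutoConfig.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

# Text-only config: get_text_config() is the config itself.
print(text_cfg.get_text_config() is text_cfg)  # True  -> TransformersForCausalLM
# Composite (multimodal) config: get_text_config() is the nested text config.
print(mm_cfg.get_text_config() is mm_cfg)      # False -> TransformersForMultimodalLM
```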
7 changes: 5 additions & 2 deletions vllm/model_executor/models/registry.py
@@ -244,6 +244,7 @@
}

_TRANSFORMERS_MODELS = {
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
}
# yapf: enable
@@ -469,15 +470,17 @@ def _normalize_archs(

# make sure Transformers backend is put at the last as a fallback
if len(normalized_arch) != len(architectures):
normalized_arch.append("TransformersForCausalLM")
# The order matters. If CausalLM comes first, the check for a
# registered model in the MultimodalRegistry fails
normalized_arch.extend(
["TransformersForMultimodalLM", "TransformersForCausalLM"])
return normalized_arch

def inspect_model_cls(
self,
architectures: Union[str, list[str]],
) -> tuple[_ModelInfo, str]:
architectures = self._normalize_archs(architectures)

for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None: