
Commit be850dc

address some comments
Signed-off-by: raushan <raushan@huggingface.co>
1 parent 8d5d67e commit be850dc

4 files changed: +29 -15 lines

tests/models/test_transformers.py

Lines changed: 8 additions & 6 deletions
@@ -75,12 +75,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model,model_impl",
     [
-        ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-         "transformers"),  # dynamic image length and number of patches
-        ("HuggingFaceTB/SmolVLM-256M-Instruct",
-         "transformers"),  # has col/row special token between patches
-        ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"
-         ),  # pixel values from processor are not 4D or 5D arraya
+        # Dynamic image length and number of patches
+        ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"),
+        # Has col/row special token between patches
+        ("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"),
+        # Pixel values from processor are not 4D or 5D arrays
+        ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"),
+        # Check "auto" with fallback to transformers
+        ("BAAI/Emu3-Chat-hf", "auto"),
     ]
 )  # no custom code support because custom models don't follow the standard yet!
 def test_models_multimodal(
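For context, a minimal sketch (not part of this commit) of how the newly added "auto" entry can be exercised outside the test harness via vLLM's model_impl switch; the prompt and sampling settings are illustrative assumptions only:

from vllm import LLM, SamplingParams

# "auto" lets vLLM pick a native implementation if one exists and fall back
# to the generic Transformers backend otherwise (the case tested above).
llm = LLM(model="BAAI/Emu3-Chat-hf", model_impl="auto")

# Plain-text smoke test; the parametrized test instead feeds images through
# the HF processor path.
outputs = llm.generate(["Describe a cat."], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)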

vllm/model_executor/model_loader/utils.py

Lines changed: 14 additions & 2 deletions
@@ -200,7 +200,13 @@ def resolve_transformers_arch(model_config: ModelConfig,
                 raise ValueError(
                     f"The Transformers implementation of {arch} is not "
                     "compatible with vLLM.")
-            architectures[i] = "TransformersForMultimodalLM"
+            # Check if text-config is `self`. If not, it is most probably
+            # a composite config, i.e. multimodal
+            if model_config.hf_config.get_text_config(
+            ) != model_config.hf_config:
+                architectures[i] = "TransformersForMultimodalLM"
+            else:
+                architectures[i] = "TransformersForCausalLM"
         if model_config.model_impl == ModelImpl.AUTO:
             if not model_module.is_backend_compatible():
                 raise ValueError(
@@ -211,7 +217,13 @@ def resolve_transformers_arch(model_config: ModelConfig,
                 "%s has no vLLM implementation, falling back to Transformers "
                 "implementation. Some features may not be supported and "
                 "performance may not be optimal.", arch)
-            architectures[i] = "TransformersForMultimodalLM"
+            # Check if text-config is `self`. If not, it is most probably
+            # a composite config, i.e. multimodal
+            if model_config.hf_config.get_text_config(
+            ) != model_config.hf_config:
+                architectures[i] = "TransformersForMultimodalLM"
+            else:
+                architectures[i] = "TransformersForCausalLM"
     return architectures
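The new branch relies on PretrainedConfig.get_text_config(): a text-only model's config returns itself, while a composite (multimodal) config returns its nested text config. A standalone sketch of that check, with the model names below chosen only as examples:

from transformers import AutoConfig

def is_multimodal(model_name: str) -> bool:
    config = AutoConfig.from_pretrained(model_name)
    # Composite configs (e.g. vision-language models) nest a separate
    # text config; text-only configs return themselves.
    return config.get_text_config() is not config

print(is_multimodal("facebook/opt-125m"))                           # False
print(is_multimodal("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"))   # True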

vllm/model_executor/models/transformers.py

Lines changed: 5 additions & 5 deletions
@@ -318,11 +318,11 @@ def apply(
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
 
         (prompt_ids, processed_data,
-            mm_token_type_ids) = self._apply_hf_processor_text_mm(
-                prompt_text=prompt,
-                mm_items=mm_items,
-                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
-            )
+         mm_token_type_ids) = self._apply_hf_processor_text_mm(
+             prompt_text=prompt,
+             mm_items=mm_items,
+             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+         )
 
         # HF processor will return `mm_token_type_ids` from which
         # we can infer mm_placeholders. Until then hardcode to make code run

vllm/v1/engine/mm_input_cache.py

Lines changed: 2 additions & 2 deletions
@@ -34,8 +34,8 @@ class MirroredProcessingCache:
 
     def __init__(self, model_config):
         mm_config = model_config.multimodal_config
-        disable_mm_preprocessor_cache = (
-            mm_config is not None and mm_config.disable_mm_preprocessor_cache)
+        disable_mm_preprocessor_cache = mm_config is not None and \
+            not mm_config.disable_mm_preprocessor_cache
         self.use_cache = not disable_mm_preprocessor_cache
         self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
                                                       MultiModalKwargs)
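As a quick reference, a standalone sketch (using SimpleNamespace as a stand-in for the multimodal config object, which is an assumption for illustration) of how the rewritten expression resolves use_cache for the possible config states:

from types import SimpleNamespace

def resolve_use_cache(mm_config):
    # Mirrors the expression above: the flag is set only when a multimodal
    # config exists and its disable_mm_preprocessor_cache field is False.
    disable_mm_preprocessor_cache = mm_config is not None and \
        not mm_config.disable_mm_preprocessor_cache
    return not disable_mm_preprocessor_cache

print(resolve_use_cache(None))                                                  # True
print(resolve_use_cache(SimpleNamespace(disable_mm_preprocessor_cache=True)))   # True
print(resolve_use_cache(SimpleNamespace(disable_mm_preprocessor_cache=False)))  # False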
