41 | 41 | from vllm.model_executor.model_loader.weight_utils import default_weight_loader
42 | 42 | from vllm.model_executor.sampling_metadata import SamplingMetadata
43 | 43 | from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
44 |    | -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalInputs,
45 |    | -    PlaceholderRange, MultiModalDataDict)
   | 44 | +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
   | 45 | +    MultiModalInputs, PlaceholderRange)
46 | 46 | from vllm.multimodal.parse import ImageProcessorItems
47 | 47 | from vllm.multimodal.processing import (BaseMultiModalProcessor,
48 | 48 |     BaseProcessingInfo)
49 |    | -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
   | 49 | +from vllm.multimodal.profiling import BaseDummyInputsBuilder
50 | 50 | from vllm.sequence import IntermediateTensors
51 | 51 | from vllm.transformers_utils.processor import cached_get_processor
52 | 52 |
@@ -124,8 +124,9 @@ def replace_linear_class(
124 | 124 | @contextmanager
125 | 125 | def init_on_device_without_buffers(device: torch.device):
126 | 126 | """
127 |     | - A context manager under which models are initialized with all parameters on the specified device.
128 |     | - However buffers are not initialized on specified device.
    | 127 | + A context manager under which models are initialized with all
    | 128 | + parameters on the specified device. However buffers are not
    | 129 | + initialized on specified device.
129 | 130 |
130 | 131 | Args:
131 | 132 | device (`torch.device`):
@@ -162,8 +163,7 @@ def wrapper(*args, **kwargs):
162 | 163 | yield
163 | 164 | finally:
164 | 165 | nn.Module.register_parameter = old_register_parameter
165 |     | - for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(
166 |     | - ):
    | 166 | + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
167 | 167 | setattr(torch, torch_function_name, old_torch_function)
168 | 168 |
169 | 169 |
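For context on the hunks above: `init_on_device_without_buffers` is a context manager that temporarily patches `nn.Module.register_parameter` and a set of torch tensor constructors (restored in the `finally` block shown here) so that parameters are created on the requested device while buffers are left untouched. A minimal sketch of the same idea, covering only the parameter half and not the actual vLLM implementation:

```python
# Simplified sketch (not the exact vLLM code): create parameters on a
# target device, and always undo the monkey-patch on exit.
from contextlib import contextmanager

import torch
import torch.nn as nn


@contextmanager
def init_params_on_device(device: torch.device):
    old_register_parameter = nn.Module.register_parameter

    def register_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            # Re-create the parameter on `device`; buffers are never touched.
            module._parameters[name] = nn.Parameter(
                module._parameters[name].to(device),
                requires_grad=param.requires_grad)

    try:
        nn.Module.register_parameter = register_parameter
        yield
    finally:
        # Mirrors the `finally` block in the diff: restore the original hook.
        nn.Module.register_parameter = old_register_parameter


with init_params_on_device(torch.device("meta")):
    layer = nn.Linear(8, 8)
print(layer.weight.device)  # meta
```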
@@ -216,7 +216,7 @@ def get_dummy_mm_data(
216 | 216 |
217 | 217 | target_width, target_height = self.info.get_max_image_size()
218 | 218 |
219 |     | - return {
    | 219 | +        return {
220 | 220 | "image":
221 | 221 | self._get_dummy_images(width=target_width,
222 | 222 | height=target_height,
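The `get_dummy_mm_data` hunk above returns placeholder images used only for memory profiling. Very roughly, and setting aside vLLM's own `self._get_dummy_images` helper, the dummy data amounts to something like the sketch below (`make_dummy_images` and the sizes are illustrative, not part of the diff):

```python
# Illustration only: blank RGB images at the maximum supported resolution,
# one per image slot, standing in for real user images during profiling.
from PIL import Image


def make_dummy_images(width: int, height: int, count: int) -> list[Image.Image]:
    return [Image.new("RGB", (width, height), color=0) for _ in range(count)]


# e.g. {"image": [<PIL.Image 336x336>, <PIL.Image 336x336>]}
mm_data = {"image": make_dummy_images(width=336, height=336, count=2)}
```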
@@ -253,13 +253,11 @@ def _get_mm_fields_config(
253 | 253 | hf_processor_mm_kwargs,
254 | 254 | num_image_patches: torch.Tensor = None,
255 | 255 | ):
256 |     | - hf_inputs.pop(
257 |     | - "attention_mask",
258 |     | - None)  # processors always return a mask but vLLM doesn't need it
    | 256 | + # HF Processors always return a mask but vLLM doesn't need it
    | 257 | + hf_inputs.pop("attention_mask", None)
259 | 258 | mm_fields = {
260 |     | - key: MultiModalFieldConfig.flat_from_sizes("image",
261 |     | - num_image_patches)
262 |     | - for key in hf_inputs.keys()
    | 259 | + key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
    | 260 | + for key in hf_inputs
263 | 261 | }
264 | 262 | mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
265 | 263 | "image", num_image_patches)
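In this hunk every key coming out of the HF processor gets the same "flat per-image" field spec, keyed by `num_image_patches`. Pulled out of the fragmented diff into one self-contained snippet (the key name and patch counts are made up; `MultiModalFieldConfig.flat_from_sizes` is the call the diff itself uses):

```python
import torch

from vllm.multimodal.inputs import MultiModalFieldConfig

# Pretend HF processor outputs for two images with 4 and 9 patches,
# concatenated along dim 0 into a single flat tensor.
num_image_patches = torch.tensor([4, 9])
hf_inputs = {"pixel_values": torch.randn(13, 3, 14, 14)}

mm_fields = {
    key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
    for key in hf_inputs
}
# Precomputed image embeddings follow the same per-image layout.
mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
    "image", num_image_patches)
```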
@@ -311,13 +309,17 @@ def apply(
311 | 309 | """
312 | 310 | if return_mm_hashes:
313 | 311 | raise ValueError(
314 |     | - "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set "
315 |     | - "`disable_mm_preprocessor_cache=True`.")
    | 312 | + "TransformersMultimodalLM doesn't support mm hashing yet! "
    | 313 | + "Probably you did not set `disable_mm_preprocessor_cache=True`")
316 | 314 |
317 | 315 | mm_items = self._to_mm_items(mm_data)
318 | 316 | hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
319 | 317 |
320 |     | - prompt_ids, processed_data, mm_token_type_ids = self._apply_hf_processor_text_mm(
    | 318 | + (
    | 319 | + prompt_ids,
    | 320 | + processed_data,
    | 321 | + mm_token_type_ids
    | 322 | + ) = self._apply_hf_processor_text_mm(
321 | 323 | prompt_text=prompt,
322 | 324 | mm_items=mm_items,
323 | 325 | hf_processor_mm_kwargs=hf_processor_mm_kwargs,
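The reworded error message above points users at the `disable_mm_preprocessor_cache` engine option. If that hint applies to your setup, turning the cache off when constructing the engine might look like the following sketch (the model name is a placeholder, and `model_impl="transformers"` assumes the Transformers fallback backend this file implements):

```python
from vllm import LLM

# Sketch: disable the multimodal preprocessor cache, as the error message
# suggests, when running a multimodal model through the Transformers backend.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",   # placeholder model
    model_impl="transformers",
    disable_mm_preprocessor_cache=True,
)
```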
@@ -435,7 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
435 | 437 | config_override = ConfigOverride(
436 | 438 | config, sliding_window=config.interleaved_sliding_window)
437 | 439 |
438 |     | - # Set correct attn impl and init on "meta" to delay allocating GPU tensors
    | 440 | + # Set correct attn and init on "meta" to delay allocating GPU tensors
439 | 441 | self.text_config._attn_implementation = "vllm"
440 | 442 | with init_on_device_without_buffers("meta"):
441 | 443 | # FIXME(Isotr0py): We need to refactor this part in the future to
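The comment edited above refers to building the model skeleton on the `meta` device so no real memory is allocated until weights are loaded. As a generic PyTorch illustration of that idea (not vLLM's actual loading path):

```python
import torch
import torch.nn as nn

# Build the module structure on "meta": shapes exist, storage does not.
with torch.device("meta"):
    block = nn.Linear(4096, 4096)

# Materialize storage only when it is actually needed ("cuda" in practice;
# "cpu" here so the sketch runs anywhere).
block = block.to_empty(device="cpu")
print(block.weight.shape, block.weight.device)
```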
@@ -870,9 +872,9 @@ def get_multimodal_embeddings(self, **kwargs):
870 | 872 | if vision_embeddings.ndim == 2:
871 | 873 | vision_embeddings = vision_embeddings.unsqueeze(0)
872 | 874 |
873 |     | - # Embeddings have to be 2D tensors of length `num_images` but transformers
874 |     | - # returns concat tensors if each patch is of different size. We split it back
875 |     | - # to make vLLM assertions happy
    | 875 | + # Embeddings have to be 2D tensors of length `num_images`
    | 876 | + # but transformers returns concat tensors if each patch
    | 877 | + # is of different size. We split it back to make vLLM happy
876 | 878 | vision_embeddings = torch.split(vision_embeddings,
877 | 879 | num_image_patches.tolist())
878 | 880 | vision_embeddings = [
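For the comment rewritten in this last hunk, the shapes work out roughly as follows (toy numbers, not vLLM code): transformers hands back a single tensor with the patch embeddings of all images concatenated, and the split restores a list with one 2D tensor per image, which is the layout vLLM expects.

```python
import torch

hidden_size = 64
num_image_patches = torch.tensor([4, 9])

# Concatenated output: (total_patches, hidden_size)
vision_embeddings = torch.randn(int(num_image_patches.sum()), hidden_size)

# Back to a list of length `num_images`, one 2D tensor per image.
per_image = torch.split(vision_embeddings, num_image_patches.tolist())
print(len(per_image), per_image[0].shape, per_image[1].shape)
# 2 torch.Size([4, 64]) torch.Size([9, 64])
```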