From 64f2e9cf4d90aec29b1b902f7c470577b870a7d6 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Jul 2025 11:55:54 +0800 Subject: [PATCH 1/9] bump transformers version Signed-off-by: Isotr0py <2037008807@qq.com> --- requirements/test.in | 2 +- requirements/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 5f8b97a0e34..57e31c9af9b 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.52.4 +transformers==4.53.1 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index f6f599df758..0ed9141ab45 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -800,7 +800,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.52.4 +transformers==4.53.1 # via # -r requirements/test.in # genai-perf From 785fcac256bb4cb74291fd39d2a2d0167f4ed61e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Jul 2025 15:22:34 +0800 Subject: [PATCH 2/9] fix whisper qwen2.5-omni, and disable glm4.1v test due to oom Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/multimodal/generation/test_common.py | 4 ++-- vllm/inputs/registry.py | 8 +------- vllm/model_executor/models/qwen2_5_omni_thinker.py | 10 +++++++++- vllm/model_executor/models/whisper.py | 9 ++++++++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index cbc2e9c87a6..4ac90755c3b 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -317,6 +317,7 @@ num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], auto_cls=AutoModelForImageTextToText, + marks=[large_gpu_mark(min_gb=32)], ), "glm4_1v-video": VLMTestInfo( models=["THUDM/GLM-4.1V-9B-Thinking"], @@ -330,8 +331,7 @@ inputs=custom_inputs.video_with_metadata_glm4_1v(), limit_mm_per_prompt={"video": 1}, )], - # This is needed to run on machine with 24GB VRAM - vllm_runner_kwargs={"gpu_memory_utilization": 0.95}, + marks=[large_gpu_mark(min_gb=32)], ), "h2ovl": VLMTestInfo( models = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index fc6e190e548..66e78833f52 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -5,9 +5,7 @@ from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union import torch -from packaging.version import Version from transformers import BatchFeature, PretrainedConfig, ProcessorMixin -from transformers import __version__ as TRANSFORMERS_VERSION from typing_extensions import TypeVar from vllm.jsontree import JSONTree, json_map_leaves @@ -130,13 +128,9 @@ def get_hf_processor( /, **kwargs: object, ) -> _P: - # Transformers 4.53.0 has issue with passing tokenizer to - # initialize processor. We disable it for this version. 
- # See: https://github.com/vllm-project/vllm/issues/20224 - if Version(TRANSFORMERS_VERSION) != Version("4.53.0"): - kwargs["tokenizer"] = self.tokenizer return super().get_hf_processor( typ, + tokenizer=self.tokenizer, **kwargs, ) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 377a34f2088..c5a5c10d950 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -144,8 +144,16 @@ def get_hf_processor( ) -> Qwen2_5OmniProcessor: if fps is not None: kwargs["fps"] = fps + + # Monkey patch for Transformers v4.53 + processor_class = Qwen2_5OmniProcessor + if processor_class.image_processor_class != "AutoImageProcessor": + processor_class.image_processor_class = "AutoImageProcessor" + if processor_class.video_processor_class != "AutoVideoProcessor": + processor_class.video_processor_class = "AutoVideoProcessor" + processor = self.ctx.get_hf_processor( - Qwen2_5OmniProcessor, + processor_class, image_processor=self.get_image_processor(min_pixels=min_pixels, max_pixels=max_pixels, size=size, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 344d6fc8f45..ee1cfd7d713 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -634,7 +634,14 @@ def get_hf_config(self) -> WhisperConfig: def get_hf_processor(self, sampling_rate: Optional[int] = None ) -> WhisperProcessor: - return self.ctx.get_hf_processor(WhisperProcessor) + # HACK: Transformers 4.53.0 has issue with whisper tokenizer to + # initialize processor. We use a monkeypatch to fix it here. + # See: https://github.com/vllm-project/vllm/issues/20224 + processor_class = WhisperProcessor + tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast") + if processor_class.tokenizer_class != tokenizer_class: + processor_class.tokenizer_class = tokenizer_class + return self.ctx.get_hf_processor(processor_class) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": 1} From 5da1146317b12d644cf5a2750c380ac72f188238 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 7 Jul 2025 15:44:55 +0800 Subject: [PATCH 3/9] fix minicpmo Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/minicpmo.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 71593d4bb89..4e4fc3d5c76 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -30,8 +30,10 @@ from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.whisper.modeling_whisper import ( - ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder) +from transformers.models.whisper.modeling_whisper import (ACT2FN, + WhisperAttention, + WhisperConfig, + WhisperEncoder) from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig @@ -378,14 +380,13 @@ class MiniCPMWhisperEncoderLayer(nn.Module): def __init__(self, config: WhisperConfig, layer_idx: int): super().__init__() self.embed_dim = config.d_model - self.self_attn = WHISPER_ATTENTION_CLASSES[ - config._attn_implementation]( - embed_dim=self.embed_dim, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - config=config, 
- layer_idx=layer_idx, - ) + self.self_attn = WhisperAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + layer_idx=layer_idx, + ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] From 77169afb08f396d85d8d1f7835abb54d29cbeff2 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 8 Jul 2025 01:39:50 +0800 Subject: [PATCH 4/9] fix fuyu Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 26c8f80d5a0..56edaaf99b8 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -175,12 +175,21 @@ def _call_hf_processor( # Original output: (1, num_images, Pn, Px * Py * C) # New output: (num_images, Pn, Px * Py * C) - assert (isinstance(image_patches, list) - and len(image_patches) == 1) - assert (isinstance(image_patches[0], torch.Tensor) - and len(image_patches[0]) == len(images)) - - processed_outputs["image_patches"] = image_patches[0] + # image_patches is a list with shape: + # (1, num_images, Pn, Px * Py * C) + # before Transformers 4.53 + if isinstance(image_patches, list): + assert len(image_patches) == 1 + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + processed_outputs["image_patches"] = image_patches[0] + # image_patches is a tensor with shape: + # (num_images, Pn, Px * Py * C) + # after Transformers 4.53 + elif isinstance(image_patches, torch.Tensor): + assert len(image_patches) == len(images) + else: + raise AssertionError("This line should be unreachable.") return processed_outputs From 88bb93ca06a6c4bab6a98ad61328b269b1baf411 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 8 Jul 2025 11:44:15 +0800 Subject: [PATCH 5/9] fix fuyu processing Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/fuyu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 56edaaf99b8..558d4fbb4de 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -202,8 +202,10 @@ def _apply_hf_processor_tokens_only( vocab = tokenizer.get_vocab() boa_token_id = vocab["<0x04>"] + if prompt_tokens[-1] != boa_token_id: + prompt_tokens.append(boa_token_id) - return prompt_tokens + [boa_token_id] + return prompt_tokens def _get_mm_fields_config( self, From 300029e549e359d0d09d7ec8f90f95987010568d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 10 Jul 2025 22:17:27 +0800 Subject: [PATCH 6/9] fix paligemma and glm4.1v Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/multimodal/processing/test_common.py | 1 + tests/models/test_initialization.py | 3 ++- vllm/model_executor/models/paligemma.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0f33225eda2..ab21941fae9 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -159,6 +159,7 @@ def _test_processing_correctness( _ADD_SPECIAL_TOKENS_OVERRIDES = { "mllama": False, "ovis": False, + "paligemma": False, 
"ultravox": False, "whisper": False, } diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 25bc96bf326..38b207073eb 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -22,7 +22,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): model_info.check_transformers_version(on_fail="skip") # FIXME: Possible memory leak in the previous tests? - if model_arch in ("GraniteSpeechForConditionalGeneration", + if model_arch in ("Glm4vForConditionalGeneration", + "GraniteSpeechForConditionalGeneration", "KimiVLForConditionalGeneration"): pytest.skip("Avoid OOM") diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 77197abe571..b1f2e53b0c7 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -125,7 +125,7 @@ def _call_hf_processor( ) -> BatchFeature: tokenizer = self.info.get_tokenizer() if not mm_data: - prompt_ids = tokenizer.encode(prompt) + prompt_ids = tokenizer.encode(prompt, add_special_tokens=False) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") return super()._call_hf_processor( From 42d517bac8ed02c24237a4daf05ab070824ec442 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 10 Jul 2025 23:19:51 +0800 Subject: [PATCH 7/9] fix gemma3n test Signed-off-by: Isotr0py --- tests/models/test_initialization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b042a6b6423..07ded1e5880 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -47,9 +47,14 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: n_group = getattr(text_config, 'n_group', None) num_experts = n_group * 2 if n_group is not None else 2 + # we use three layers for Gemma-3n to check + # both normal layer and kv_shared_layer + num_hidden_layers = (3 if model_arch + == "Gemma3nForConditionalGeneration" else 1) + text_config.update({ "num_layers": 1, - "num_hidden_layers": 1, + "num_hidden_layers": num_hidden_layers, "num_experts": num_experts, "num_experts_per_tok": 2, "num_local_experts": num_experts, @@ -57,6 +62,8 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: "first_k_dense_replace": 0, # To avoid OOM on DeepSeek-V3 "n_routed_experts": num_experts, + # For Gemma-3n + "num_kv_shared_layers": 1, }) if hasattr(hf_config, "vision_config"): From 1346b3e37139bf2749c5d7f33c87d5e26bf78524 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 11 Jul 2025 21:33:19 +0800 Subject: [PATCH 8/9] use transformers 4.53.2 Signed-off-by: Isotr0py --- requirements/test.in | 2 +- requirements/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 2686e8d6228..1c725df7e60 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.53.1 +transformers==4.53.2 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. 
diff --git a/requirements/test.txt b/requirements/test.txt index 0b023d7529d..6f500992bb5 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -800,7 +800,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.53.1 +transformers==4.53.2 # via # -r requirements/test.in # genai-perf From bca7bbf28ab9ea05de3f96e30ab2caeaaf17eb2c Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 11 Jul 2025 22:05:32 +0800 Subject: [PATCH 9/9] add sliding window compatibility Signed-off-by: Isotr0py --- vllm/model_executor/models/commandr.py | 7 +++++-- vllm/model_executor/models/gemma3.py | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 817c6bb9a7f..c4f6144ed91 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -189,10 +189,13 @@ def __init__( layer_idx = extract_layer_index(prefix) layer_has_sliding_window = ( - getattr(config, "sliding_window_pattern", False) - and (layer_idx + 1) % self.config.sliding_window_pattern != 0) + getattr(config, "sliding_window_pattern", False) and + (layer_idx + 1) % self.config.sliding_window_pattern + != 0) or (getattr(config, "layer_types", False) + and config.layer_types[layer_idx] == "sliding_attention") self.sliding_window = (interleaved_sliding_window + or config.sliding_window if layer_has_sliding_window else None) self.attn = Attention(self.num_heads, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 954e48d25f6..1a2ce65d1e4 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -149,14 +149,17 @@ def __init__(self, # TODO(woosuk): Add reference to the original HF implementation. layer_idx = extract_layer_index(prefix) self.is_sliding = (getattr( - config, "interleaved_sliding_window", None) is not None and bool( - (layer_idx + 1) % config.sliding_window_pattern)) + config, "interleaved_sliding_window", None) is not None and (bool( + (layer_idx + 1) % config.sliding_window_pattern))) or ( + getattr(config, "layer_types", None) is not None + and config.layer_types[layer_idx] == "sliding_attention") # Initialize the rotary embedding. if self.is_sliding: # Local attention. Override the values in config.json. self.rope_theta = config.rope_local_base_freq self.rope_scaling = {"rope_type": "default"} - self.sliding_window = config.interleaved_sliding_window + self.sliding_window = (config.interleaved_sliding_window + or config.sliding_window) else: # Global attention. Use the values in config.json. self.rope_theta = config.rope_theta
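
A note for reviewers on the last patch: both commandr.py and gemma3.py now have to handle two config flavours — the older integer `sliding_window_pattern` and the per-layer `layer_types` list that newer Transformers configs (4.53+) may expose — and, in either case, fall back from `interleaved_sliding_window` to `sliding_window` for the window size. Below is a minimal standalone sketch of that detection logic, not the vLLM code itself; `SimpleConfig`, `is_sliding_layer`, `effective_sliding_window` and the demo values are illustrative assumptions, while the attribute names and the "sliding_attention" marker are taken from the patch:

# Standalone sketch of the per-layer sliding-window detection added by the
# final patch. SimpleConfig is an illustrative stand-in for a HF text config,
# not a real vLLM or Transformers class.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SimpleConfig:
    sliding_window: int = 512
    interleaved_sliding_window: Optional[int] = None
    # Pre-4.53 style: every N-th layer is global, the rest use local attention.
    sliding_window_pattern: Optional[int] = None
    # 4.53 style: explicit per-layer attention types.
    layer_types: Optional[list[str]] = None


def is_sliding_layer(config: SimpleConfig, layer_idx: int) -> bool:
    """True if the layer should use local (sliding-window) attention."""
    pattern = getattr(config, "sliding_window_pattern", None)
    layer_types = getattr(config, "layer_types", None)
    # Legacy check: layers whose 1-based index is not a multiple of the
    # pattern are sliding-window layers.
    by_pattern = bool(pattern) and (layer_idx + 1) % pattern != 0
    # New check: the config spells out each layer's attention type.
    by_layer_types = (layer_types is not None
                      and layer_types[layer_idx] == "sliding_attention")
    return by_pattern or by_layer_types


def effective_sliding_window(config: SimpleConfig,
                             layer_idx: int) -> Optional[int]:
    """Window size for a layer, or None for global attention."""
    if not is_sliding_layer(config, layer_idx):
        return None
    # Older configs keep the local window in `interleaved_sliding_window`;
    # newer ones only expose `sliding_window`, hence the fallback.
    return config.interleaved_sliding_window or config.sliding_window


if __name__ == "__main__":
    old_style = SimpleConfig(sliding_window_pattern=6,
                             interleaved_sliding_window=1024)
    new_style = SimpleConfig(
        layer_types=["sliding_attention"] * 5 + ["full_attention"])

    # Layer 5 (0-based) is the 6th layer: global attention in both configs.
    assert effective_sliding_window(old_style, 5) is None
    assert effective_sliding_window(new_style, 5) is None
    # Layer 0 is local in both; the window size falls back as expected.
    assert effective_sliding_window(old_style, 0) == 1024
    assert effective_sliding_window(new_style, 0) == 512
    print("sliding-window layer detection behaves as expected")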