
Commit 32dffc2

[Core] Rename get_max_tokens_per_item for backward compatibility (#20630)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

Parent: c438183

3 files changed: +30 additions, −19 deletions

vllm/model_executor/models/qwen2_vl.py

Lines changed: 5 additions & 4 deletions
@@ -823,10 +823,11 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
-
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_tokens = self.get_max_image_tokens()
         max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
         return {"image": max_image_tokens, "video": max_video_tokens}

vllm/multimodal/processing.py

Lines changed: 18 additions & 13 deletions
@@ -1100,24 +1100,29 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Optional[Mapping[str,
-                                        int]]) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens per item of for each modality.
-        By default, returns `None`. When `None` is returned, vLLM will generate
-        dummy inputs (images/videos) at maximum possible sizes and process them
-        to determine the maximum token count per modality.
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Optional[Mapping[str, int]]:
+        """
+        Return the maximum number of tokens per item of for each modality.
+
+        When `None` (the default) is returned, vLLM will generate dummy inputs
+        (images/videos) at maximum possible sizes and process them to determine
+        the maximum token count per modality.
+
         This approach works but can be very slow for certain models (e.g.,
         Qwen2.5-VL), leading to very long startup time. For better performance,
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned
-        from this function should respect to the model maximum sequence length
-        and the maximum number of items of each modality allowed, and agrees
-        with dummy inputs (images/videos) at maximum possible sizes.
-
+        Note:
+            The maximum number of tokens per item of each modality returned
+            from this function should respect the model's maximum sequence
+            length and the maximum number of items of each modality allowed,
+            and agree with dummy inputs (images/videos) at maximum possible
+            sizes.
         """
         return None
 
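
The commit title mentions backward compatibility, but the diff itself only performs the rename. For an out-of-tree processing-info subclass that needs to run against vLLM versions on both sides of the rename, one possible shim (hypothetical, not part of this commit) is to define the new name and alias the old one to it, so whichever hook the installed version calls reaches the same implementation:

    from collections.abc import Mapping
    from typing import Optional


    class MyProcessingInfo:
        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            # Hypothetical precomputed value for illustration only.
            return {"image": 2048}

        # Alias so callers that still invoke the pre-rename hook reach the
        # same implementation. (Speculative shim, not from this commit.)
        get_max_tokens_per_item = get_mm_max_tokens_per_item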

vllm/multimodal/profiling.py

Lines changed: 7 additions & 2 deletions
@@ -258,8 +258,13 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
-            seq_len=seq_len, mm_counts=mm_counts)
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+
+        max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
+            seq_len=seq_len,
+            mm_counts=mm_counts,
+        )
         if max_tokens_per_item is not None:
             if mm_counts is None:
                 total_mm_tokens = sum(max_tokens_per_item.values())
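
A standalone sketch of the calling pattern after this change (simplified; the class names, the {"image": 1} default, and the elided fallback are stand-ins for vLLM's MultiModalProfiler internals, not its actual code). Note how the diff resolves the mm_counts default before invoking the hook, so the hook always receives a concrete Mapping rather than None:

    from collections.abc import Mapping
    from typing import Optional


    class FixedInfo:
        """Toy stand-in for a ProcessingInfo with precomputed maxima."""

        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            return {"image": 1024}


    def get_mm_max_tokens(
        info: FixedInfo,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
        if mm_counts is None:
            mm_counts = {"image": 1}  # stand-in for self.get_mm_limits()

        max_tokens_per_item = info.get_mm_max_tokens_per_item(
            seq_len=seq_len,
            mm_counts=mm_counts,
        )
        if max_tokens_per_item is not None:
            return max_tokens_per_item
        # Otherwise vLLM would fall back to profiling dummy inputs.
        raise NotImplementedError("dummy-input profiling path elided")


    print(get_mm_max_tokens(FixedInfo(), seq_len=8192))  # {'image': 1024}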
