
Commit 32dffc2

[Core] Rename get_max_tokens_per_item for backward compatibility (#20630)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

Parent: c438183

3 files changed: +30 additions, −19 deletions

vllm/model_executor/models/qwen2_vl.py

Lines changed: 5 additions & 4 deletions
@@ -823,10 +823,11 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
-
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
         max_image_tokens = self.get_max_image_tokens()
         max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
         return {"image": max_image_tokens, "video": max_video_tokens}

vllm/multimodal/processing.py

Lines changed: 18 additions & 13 deletions
@@ -1100,24 +1100,29 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(
-            self, seq_len: int,
-            mm_counts: Optional[Mapping[str,
-                                        int]]) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens per item of for each modality.
-        By default, returns `None`. When `None` is returned, vLLM will generate
-        dummy inputs (images/videos) at maximum possible sizes and process them
-        to determine the maximum token count per modality.
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Optional[Mapping[str, int]]:
+        """
+        Return the maximum number of tokens per item of for each modality.
+
+        When `None` (the default) is returned, vLLM will generate dummy inputs
+        (images/videos) at maximum possible sizes and process them to determine
+        the maximum token count per modality.
+
         This approach works but can be very slow for certain models (e.g.,
         Qwen2.5-VL), leading to very long startup time. For better performance,
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned
-        from this function should respect to the model maximum sequence length
-        and the maximum number of items of each modality allowed, and agrees
-        with dummy inputs (images/videos) at maximum possible sizes.
-
+        Note:
+            The maximum number of tokens per item of each modality returned
+            from this function should respect the model's maximum sequence
+            length and the maximum number of items of each modality allowed,
+            and agree with dummy inputs (images/videos) at maximum possible
+            sizes.
         """
         return None
 
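
The commit title mentions backward compatibility, but the diff itself only performs the rename. For an out-of-tree processing-info subclass that needs to run against vLLM versions on both sides of the rename, one possible shim (hypothetical, not part of this commit) is to define the new name and alias the old one to it, so whichever hook the installed version calls reaches the same implementation:

    from collections.abc import Mapping
    from typing import Optional


    class MyProcessingInfo:
        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            # Hypothetical precomputed value for illustration only.
            return {"image": 2048}

        # Alias so callers that still invoke the pre-rename hook reach the
        # same implementation. (Speculative shim, not from this commit.)
        get_max_tokens_per_item = get_mm_max_tokens_per_item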

vllm/multimodal/profiling.py

Lines changed: 7 additions & 2 deletions
@@ -258,8 +258,13 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
-            seq_len=seq_len, mm_counts=mm_counts)
+        if mm_counts is None:
+            mm_counts = self.get_mm_limits()
+
+        max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
+            seq_len=seq_len,
+            mm_counts=mm_counts,
+        )
         if max_tokens_per_item is not None:
             if mm_counts is None:
                 total_mm_tokens = sum(max_tokens_per_item.values())
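
A standalone sketch of the calling pattern after this change (simplified; the class names, the {"image": 1} default, and the elided fallback are stand-ins for vLLM's MultiModalProfiler internals, not its actual code). Note how the diff resolves the mm_counts default before invoking the hook, so the hook always receives a concrete Mapping rather than None:

    from collections.abc import Mapping
    from typing import Optional


    class FixedInfo:
        """Toy stand-in for a ProcessingInfo with precomputed maxima."""

        def get_mm_max_tokens_per_item(
            self,
            seq_len: int,
            mm_counts: Mapping[str, int],
        ) -> Optional[Mapping[str, int]]:
            return {"image": 1024}


    def get_mm_max_tokens(
        info: FixedInfo,
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
        if mm_counts is None:
            mm_counts = {"image": 1}  # stand-in for self.get_mm_limits()

        max_tokens_per_item = info.get_mm_max_tokens_per_item(
            seq_len=seq_len,
            mm_counts=mm_counts,
        )
        if max_tokens_per_item is not None:
            return max_tokens_per_item
        # Otherwise vLLM would fall back to profiling dummy inputs.
        raise NotImplementedError("dummy-input profiling path elided")


    print(get_mm_max_tokens(FixedInfo(), seq_len=8192))  # {'image': 1024}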
