
Commit 2c5302f

WoosukKwon and ywang96 authored
[Multimodal] Optimize Qwen2/2.5-VL startup time (#19756)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.me>
1 parent caa680f commit 2c5302f

File tree

3 files changed: +50 -1 lines changed

vllm/model_executor/models/qwen2_vl.py
vllm/multimodal/processing.py
vllm/multimodal/profiling.py

vllm/model_executor/models/qwen2_vl.py

Lines changed: 8 additions & 0 deletions
@@ -823,6 +823,14 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,

vllm/multimodal/processing.py

Lines changed: 21 additions & 0 deletions
@@ -1100,6 +1100,27 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at maximum possible sizes and process them
+        to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to a very long startup time. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this function should respect the model's maximum sequence length
+        and the maximum number of items of each modality allowed, and agree
+        with dummy inputs (images/videos) at maximum possible sizes.
+
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)

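To illustrate the intended use of this hook, here is a minimal sketch (not part of this commit) of a model-specific override; the class name and the token count are hypothetical, and it assumes `BaseProcessingInfo` is the class the hook is added to, as the hunk above suggests. The real override added for Qwen2/2.5-VL is the qwen2_vl.py hunk earlier in this commit.

# Hypothetical sketch, not vLLM code from this commit: a model-specific
# ProcessingInfo subclass returning pre-computed worst-case token counts so
# the profiler can skip generating and processing dummy multimodal inputs.
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseProcessingInfo


class MyModelProcessingInfo(BaseProcessingInfo):  # hypothetical model info class

    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]
    ) -> Optional[Mapping[str, int]]:
        # 4096 is a made-up per-image maximum; a real model would derive it
        # from its image processor / vision encoder configuration.
        return {"image": 4096}
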
vllm/multimodal/profiling.py

Lines changed: 21 additions & 1 deletion
@@ -253,6 +253,26 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item
 
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)

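As a rough illustration of the worst-case budget check added above, the short Python snippet below reproduces the same arithmetic; all numbers are made up and not taken from the commit.

# Illustrative only: how the new worst-case multimodal token check behaves.
max_tokens_per_item = {"image": 16384, "video": 24576}  # hypothetical maxima
mm_counts = {"image": 2, "video": 1}
seq_len = 32768

total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
                      for k in max_tokens_per_item.keys() & mm_counts.keys())
# 2 * 16384 + 1 * 24576 = 57344 > 32768, so the warning would be emitted,
# suggesting a larger `max_model_len` or a smaller `mm_counts`.
print(total_mm_tokens)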