3 files changed: +50 −1 lines changed

@@ -823,6 +823,14 @@ def get_image_processor(
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
    def _get_vision_info(
        self,
        *,
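For a rough sense of the delegation pattern in the hunk above, here is a minimal runnable sketch: the `ToyVisionInfo` class, its helper bodies, and all token numbers are illustrative stand-ins for the model's real `get_max_image_tokens` / `get_max_video_tokens` helpers, not code from this PR.

```python
from collections.abc import Mapping
from typing import Optional


class ToyVisionInfo:
    """Toy stand-in for a model's processing info (numbers are placeholders)."""

    def get_max_image_tokens(self) -> int:
        return 1024  # hypothetical worst-case tokens for a single image

    def get_max_video_tokens(self, seq_len: int,
                             mm_counts: Mapping[str, int]) -> int:
        return 4096  # real models derive this from seq_len and mm_counts

    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
        # Same shape as the override above: one worst-case count per modality,
        # reusing the per-modality helpers instead of profiling dummy inputs.
        return {
            "image": self.get_max_image_tokens(),
            "video": self.get_max_video_tokens(seq_len, mm_counts),
        }


per_item = ToyVisionInfo().get_max_tokens_per_item(
    seq_len=32768, mm_counts={"image": 2, "video": 1})
assert per_item == {"image": 1024, "video": 4096}
```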
@@ -1100,6 +1100,27 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
        return allowed_limits

+    def get_max_tokens_per_item(
+            self, seq_len: int,
+            mm_counts: Optional[Mapping[str,
+                                        int]]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
+
+        By default, returns `None`. When `None` is returned, vLLM will
+        generate dummy inputs (images/videos) at maximum possible sizes and
+        process them to determine the maximum token count per modality.
+
+        This approach works but can be very slow for certain models (e.g.
+        Qwen2.5-VL), leading to a very long startup time. For better
+        performance, each model can override this method to return
+        pre-computed maximum token counts, avoiding the need for dummy input
+        generation and processing.
+
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this method should respect the model's maximum sequence length
+        and the maximum number of items allowed for each modality, and should
+        agree with dummy inputs (images/videos) at maximum possible sizes.
+        """
+        return None
+

_I = TypeVar("_I", bound=BaseProcessingInfo)
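To make the override path described in this docstring concrete, here is a minimal sketch of a model-specific class returning pre-computed counts; the class name and the fixed numbers are hypothetical, and only the method signature mirrors the base hook above.

```python
from collections.abc import Mapping
from typing import Optional


class MyModelProcessingInfo:  # would subclass BaseProcessingInfo in vLLM
    """Illustrative only: the fixed counts below are placeholder values."""

    def get_max_tokens_per_item(
            self, seq_len: int,
            mm_counts: Optional[Mapping[str, int]]
    ) -> Optional[Mapping[str, int]]:
        # Returning a mapping (instead of the default None) lets vLLM skip
        # generating and processing maximum-size dummy inputs at startup.
        return {"image": 2048, "video": 8192}


info = MyModelProcessingInfo()
assert info.get_max_tokens_per_item(seq_len=8192, mm_counts=None) == {
    "image": 2048,
    "video": 8192,
}
```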
@@ -253,6 +253,26 @@ def get_mm_max_tokens(
        seq_len: int,
        mm_counts: Optional[Mapping[str, int]] = None,
    ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
+        if max_tokens_per_item is not None:
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
+            return max_tokens_per_item

+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
        return self._get_mm_num_tokens(mm_inputs)
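As a standalone illustration of the budget check added above, the worst-case arithmetic can be reproduced with placeholder numbers; the helper name and the values below are assumptions for the sketch, not part of the diff.

```python
from collections.abc import Mapping
from typing import Optional


def worst_case_mm_tokens(max_tokens_per_item: Mapping[str, int],
                         mm_counts: Optional[Mapping[str, int]]) -> int:
    """Mirror of the totalling logic in the hunk above, for illustration."""
    if mm_counts is None:
        # No per-request limits known: count one item of every modality.
        return sum(max_tokens_per_item.values())
    # Otherwise weight each modality's per-item budget by its allowed count.
    return sum(max_tokens_per_item[k] * mm_counts[k]
               for k in max_tokens_per_item.keys() & mm_counts.keys())


# Placeholder budget: 2 images at 2048 tokens each plus 1 video at 8192 tokens.
total = worst_case_mm_tokens({"image": 2048, "video": 8192},
                             {"image": 2, "video": 1})
assert total == 12288
# If this total exceeded the sequence length, the code above would log a
# warning rather than fail, since actual requests may use smaller inputs.
```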