@@ -2219,8 +2219,8 @@ def profile_run(self) -> None:
         encoder_budget = min(self.max_num_encoder_input_tokens,
                              self.encoder_cache_size)

-        max_num_mm_items_encoder_budget = cdiv(encoder_budget,
-                                               max_tokens_per_mm_item)
+        max_num_mm_items_encoder_budget = encoder_budget // \
+            max_tokens_per_mm_item

         # Check how many items of this modality can be supported by
         # the decoder budget.
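The first hunk switches the item-count calculation from ceiling division (`cdiv`) to floor division, so the profiling batch cannot ask for more items than the encoder budget actually covers. A minimal sketch of the difference, using made-up budget numbers rather than anything from this change:

```python
# Hypothetical numbers, only to contrast ceiling vs. floor division.
def cdiv(a: int, b: int) -> int:
    # Ceiling division, as the removed helper is typically defined.
    return -(-a // b)

encoder_budget = 100         # assumed token budget for the example
max_tokens_per_mm_item = 30  # assumed worst-case tokens per multimodal item

print(cdiv(encoder_budget, max_tokens_per_mm_item))  # 4 -> four items would need 120 tokens, over budget
print(encoder_budget // max_tokens_per_mm_item)      # 3 -> the number of items that actually fit
```

Flooring keeps the dummy batch within the encoder budget instead of slightly overshooting it.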
@@ -2233,8 +2233,10 @@ def profile_run(self) -> None:
         max_num_mm_items_decoder_budget = self.max_num_reqs * \
             max_mm_items_per_req

-        max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                               max_num_mm_items_decoder_budget)
+        max_num_mm_items = max(
+            1,
+            min(max_num_mm_items_encoder_budget,
+                max_num_mm_items_decoder_budget))

         logger.info(
             "Encoder cache will be initialized with a budget of %s tokens,"
@@ -2244,7 +2246,7 @@ def profile_run(self) -> None:
         # Create dummy batch of multimodal inputs.
         dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
             model_config=self.model_config,
-            seq_len=self.max_num_tokens,
+            seq_len=max_tokens_per_mm_item,
             mm_counts={
                 dummy_data_modality: 1
             },