@@ -123,7 +123,7 @@ def __init__(
             cache_config.cache_dtype]

         self.is_multimodal_model = model_config.is_multimodal_model
-        self.is_pooling_model = model_config.pooler_config is not None
+        self.model_supports_multimodal_raw_input = model_config.model_supports_multimodal_raw_input
         self.max_model_len = model_config.max_model_len
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -326,6 +326,11 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         Args:
             scheduler_output: The scheduler output.
         """
+
+        # Nothing to be reordered when the model is attention-free.
+        if self.model_config.is_attention_free:
+            return
+
         self.attn_metadata_builders[0].reorder_batch(self.input_batch,
                                                      scheduler_output)

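Presumably the early return is needed because an attention-free model registers no attention backends, leaving `self.attn_metadata_builders` empty, so the `[0]` lookup below it would raise an `IndexError`.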
@@ -1019,13 +1024,14 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             curr_group_outputs = self.model.get_multimodal_embeddings(
                 **batched_mm_inputs)

-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
+            if curr_group_outputs:
+                sanity_check_mm_encoder_outputs(
+                    curr_group_outputs,
+                    expected_num_items=len(grouped_mm_inputs),
+                )

-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
+                for output in curr_group_outputs:
+                    encoder_outputs.append(output)

         # Cache the encoder outputs.
         for (req_id, input_id, pos_info), output in zip(
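Presumably the sanity check becomes conditional because `get_multimodal_embeddings` may now legitimately return an empty result, e.g. for models that consume the raw multimodal inputs in their main forward pass instead of producing encoder embeddings up front; in that case there is nothing to check or append.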
@@ -1324,6 +1330,9 @@ def execute_model(
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
+            self._maybe_add_model_args(num_scheduled_tokens,
+                                       model_kwargs, scheduler_output)
+
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
@@ -1339,6 +1348,7 @@ def execute_model(
             # multimodal models, it is not desirable for performance since
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
+            self._maybe_add_model_args(num_input_tokens, model_kwargs, scheduler_output)
             inputs_embeds = None
             if self.uses_mrope:
                 positions = self.mrope_positions[:, :num_input_tokens]
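The definition of `_maybe_add_model_args` sits outside these hunks. Below is a minimal sketch of what the call sites imply, assuming the helper only populates `model_kwargs` when the model consumes raw multimodal input; `scheduled_new_reqs`, `mm_inputs`, and `MultiModalKwargs.batch` are assumptions here, not confirmed by this diff:

    from typing import Any, Optional

    def _maybe_add_model_args(
            self,
            num_tokens: int,
            model_kwargs: dict[str, Any],
            scheduler_output: Optional["SchedulerOutput"] = None) -> None:
        # No-op unless the model wants the raw multimodal inputs forwarded
        # into its forward() call (see model_supports_multimodal_raw_input
        # set in __init__ above). num_tokens is accepted for symmetry with
        # the dummy-run call site, which has no scheduler output.
        if not self.model_supports_multimodal_raw_input:
            return
        if scheduler_output is not None:
            # Hypothetical: gather the raw multimodal inputs of the newly
            # scheduled requests and batch them into extra forward() kwargs.
            mm_kwargs_list = [
                mm_input for req in scheduler_output.scheduled_new_reqs
                for mm_input in req.mm_inputs
            ]
            model_kwargs.update(MultiModalKwargs.batch(mm_kwargs_list))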
@@ -1372,6 +1382,10 @@ def execute_model(
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
+                **MultiModalKwargs.as_kwargs(
+                    model_kwargs,
+                    device=self.device,
+                )
             )

             self.maybe_wait_for_kv_save()
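As used here, `MultiModalKwargs.as_kwargs` appears to return the collected kwargs with their tensors moved onto `self.device` before they are unpacked into the forward call; because they are splatted with `**`, an empty `model_kwargs` (the usual text-only path) expands to no extra arguments and the call is unchanged.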
@@ -1998,6 +2012,8 @@ def _dummy_run(
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
             model = self.model
+            model_kwargs: dict[str, Any] = {}
+            self._maybe_add_model_args(num_tokens, model_kwargs)
             if self.is_multimodal_model:
                 input_ids = None
                 inputs_embeds = self.inputs_embeds[:num_tokens]
@@ -2032,7 +2048,11 @@ def _dummy_run(
                     positions=positions,
                     intermediate_tensors=intermediate_tensors,
                     inputs_embeds=inputs_embeds,
+                    **MultiModalKwargs.as_kwargs(
+                        model_kwargs,
+                        device=self.device)
                 )
+
             if self.use_aux_hidden_state_outputs:
                 hidden_states, _ = outputs
             else:
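Threading the same kwargs through `_dummy_run` presumably keeps warmup, profiling, and CUDA-graph capture on the same code path as real execution, so models that require the raw multimodal arguments behave identically at startup and when serving.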