@@ -124,7 +124,7 @@ def __init__(
             cache_config.cache_dtype]
 
         self.is_multimodal_model = model_config.is_multimodal_model
-        self.is_pooling_model = model_config.pooler_config is not None
+        self.model_supports_multimodal_raw_input = model_config.model_supports_multimodal_raw_input
         self.max_model_len = model_config.max_model_len
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
@@ -327,6 +327,11 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         Args:
             scheduler_output: The scheduler output.
         """
+
+        # Nothing to be reordered when the model is attention-free.
+        if self.model_config.is_attention_free:
+            return
+
         self.attn_metadata_builders[0].reorder_batch(self.input_batch,
                                                      scheduler_output)
 
@@ -1016,13 +1021,14 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             curr_group_outputs = self.model.get_multimodal_embeddings(
                 **batched_mm_inputs)
 
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
+            if curr_group_outputs:
+                sanity_check_mm_encoder_outputs(
+                    curr_group_outputs,
+                    expected_num_items=len(grouped_mm_inputs),
+                )
 
-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
+                for output in curr_group_outputs:
+                    encoder_outputs.append(output)
 
         # Cache the encoder outputs.
         for (req_id, input_id, pos_info), output in zip(
@@ -1324,6 +1330,9 @@ def execute_model(
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
+            self._maybe_add_model_args(num_scheduled_tokens,
+                                       model_kwargs, scheduler_output)
+
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
@@ -1339,6 +1348,7 @@ def execute_model(
             # multimodal models, it is not desirable for performance since
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
+            self._maybe_add_model_args(num_input_tokens, model_kwargs, scheduler_output)
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
@@ -1372,6 +1382,10 @@ def execute_model(
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
+                **MultiModalKwargs.as_kwargs(
+                    model_kwargs,
+                    device=self.device,
+                )
             )
 
         self.maybe_wait_for_kv_save()
@@ -2021,6 +2035,8 @@ def _dummy_run(
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
             model = self.model
+            model_kwargs: dict[str, Any] = {}
+            self._maybe_add_model_args(num_tokens, model_kwargs)
             if self.is_multimodal_model:
                 input_ids = None
                 inputs_embeds = self.inputs_embeds[:num_tokens]
@@ -2055,7 +2071,11 @@ def _dummy_run(
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
+                **MultiModalKwargs.as_kwargs(
+                    model_kwargs,
+                    device=self.device)
             )
+
             if self.use_aux_hidden_state_outputs:
                 hidden_states, _ = outputs
             else:
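
Note: the hunks above repeatedly call a `_maybe_add_model_args` helper whose definition is outside this diff. A minimal sketch of its likely shape, assuming it batches the scheduled requests' raw multimodal kwargs only when the model consumes raw input; the body, the `mm_inputs` lookup, and the optional `scheduler_output` parameter are illustrative assumptions, not the actual implementation:

    def _maybe_add_model_args(
            self,
            num_tokens: int,
            model_kwargs: dict[str, Any],
            scheduler_output: Optional["SchedulerOutput"] = None) -> None:
        # Hypothetical sketch: only models that accept raw multimodal
        # input need extra forward-pass kwargs.
        if not self.model_supports_multimodal_raw_input:
            return
        if scheduler_output is None:
            # Dummy runs pass no scheduler output, so there is nothing
            # to gather; the model sees only the placeholder inputs.
            return
        # Batch the raw multimodal inputs of the scheduled requests so
        # they can be passed through to the model's forward().
        mm_kwargs = [
            mm_input for req_id in scheduler_output.num_scheduled_tokens
            for mm_input in self.requests[req_id].mm_inputs
        ]
        if mm_kwargs:
            model_kwargs.update(MultiModalKwargs.batch(mm_kwargs))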
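For context on the `MultiModalKwargs.as_kwargs(model_kwargs, device=self.device)` calls: as far as I can tell, `as_kwargs` moves the batched multimodal tensors onto the given device before they are splatted into the forward call, so `model_kwargs` can be assembled device-agnostically and transferred only at call time.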