@@ -122,7 +122,7 @@ def __init__(
             cache_config.cache_dtype]
 
         self.is_multimodal_model = model_config.is_multimodal_model
-        self.is_pooling_model = model_config.pooler_config is not None
+        self.model_supports_multimodal_raw_input = model_config.model_supports_multimodal_raw_input
         self.max_model_len = model_config.max_model_len
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
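Note: the runner now caches `model_supports_multimodal_raw_input` the same way it caches `is_multimodal_model`. A minimal sketch of the shape of such a config flag (hypothetical stand-in, not vLLM's actual `ModelConfig`):

    from dataclasses import dataclass

    @dataclass
    class ModelConfigSketch:
        # Hypothetical stand-in for vLLM's ModelConfig, illustration only.
        is_multimodal_model: bool = False
        # True when the model wants raw multimodal inputs (e.g. pixel
        # tensors) passed straight into forward() rather than having the
        # runner pre-compute encoder embeddings for it.
        model_supports_multimodal_raw_input: bool = False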
@@ -320,6 +320,11 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
         Returns:
             True if the batch was reordered, False otherwise.
         """
+
+        # Nothing to be reordered when the model is attention-free.
+        if self.model_config.is_attention_free:
+            return False
+
         batch_reordered = self.attn_metadata_builders[0].reorder_batch(
             self.input_batch, scheduler_output)
 
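The early return is not just a fast path: an attention-free model may register no attention metadata builders, so falling through to `self.attn_metadata_builders[0]` could fail. A standalone sketch of the same guard (hypothetical helper, not the vLLM API):

    from typing import Protocol

    class ReorderBuilder(Protocol):
        def reorder_batch(self) -> bool: ...

    def may_reorder_batch(is_attention_free: bool,
                          builders: list[ReorderBuilder]) -> bool:
        # Attention-free models have no KV-cache layout worth optimizing,
        # and `builders` may be empty, so bail out before indexing it.
        if is_attention_free:
            return False
        return builders[0].reorder_batch()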
@@ -545,7 +550,23 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         batch_reordered = self._may_reorder_batch(scheduler_output)
 
         if batch_changed or batch_reordered:
-            self.input_batch.refresh_sampling_metadata()
+            self.input_batch.refresh()
+
+    def _maybe_add_model_args(self, num_tokens: int,
+                              model_kwargs: dict[str, Any],
+                              scheduler_output: "SchedulerOutput" = None):
+        pass
+
+    def _maybe_compute_attn_prefix(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> list[int]:
+        return [0] * len(self.kv_cache_config.kv_cache_groups)
+
+    def _maybe_prepare_additional_inputs(self,
+                                         scheduler_output: "SchedulerOutput",
+                                         token_indices: torch.Tensor):
+        pass
 
     def _get_cumsum_and_arange(
         self,
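The three `_maybe_*` methods are deliberate no-op hooks on the base runner, so the shared code paths can call them unconditionally while model-specific runners override them. A hedged sketch of what an override might look like for a model that consumes raw multimodal input (class and field names are illustrative, not from the diff):

    from typing import Any, Optional

    import torch

    class RawInputRunnerSketch:
        """Hypothetical subclass of the GPU model runner."""

        def _maybe_add_model_args(self, num_tokens: int,
                                  model_kwargs: dict[str, Any],
                                  scheduler_output: Optional[object] = None):
            # Attach raw per-request tensors so they reach the model's
            # forward() via **model_kwargs alongside input_ids/positions.
            if scheduler_output is not None:
                model_kwargs["pixel_values"] = self._gather_pixel_values(
                    num_tokens)

        def _gather_pixel_values(self, num_tokens: int) -> torch.Tensor:
            # Stub: a real runner would slice cached raw inputs for the
            # currently scheduled requests.
            return torch.zeros(num_tokens, 3, 224, 224)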
@@ -1012,13 +1033,14 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
             curr_group_outputs = self.model.get_multimodal_embeddings(
                 **batched_mm_inputs)
 
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
+            if curr_group_outputs:
+                sanity_check_mm_encoder_outputs(
+                    curr_group_outputs,
+                    expected_num_items=len(grouped_mm_inputs),
+                )
 
-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
+                for output in curr_group_outputs:
+                    encoder_outputs.append(output)
 
         # Cache the encoder outputs.
         for (req_id, input_id, pos_info), output in zip(
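The new `if curr_group_outputs:` guard lets `get_multimodal_embeddings` legitimately return an empty result, which is what you would expect when the model consumes raw multimodal input itself and produces no separate encoder embeddings. Reduced to plain Python, the control flow is (illustrative, not the vLLM types):

    def collect_encoder_outputs(group_outputs: list, expected: int) -> list:
        collected = []
        if group_outputs:  # raw-input models may return nothing here
            # Stand-in for sanity_check_mm_encoder_outputs.
            assert len(group_outputs) == expected
            collected.extend(group_outputs)
        return collected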
@@ -1304,6 +1326,9 @@ def execute_model(
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
+            self._maybe_add_model_args(num_scheduled_tokens,
+                                       model_kwargs, scheduler_output)
+
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
@@ -1319,6 +1344,7 @@ def execute_model(
             # multimodal models, it is not desirable for performance since
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
+            self._maybe_add_model_args(num_input_tokens, model_kwargs, scheduler_output)
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
@@ -1352,6 +1378,10 @@ def execute_model(
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
+                **MultiModalKwargs.as_kwargs(
+                    model_kwargs,
+                    device=self.device,
+                )
             )
 
         self.maybe_wait_for_kv_save()
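`MultiModalKwargs.as_kwargs` (from `vllm.multimodal`) prepares the batched `model_kwargs` for the forward call; broadly, it places the batched tensors on the target device before they are splatted into `self.model(...)`. A rough functional equivalent for a plain dict of tensors (illustration only, not the actual implementation):

    import torch

    def as_device_kwargs(kwargs: dict[str, torch.Tensor],
                         device: torch.device) -> dict[str, torch.Tensor]:
        # Ship every batched tensor to the execution device so the model's
        # forward() only ever sees device-local inputs.
        return {name: value.to(device) for name, value in kwargs.items()}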
@@ -1939,6 +1969,8 @@ def _dummy_run(
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
             model = self.model
+            model_kwargs: dict[str, Any] = {}
+            self._maybe_add_model_args(num_tokens, model_kwargs)
             if self.is_multimodal_model:
                 input_ids = None
                 inputs_embeds = self.inputs_embeds[:num_tokens]
@@ -1973,7 +2005,11 @@ def _dummy_run(
                 positions=positions,
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
+                **MultiModalKwargs.as_kwargs(
+                    model_kwargs,
+                    device=self.device)
             )
+
             if self.use_aux_hidden_state_outputs:
                 hidden_states, _ = outputs
             else:
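`_dummy_run` mirrors the live forward call: it builds `model_kwargs` through the same `_maybe_add_model_args` hook (here with no `scheduler_output`, relying on the `None` default) so warmup and CUDA-graph capture see the same call signature as real execution. A sketch of the pattern (hypothetical `add_model_args` callable):

    from typing import Any, Callable

    def build_dummy_kwargs(add_model_args: Callable[..., None],
                           num_tokens: int) -> dict[str, Any]:
        # Same hook as the live path, but with no scheduler output, so the
        # dummy forward signature matches the real one.
        model_kwargs: dict[str, Any] = {}
        add_model_args(num_tokens, model_kwargs)
        return model_kwargs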