@@ -122,6 +122,7 @@ def __init__(
             cache_config.cache_dtype]
 
         self.is_multimodal_model = model_config.is_multimodal_model
+        self.is_pooling_model = model_config.is_pooling_model
         self.model_supports_multimodal_raw_input = model_config.model_supports_multimodal_raw_input
         self.max_model_len = model_config.max_model_len
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
@@ -550,10 +551,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         batch_reordered = self._may_reorder_batch(scheduler_output)
 
         if batch_changed or batch_reordered:
-            self.input_batch.refresh()
+            self.input_batch.refresh_sampling_metadata()
 
     def _add_multimodal_inputs_to_model_args(self, model_kwargs: dict[str, Any],
-                                             scheduler_output: "SchedulerOutput"):
+                                             scheduler_output: "SchedulerOutput",
+                                             num_reqs: int = -1):
         # Multi-modal data.
         if scheduler_output:
             multi_modal_kwargs_list = []
@@ -565,21 +567,21 @@ def _add_multimodal_inputs_to_model_args(self, model_kwargs: dict[str, Any],
             multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
         else:
             # The only case where SchedulerOutput is None is a dummy run, so get some dummy data.
-            dummy_data = self.mm_registry.get_decoder_dummy_data(model_config=self.model_config, seq_len=1)
-            multi_modal_kwargs = MultiModalKwargs.batch([dummy_data.multi_modal_data])
+            dummy_data = [
+                self.mm_registry.get_decoder_dummy_data(
+                    model_config=self.model_config, seq_len=1).multi_modal_data
+                for _ in range(num_reqs)]
+            multi_modal_kwargs = MultiModalKwargs.batch(dummy_data)
 
         model_kwargs.update(multi_modal_kwargs)
 
-    def _maybe_add_model_args(self, num_tokens: int,
+    def _maybe_add_multimodal_kwargs(self,
                               model_kwargs: dict[str, Any],
-                              scheduler_output: "SchedulerOutput" = None):
-
-        if self.supports_token_type_ids:
-            model_kwargs["token_type_ids"] = \
-                self.get_token_type_ids()[:num_tokens]
+                              scheduler_output: "SchedulerOutput" = None,
+                              num_reqs: int = -1):
 
         if self.model_supports_multimodal_raw_input:
-            self._add_multimodal_inputs_to_model_args(model_kwargs, scheduler_output)
+            self._add_multimodal_inputs_to_model_args(model_kwargs, scheduler_output, num_reqs)
 
     def _maybe_compute_attn_prefix(
         self,
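Review note: the new `num_reqs` parameter exists so that a dummy run can batch one dummy multimodal item per request instead of a single item. Below is a minimal runnable sketch of that shape; the registry and `MultiModalKwargs` are stubbed out, so everything here except the names taken from the hunk above is hypothetical:

```python
from dataclasses import dataclass
from typing import Any


@dataclass
class _DummyDecoderData:
    """Hypothetical stand-in for the registry's dummy-data object."""
    multi_modal_data: dict[str, Any]


def _get_decoder_dummy_data(seq_len: int = 1) -> _DummyDecoderData:
    # Stub for self.mm_registry.get_decoder_dummy_data(...).
    return _DummyDecoderData(multi_modal_data={"pixel_values": [0.0] * seq_len})


def build_dummy_mm_batch(num_reqs: int) -> list[dict[str, Any]]:
    # Same shape as the new code path: one dummy item per request,
    # then a single batch call over the whole list.
    return [_get_decoder_dummy_data(seq_len=1).multi_modal_data
            for _ in range(num_reqs)]


batch = build_dummy_mm_batch(4)  # real code: MultiModalKwargs.batch(batch)
print(len(batch))  # -> 4, one entry per dummy request
```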
@@ -1344,15 +1345,15 @@ def execute_model(
             mm_embeds = self._gather_mm_embeddings(scheduler_output)
         else:
             mm_embeds = []
-
+
+        model_kwargs: dict[str, Any] = {}
         if self.is_multimodal_model and get_pp_group().is_first_rank:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
-            self._maybe_add_model_args(num_scheduled_tokens,
-                                       model_kwargs, scheduler_output)
-
+            self._maybe_add_multimodal_kwargs(model_kwargs=model_kwargs,
+                                              scheduler_output=scheduler_output)
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
@@ -1368,7 +1369,6 @@ def execute_model(
             # multimodal models, it is not desirable for performance since
             # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
-            self._maybe_add_model_args(num_input_tokens, model_kwargs, scheduler_output)
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
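Review note: taken together, these two `execute_model` hunks create `model_kwargs` once before the branch and fill it only on the multimodal first-rank path; the text-only path no longer calls the old `_maybe_add_model_args`, and its `token_type_ids` handling is gone entirely. A condensed, runnable toy of that control flow follows; only the branch structure mirrors the diff, everything else is a stub:

```python
from typing import Any


def _maybe_add_multimodal_kwargs(model_kwargs: dict[str, Any],
                                 scheduler_output: Any = None,
                                 num_reqs: int = -1) -> None:
    # Stub for the renamed helper: mutates model_kwargs in place.
    model_kwargs["mm_inputs"] = scheduler_output or ["dummy"] * num_reqs


def execute_model_sketch(is_multimodal: bool, is_first_rank: bool,
                         scheduler_output: Any) -> dict[str, Any]:
    model_kwargs: dict[str, Any] = {}  # hoisted above the branch
    if is_multimodal and is_first_rank:
        _maybe_add_multimodal_kwargs(model_kwargs=model_kwargs,
                                     scheduler_output=scheduler_output)
    # else: the text-only path leaves model_kwargs empty; the old
    # _maybe_add_model_args call was dropped from this branch.
    return model_kwargs


print(execute_model_sketch(True, True, {"step": 3}))   # populated
print(execute_model_sketch(False, True, {"step": 3}))  # stays empty
```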
@@ -1994,8 +1994,9 @@ def _dummy_run(
                 num_scheduled_tokens):
             model = self.model
             model_kwargs: dict[str, Any] = {}
-            self._maybe_add_model_args(num_tokens, model_kwargs)
             if self.is_multimodal_model:
+                self._maybe_add_multimodal_kwargs(model_kwargs=model_kwargs,
+                                                  num_reqs=num_reqs)
                 input_ids = None
                 inputs_embeds = self.inputs_embeds[:num_tokens]
             else:
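Review note: `_dummy_run` has no `SchedulerOutput`, so it reaches the dummy path by passing `num_reqs` alone, while `execute_model` passes `scheduler_output` and leaves `num_reqs` at its default. A small sketch of the dispatch between the two call conventions; the helper body here is a hypothetical stand-in, only the parameter shapes come from the diff:

```python
from typing import Any, Optional


def add_multimodal_inputs(model_kwargs: dict[str, Any],
                          scheduler_output: Optional[dict] = None,
                          num_reqs: int = -1) -> None:
    # Mirrors the branch in _add_multimodal_inputs_to_model_args:
    # a real SchedulerOutput wins; otherwise fall back to dummy data.
    if scheduler_output:
        mm = scheduler_output["mm_kwargs"]  # stub for the real batching
    else:
        mm = [{"pixel_values": [0.0]} for _ in range(num_reqs)]
    model_kwargs.update({"multi_modal": mm})


# execute_model-style call: real scheduler output, default num_reqs.
kw: dict[str, Any] = {}
add_multimodal_inputs(kw, scheduler_output={"mm_kwargs": ["img0", "img1"]})
print(kw)

# _dummy_run-style call: no scheduler output, explicit num_reqs.
kw = {}
add_multimodal_inputs(kw, num_reqs=2)
print(kw)
```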