
Commit e59d7dc

Rebased to master
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 137ec29 commit e59d7dc

10 files changed, +215 -283 lines changed


examples/offline_inference/prithvi_geospatial_mae.py

Lines changed: 189 additions & 248 deletions
Large diffs are not rendered by default.

tests/models/multimodal/pooling/test_prithvi_mae.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def _run_test(
 
 MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
 
-
+@pytest.mark.core_model
 @pytest.mark.parametrize("model", MODELS)
 def test_models_image(
     hf_runner,

vllm/config.py

Lines changed: 0 additions & 4 deletions
@@ -1514,10 +1514,6 @@ def uses_mrope(self) -> bool:
     @property
     def is_multimodal_model(self) -> bool:
         return self.multimodal_config is not None
-
-    @property
-    def is_pooling_model(self) -> bool:
-        return self.registry.is_pooling_model(self.architectures)
 
     @property
     def is_cross_encoder(self) -> bool:
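
This hunk drops the is_pooling_model property (previously a registry lookup on the model architectures); the gpu_model_runner.py change further down replaces its remaining call site with a direct pooler_config check. A minimal sketch of that check, using stand-in config classes rather than the real vLLM ones:

# Hedged sketch only: illustrates the pooler_config-based check that replaces
# the removed ModelConfig.is_pooling_model property. FakeModelConfig and
# FakePoolerConfig are stand-ins, not the real vLLM classes.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakePoolerConfig:
    pooling_type: str = "CLS"


@dataclass
class FakeModelConfig:
    pooler_config: Optional[FakePoolerConfig] = None


def is_pooling_model(model_config: FakeModelConfig) -> bool:
    # Same test the runner now performs inline:
    # model_config.pooler_config is not None
    return model_config.pooler_config is not None


assert is_pooling_model(FakeModelConfig(FakePoolerConfig()))
assert not is_pooling_model(FakeModelConfig())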

vllm/model_executor/models/prithvi_geospatial_mae.py

Lines changed: 1 addition & 2 deletions
@@ -184,7 +184,6 @@ def _parse_and_validate_multimodal_data(
         if not isinstance(pixel_values, torch.Tensor):
             raise ValueError(f"Incorrect type of pixel_values. "
                              f"Got type: {type(pixel_values)}")
-        # pixel_values = torch.unbind(pixel_values, dim=0)[0]
 
         location_coords = kwargs.pop("location_coords", None)
         if not isinstance(location_coords, torch.Tensor):
@@ -201,7 +200,7 @@ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         # to be calculated. However, due to the mandatory token ids in
         # the input prompt we pass one token and the size of the dummy
         # embedding tensors must reflect that.
-        return torch.empty(input_ids.shape)
+        return torch.empty((input_ids.shape[0], 0))
 
     def forward(
         self,
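
The comment in the second hunk explains that only the mandatory token ids are passed through the prompt, so the dummy embedding returned here only needs to match the token dimension. Switching from torch.empty(input_ids.shape) to torch.empty((input_ids.shape[0], 0)) keeps that dimension while giving the placeholder zero width. A standalone sketch of the shape difference (not using the vLLM model class; the example input_ids values are illustrative):

# Standalone sketch of the shape change in get_input_embeddings.
import torch

# One mandatory token id per request (assumed values for illustration).
input_ids = torch.tensor([101, 101])

old_dummy = torch.empty(input_ids.shape)          # shape (2,): one scalar per token
new_dummy = torch.empty((input_ids.shape[0], 0))  # shape (2, 0): per-token rows of width 0

print(old_dummy.shape, old_dummy.numel())  # torch.Size([2]) 2
print(new_dummy.shape, new_dummy.numel())  # torch.Size([2, 0]) 0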

vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Optional
@@ -66,6 +65,7 @@ def new_empty(self) -> "KVCacheBlocks":
 
 
 class KVCacheManager:
+
     def __init__(
         self,
         kv_cache_config: KVCacheConfig,

vllm/v1/core/sched/scheduler.py

Lines changed: 2 additions & 2 deletions
@@ -488,8 +488,8 @@ def schedule(self) -> SchedulerOutput:
 
             if self.lora_config and request.lora_request:
                 scheduled_loras.add(request.lora_request.lora_int_id)
-            req_to_new_block_ids[request.request_id] = \
-                self.kv_cache_manager.get_block_ids(request.request_id)
+            req_to_new_block_ids[request.request_id] = (
+                self.kv_cache_manager.get_block_ids(request.request_id))
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             request.status = RequestStatus.RUNNING

vllm/v1/engine/core.py

Lines changed: 2 additions & 2 deletions
@@ -152,8 +152,8 @@ def _initialize_kv_caches(
         kv_cache_configs = [
             get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
                                 available_gpu_memory_one_worker)
-            for kv_cache_spec_one_worker, available_gpu_memory_one_worker
-            in zip(kv_cache_specs, available_gpu_memory)
+            for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
+            zip(kv_cache_specs, available_gpu_memory)
         ]
 
         # Since we use a shared centralized controller, we need the

vllm/v1/engine/output_processor.py

Lines changed: 8 additions & 7 deletions
@@ -330,13 +330,14 @@ def add_request(
         tokenizer = None if not self.tokenizer else \
             self.tokenizer.get_lora_tokenizer(request.lora_request)
 
-        req_state = RequestState.from_new_request(tokenizer=tokenizer,
-                                                  request=request,
-                                                  prompt=prompt,
-                                                  parent_req=parent_req,
-                                                  request_index=request_index,
-                                                  queue=queue,
-                                                  log_stats=self.log_stats)
+        req_state = RequestState.from_new_request(
+            tokenizer=tokenizer,
+            request=request,
+            prompt=prompt,
+            parent_req=parent_req,
+            request_index=request_index,
+            queue=queue,
+            log_stats=self.log_stats)
         self.request_states[request_id] = req_state
         self.lora_states.add_request(req_state)
         if parent_req:

vllm/v1/engine/processor.py

Lines changed: 4 additions & 6 deletions
@@ -384,6 +384,10 @@ def _validate_model_input(
             tokenizer = None
         else:
             tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
+            max_input_id = max(prompt_ids, default=0)
+            if max_input_id > tokenizer.max_token_id:
+                raise ValueError(
+                    f"Token id {max_input_id} is out of vocabulary")
 
         prompt_ids = prompt_inputs["prompt_token_ids"]
         if not prompt_ids:
@@ -392,12 +396,6 @@
         else:
             raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
-        if tokenizer:
-            max_input_id = max(prompt_ids, default=0)
-            if max_input_id > tokenizer.max_token_id:
-                raise ValueError(
-                    f"Token id {max_input_id} is out of vocabulary")
-
        max_prompt_len = self.model_config.max_model_len
        if len(prompt_ids) > max_prompt_len:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
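
The out-of-vocabulary check now runs right where the LoRA tokenizer is fetched instead of later under if tokenizer:. The validation itself is unchanged: the largest prompt token id must not exceed the tokenizer's max_token_id. A self-contained sketch of the same check with a stub tokenizer (not the real vLLM tokenizer classes):

# Sketch of the prompt-token validation; StubTokenizer is illustrative only.
from dataclasses import dataclass
from typing import List


@dataclass
class StubTokenizer:
    max_token_id: int


def validate_prompt_ids(prompt_ids: List[int], tokenizer: StubTokenizer) -> None:
    # max(..., default=0) keeps an empty prompt from raising here;
    # emptiness is rejected by a separate check.
    max_input_id = max(prompt_ids, default=0)
    if max_input_id > tokenizer.max_token_id:
        raise ValueError(f"Token id {max_input_id} is out of vocabulary")


tok = StubTokenizer(max_token_id=32_000)
validate_prompt_ids([1, 5, 31_999], tok)   # passes
try:
    validate_prompt_ids([1, 40_000], tok)  # raises ValueError
except ValueError as e:
    print(e)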

vllm/v1/worker/gpu_model_runner.py

Lines changed: 7 additions & 10 deletions
@@ -123,7 +123,7 @@ def __init__(
             cache_config.cache_dtype]
 
         self.is_multimodal_model = model_config.is_multimodal_model
-        self.is_pooling_model = model_config.is_pooling_model
+        self.is_pooling_model = model_config.pooler_config is not None
         self.model_supports_multimodal_raw_input = (
             model_config.model_supports_multimodal_raw_input)
         self.max_model_len = model_config.max_model_len
@@ -328,8 +328,6 @@ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         Args:
             scheduler_output: The scheduler output.
         """
-
-        # nothing to be reordered when the mdoel is attention free
         if self.model_config.is_attention_free:
             return False
 
@@ -1059,14 +1057,13 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
            curr_group_outputs = self.model.get_multimodal_embeddings(
                **batched_mm_inputs)
 
-            if curr_group_outputs:
-                sanity_check_mm_encoder_outputs(
-                    curr_group_outputs,
-                    expected_num_items=len(grouped_mm_inputs),
-                )
+            sanity_check_mm_encoder_outputs(
+                curr_group_outputs,
+                expected_num_items=len(grouped_mm_inputs),
+            )
 
-                for output in curr_group_outputs:
-                    encoder_outputs.append(output)
+            for output in curr_group_outputs:
+                encoder_outputs.append(output)
 
         # Cache the encoder outputs.
         for (req_id, input_id, pos_info), output in zip(
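
With the if curr_group_outputs: guard removed, the sanity check runs even when the encoder returns nothing, so a mismatch against len(grouped_mm_inputs) is caught rather than silently skipped. A toy stand-in for that count check (the real sanity_check_mm_encoder_outputs lives in vLLM and may do more than this):

# Toy stand-in for the count check; not the real vLLM helper.
from typing import Sequence

import torch


def sanity_check_outputs(outputs: Sequence[torch.Tensor],
                         expected_num_items: int) -> None:
    if len(outputs) != expected_num_items:
        raise AssertionError(
            f"Expected {expected_num_items} multimodal encoder outputs, "
            f"got {len(outputs)}")


grouped_mm_inputs = [object(), object()]          # two scheduled mm items (assumption)
outputs = [torch.zeros(4, 8), torch.zeros(4, 8)]  # one embedding per item
sanity_check_outputs(outputs, expected_num_items=len(grouped_mm_inputs))

# An empty output list is no longer skipped: with two expected items it now fails.
try:
    sanity_check_outputs([], expected_num_items=len(grouped_mm_inputs))
except AssertionError as e:
    print(e)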
