Skip to content

Commit 5ac66e7

Browse files
Latest changes to adapt to upstream master
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 9aa5533 commit 5ac66e7

File tree

2 files changed

+13
-3
lines changed

2 files changed

+13
-3
lines changed

examples/offline_inference/prithvi_geospatial_mae.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,8 @@ def __init__(self):
143143
self.model = LLM(
144144
model=os.path.join(os.path.dirname(__file__), "./model"),
145145
skip_tokenizer_init=True,
146-
dtype="float32",
146+
dtype="float16",
147+
enforce_eager=True
147148
)
148149

149150
def run(self, input_data, location_coords):

vllm/v1/engine/core.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,17 @@ def _initialize_kv_caches(
134134
self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
135135
start = time.time()
136136

137-
# Get all kv cache needed by the model
138-
kv_cache_specs = self.model_executor.get_kv_cache_specs()
137+
#TODO: CP start from here
138+
if vllm_config.model_config.is_attention_free:
139+
# No need for initializing anything related to KV cache if the model
140+
# is attention free.
141+
kv_cache_specs = []
142+
kv_cache_configs = [
143+
KVCacheConfig(num_blocks=0, kv_cache_tensors={}, kv_cache_groups=[])
144+
]
145+
else:
146+
# Get all kv cache needed by the model
147+
kv_cache_specs = self.model_executor.get_kv_cache_specs()
139148

140149
# Profiles the peak memory usage of the model to determine how much
141150
# memory can be allocated for kv cache.

0 commit comments

Comments
 (0)