From b764c9ddfffe04307a2a255fc3116c9554889e14 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 7 Jul 2025 14:46:11 +0000
Subject: [PATCH 01/10] Support for attention free models

Forces 0 KV Cache groups to disable KV Cache in attention free models

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py |  4 +++-
 vllm/v1/core/kv_cache_manager.py     |  2 +-
 vllm/v1/core/kv_cache_utils.py       | 21 ++++++++++++++++++++-
 vllm/v1/executor/abstract.py         |  6 ++++++
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index de72e60434a..7401c4b31e5 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -250,7 +250,9 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        self.verify_and_split_kv_cache_groups()
+        # attention free models are initialized with 0 kv_cache_groups
+        if len(self.kv_cache_config.kv_cache_groups) > 0:
+            self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index cbc787e8dd5..22e0341ebaf 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching:
+        if self.enable_caching and len(self.kv_cache_config.kv_cache_groups) > 0:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 544b9f59932..1fd888eb640 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
+    # No need to check for available memory if the model is attention free
+    if vllm_config.model_config.is_attention_free:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
@@ -748,6 +752,12 @@ def is_kv_cache_page_size_uniform(
     page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
     return len(page_sizes) == 1
 
+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    if not kv_cache_spec:
+        return True
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
@@ -891,6 +901,11 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config
 
 
+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1,
+                         kv_cache_tensors=[],
+                         kv_cache_groups=[])
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cahce config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 50b9634a49e..9a3aa9888ec 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -73,10 +73,16 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
+        if self.vllm_config.model_config.is_attention_free:
+            return [0]
+
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
+        if self.vllm_config.model_config.is_attention_free:
+            return [{}]
+
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 

From 5825ba45dcfcce3b6cfc94d92a084989e801f63f Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Fri, 11 Jul 2025 12:13:00 +0000
Subject: [PATCH 02/10] is_kv_cache_type_attention_free: return False if not
 attention free

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 1fd888eb640..38ad1cbafb9 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -759,6 +759,8 @@ def is_kv_cache_type_attention_free(
     if not kv_cache_spec:
         return True
 
+    return False
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:

From fc86350bbd9eccdaf40bf2257e4e6a6d4b3452dd Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Fri, 11 Jul 2025 12:57:52 +0000
Subject: [PATCH 03/10] some minor edits after first review round

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_utils.py     | 5 +----
 vllm/v1/executor/abstract.py       | 6 ------
 vllm/v1/worker/gpu_model_runner.py | 2 ++
 vllm/v1/worker/gpu_worker.py       | 3 +++
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 38ad1cbafb9..be5e5071a3e 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -756,10 +756,7 @@ def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
-    if not kv_cache_spec:
-        return True
-
-    return False
+    return not kv_cache_spec
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 9a3aa9888ec..50b9634a49e 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -73,16 +73,10 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
-        if self.vllm_config.model_config.is_attention_free:
-            return [0]
-
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
-        if self.vllm_config.model_config.is_attention_free:
-            return [{}]
-
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4551cb2df98..2ac3c083f0a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2590,6 +2590,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 6458b55777a..dff59ea5fc4 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,6 +209,9 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return 0
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From 97c11e62169b4988303157ca6456f295b0463436 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 08:12:33 +0000
Subject: [PATCH 04/10] Rebase to current master

- Changes after #20661 merge
- Fixed one pre-commit error

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py | 8 ++++----
 vllm/v1/core/kv_cache_manager.py     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 7401c4b31e5..4d8ff32e850 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -250,9 +250,7 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        # attention free models are initialized with 0 kv_cache_groups
-        if len(self.kv_cache_config.kv_cache_groups) > 0:
-            self.verify_and_split_kv_cache_groups()
+        self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
@@ -390,7 +388,9 @@ def get_kv_cache_coordinator(
         kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
-    if not enable_caching:
+    if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
+        # We instantiate this coordinator also for attention free models that
+        # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                                use_eagle, caching_hash_fn,
                                                enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 22e0341ebaf..728becec74f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching and len(self.kv_cache_config.kv_cache_groups) > 0:
+        if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)

From 673aeb067a0f59db221e0c0d05e94e5b3418efcc Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 10:22:47 +0000
Subject: [PATCH 05/10] Make pre-commits pass

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py | 2 +-
 vllm/v1/core/kv_cache_utils.py       | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 4d8ff32e850..312d08119b2 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -390,7 +390,7 @@ def get_kv_cache_coordinator(
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
     if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
         # We instantiate this coordinator also for attention free models that
-        # have 0 kv_cache_groups
+        # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                                use_eagle, caching_hash_fn,
                                                enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index be5e5071a3e..04f25bf5c92 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -752,12 +752,14 @@ def is_kv_cache_page_size_uniform(
     page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
     return len(page_sizes) == 1
 
+
 def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
     return not kv_cache_spec
 
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
@@ -901,9 +903,8 @@ def _get_kv_cache_config_uniform_page_size(
 
 
 def _get_kv_cache_config_attention_free() -> KVCacheConfig:
-    return KVCacheConfig(num_blocks=1,
-                         kv_cache_tensors=[],
-                         kv_cache_groups=[])
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
 
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """

From fb3ecfbc6d2d8e6a03a755ade914d85296d7d679 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 20:15:13 +0000
Subject: [PATCH 06/10] Disable chunk prefill and prefix caching when model is
 attention free

Signed-off-by: Christian Pinto
---
 vllm/config.py                       | 9 +++++++++
 vllm/v1/core/kv_cache_coordinator.py | 2 +-
 vllm/v1/core/kv_cache_manager.py     | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d9f356c5c60..e42f4cb35ab 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4710,6 +4710,15 @@ def __post_init__(self):
                     "Only \"last\" pooling supports chunked "
                     "prefill and prefix caching; disabling both.")
 
+        if self.model_config.is_attention_free:
+            # If the model is not of pooling type and it is attention free,
+            # we make sure chunked prefill and prefix_caching are
+            # disabled so that the correct KVCacheCoordinator
+            # is loaded.
+            disable_chunked_prefill_reasons.append(
+                "This is an attention free model, "
+                "disabling chunked prefill and prefix caching.")
+
         if disable_chunked_prefill_reasons:
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 312d08119b2..a1dc2904a3c 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -388,7 +388,7 @@ def get_kv_cache_coordinator(
         kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
-    if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
+    if not enable_caching:
         # We instantiate this coordinator also for attention free models that
         # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 728becec74f..cbc787e8dd5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
+        if self.enable_caching:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)

From 8e5dbee2b72786e3e7282a24198d7cd02c56a923 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 09:55:59 +0000
Subject: [PATCH 07/10] reworked to allow for models like mamba to use the
 kv_cache for state retention

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_manager.py   | 6 ++++--
 vllm/v1/core/kv_cache_utils.py     | 6 +++---
 vllm/v1/engine/core.py             | 6 +++++-
 vllm/v1/worker/gpu_model_runner.py | 2 --
 vllm/v1/worker/gpu_worker.py       | 2 --
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index cbc787e8dd5..0f68a57a37f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,7 +78,9 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len
 
-        self.enable_caching = enable_caching
+        self.enable_caching = (enable_caching
+                               if len(kv_cache_config.kv_cache_groups) > 0
+                               else False)
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +103,7 @@ def __init__(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 04f25bf5c92..6067a127e97 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,8 +563,8 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
-    # No need to check for available memory if the model is attention free
-    if vllm_config.model_config.is_attention_free:
+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
         return
 
     if available_memory <= 0:
@@ -973,7 +973,7 @@ def get_kv_cache_config(
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
     if is_kv_cache_type_attention_free(kv_cache_spec):
-        # This returns a kv_cahce config with 0 kv_cache groups and 1 block
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
         # to allow for the KVCache manager to handle attention free models.
         return _get_kv_cache_config_attention_free()
     elif is_kv_cache_type_uniform(kv_cache_spec):
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e2fdf6f8a11..7568bd96f85 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,7 +139,11 @@ def _initialize_kv_caches(
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        check_available_memory = not(len(kv_cache_specs) == 1 and not kv_cache_specs[0])
+        available_gpu_memory = [0]
+        if check_available_memory:
+            available_gpu_memory = (
+                self.model_executor.determine_available_memory())
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2ac3c083f0a..4551cb2df98 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2590,8 +2590,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
-        if self.vllm_config.model_config.is_attention_free:
-            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index dff59ea5fc4..3aec95a6388 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,8 +209,6 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        if self.vllm_config.model_config.is_attention_free:
-            return 0
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From 2ee7087c57ed779e2a586f904fdec485b665a0da Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 09:58:52 +0000
Subject: [PATCH 08/10] cleanup config.py

Signed-off-by: Christian Pinto
---
 vllm/config.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index e42f4cb35ab..d9f356c5c60 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4710,15 +4710,6 @@ def __post_init__(self):
                     "Only \"last\" pooling supports chunked "
                     "prefill and prefix caching; disabling both.")
 
-        if self.model_config.is_attention_free:
-            # If the model is not of pooling type and it is attention free,
-            # we make sure chunked prefill and prefix_caching are
-            # disabled so that the correct KVCacheCoordinator
-            # is loaded.
-            disable_chunked_prefill_reasons.append(
-                "This is an attention free model, "
-                "disabling chunked prefill and prefix caching.")
-
         if disable_chunked_prefill_reasons:
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)

From 19a7d7089503f2b07087f83606ff1ab7f5d0b6c0 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 10:01:29 +0000
Subject: [PATCH 09/10] cleanup gpu_worker.py

Signed-off-by: Christian Pinto
---
 vllm/v1/worker/gpu_worker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 3aec95a6388..6458b55777a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,7 +209,6 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From b8f355e8e3c6ad68c7df20e559552bd597424129 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 10:42:24 +0000
Subject: [PATCH 10/10] Edits after review

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py |  2 --
 vllm/v1/core/kv_cache_manager.py     |  9 ++++++---
 vllm/v1/engine/core.py               | 12 +++++++-----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index a1dc2904a3c..de72e60434a 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -389,8 +389,6 @@ def get_kv_cache_coordinator(
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
     if not enable_caching:
-        # We instantiate this coordinator also for attention free models that
-        # have 0 kv_cache_groups
        return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                               use_eagle, caching_hash_fn,
                                               enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 0f68a57a37f..e820a0ad6d5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,9 +78,12 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len
 
-        self.enable_caching = (enable_caching
-                               if len(kv_cache_config.kv_cache_groups) > 0
-                               else False)
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
+        self.enable_caching = enable_caching
+
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 7568bd96f85..f5c59bef478 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,11 +139,13 @@ def _initialize_kv_caches(
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        check_available_memory = not(len(kv_cache_specs) == 1 and not kv_cache_specs[0])
-        available_gpu_memory = [0]
-        if check_available_memory:
-            available_gpu_memory = (
-                self.model_executor.determine_available_memory())
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size