Commit 0923e34

Disable prefix caching when model is attention free
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent: 673aeb0

File tree (3 files changed: +11 −2 lines)

  vllm/config.py
  vllm/v1/core/kv_cache_coordinator.py
  vllm/v1/core/kv_cache_manager.py

vllm/config.py

Lines changed: 9 additions & 0 deletions
@@ -4722,6 +4722,15 @@ def __post_init__(self):
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False

+        if self.model_config.is_attention_free:
+            # If the model is not of pooling type and it is attention free,
+            # we make sure prefix_caching is disabled so that the correct
+            # KVCacheCoordinator is loaded during initialization.
+            if self.cache_config is not None:
+                logger.info("This is an attention free model, "
+                            "disabling prefix caching.")
+                self.cache_config.enable_prefix_caching = False
+
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
                 and not self.cache_config.enable_prefix_caching):
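
For illustration, a minimal sketch of what the added guard does, with ModelConfig and CacheConfig reduced to hypothetical stand-ins holding only the fields used here (the real vLLM dataclasses carry many more):

from dataclasses import dataclass


@dataclass
class ModelConfig:          # hypothetical stand-in
    is_attention_free: bool = False


@dataclass
class CacheConfig:          # hypothetical stand-in
    enable_prefix_caching: bool = True


def post_init_guard(model_config: ModelConfig,
                    cache_config: CacheConfig) -> None:
    # Mirrors the added __post_init__ branch: attention-free models get
    # prefix caching forced off so that the no-prefix-cache
    # KVCacheCoordinator is selected during initialization.
    if model_config.is_attention_free and cache_config is not None:
        cache_config.enable_prefix_caching = False


cache = CacheConfig(enable_prefix_caching=True)
post_init_guard(ModelConfig(is_attention_free=True), cache)
assert cache.enable_prefix_caching is False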

vllm/v1/core/kv_cache_coordinator.py

Lines changed: 1 addition & 1 deletion
@@ -388,7 +388,7 @@ def get_kv_cache_coordinator(
         kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
-    if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
+    if not enable_caching:
         # We instantiate this coordinator also for attention free models that
         # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
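
The effect of the one-line change, sketched with the coordinator choice reduced to a string (the real factory also threads max_model_len, use_eagle, the hashing function and the event flag into the chosen coordinator):

def choose_coordinator(enable_caching: bool) -> str:
    # Sketch only: with the config-level guard above, attention-free models
    # (which have zero KV cache groups) always arrive with enable_caching
    # False, so the explicit len(kv_cache_groups) == 0 test can be dropped.
    if not enable_caching:
        return "KVCacheCoordinatorNoPrefixCache"
    return "caching coordinator"


assert choose_coordinator(False) == "KVCacheCoordinatorNoPrefixCache"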

vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None

         self.block_size: Optional[int] = None
-        if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
+        if self.enable_caching:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)
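
A sketch of the simplified block-size setup, with KVCacheGroup and KVCacheSpec replaced by hypothetical stand-ins; the assert is assumed to require a single block size across all groups, matching the set-based check that the hunk above truncates:

from dataclasses import dataclass
from typing import Optional


@dataclass
class KVCacheSpec:          # hypothetical stand-in
    block_size: int


@dataclass
class KVCacheGroup:         # hypothetical stand-in
    kv_cache_spec: KVCacheSpec


def derive_block_size(enable_caching: bool,
                      groups: list[KVCacheGroup]) -> Optional[int]:
    # With prefix caching forced off for attention-free models, reaching this
    # point with enable_caching=True implies at least one KV cache group, so
    # the former len(...) > 0 guard is redundant.
    if not enable_caching:
        return None
    block_sizes = {g.kv_cache_spec.block_size for g in groups}
    assert len(block_sizes) == 1
    return block_sizes.pop()


assert derive_block_size(False, []) is None                        # attention-free path
assert derive_block_size(True, [KVCacheGroup(KVCacheSpec(16))]) == 16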
