
Commit bb9f2db
1 parent: e8a14f8

some minor edits after first review round

4 files changed: +6 −10 lines

vllm/v1/core/kv_cache_utils.py (1 addition, 4 deletions)

@@ -744,10 +744,7 @@ def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
-    if not kv_cache_spec:
-        return True
-
-    return False
+    return not kv_cache_spec
 
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
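For context on the one-liner: an empty dict is falsy in Python, so `not kv_cache_spec` already evaluates to the boolean the removed if/else produced. A minimal standalone sketch of the equivalence (type annotations simplified, example spec values invented for illustration):

    # An empty spec dict means the model is attention free.
    def is_attention_free(kv_cache_spec: dict) -> bool:
        return not kv_cache_spec

    assert is_attention_free({}) is True                              # attention-free: empty spec
    assert is_attention_free({"layers.0.attn": object()}) is False    # has KV cache specs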

vllm/v1/executor/abstract.py (0 additions, 6 deletions)

@@ -73,16 +73,10 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
-        if self.vllm_config.model_config.is_attention_free:
-            return [0]
-
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
-        if self.vllm_config.model_config.is_attention_free:
-            return [{}]
-
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 

vllm/v1/worker/gpu_model_runner.py (2 additions, 0 deletions)

@@ -2578,6 +2578,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla

vllm/v1/worker/gpu_worker.py (3 additions, 0 deletions)

@@ -206,6 +206,9 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return 0
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes
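Taken together, the four hunks move the attention-free special-casing out of the executor and into the per-rank worker and model-runner methods, so the executor simply aggregates whatever the workers report. A simplified, hypothetical sketch of the resulting call pattern (class names and return values invented for illustration; these are not vLLM's actual classes):

    # Hypothetical, simplified classes showing where the checks now live.
    class Worker:
        def __init__(self, is_attention_free: bool):
            self.is_attention_free = is_attention_free

        def determine_available_memory(self) -> int:
            if self.is_attention_free:
                return 0  # nothing to reserve: no KV cache layers
            return 8 * 1024**3  # stand-in for the real CUDA memory probe

        def get_kv_cache_spec(self) -> dict:
            if self.is_attention_free:
                return {}  # empty dict; kv_cache_utils reads this as attention free
            return {"layers.0.attn": "spec"}  # stand-in for real KVCacheSpec objects

    class Executor:
        def __init__(self, workers):
            self.workers = workers

        # No attention-free branch here any more: just collect per-worker results.
        def determine_available_memory(self) -> list[int]:
            return [w.determine_available_memory() for w in self.workers]

        def get_kv_cache_specs(self) -> list[dict]:
            return [w.get_kv_cache_spec() for w in self.workers]

    executor = Executor([Worker(is_attention_free=True)])
    assert executor.determine_available_memory() == [0]
    assert executor.get_kv_cache_specs() == [{}]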
