
Commit bb9f2db
1 parent: e8a14f8

some minor edits after first review round

4 files changed: +6 −10 lines

vllm/v1/core/kv_cache_utils.py (1 addition, 4 deletions)

@@ -744,10 +744,7 @@ def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
-    if not kv_cache_spec:
-        return True
-
-    return False
+    return not kv_cache_spec
 
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
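For context on the one-liner: an empty dict is falsy in Python, so `not kv_cache_spec` already evaluates to the boolean the removed if/else produced. A minimal standalone sketch of the equivalence (type annotations simplified, example spec values invented for illustration):

    # An empty spec dict means the model is attention free.
    def is_attention_free(kv_cache_spec: dict) -> bool:
        return not kv_cache_spec

    assert is_attention_free({}) is True                              # attention-free: empty spec
    assert is_attention_free({"layers.0.attn": object()}) is False    # has KV cache specs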

vllm/v1/executor/abstract.py (0 additions, 6 deletions)

@@ -73,16 +73,10 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
-        if self.vllm_config.model_config.is_attention_free:
-            return [0]
-
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
-        if self.vllm_config.model_config.is_attention_free:
-            return [{}]
-
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 

vllm/v1/worker/gpu_model_runner.py (2 additions, 0 deletions)

@@ -2578,6 +2578,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla

vllm/v1/worker/gpu_worker.py (3 additions, 0 deletions)

@@ -206,6 +206,9 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return 0
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes
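Taken together, the four hunks move the attention-free special-casing out of the executor and into the per-rank worker and model-runner methods, so the executor simply aggregates whatever the workers report. A simplified, hypothetical sketch of the resulting call pattern (class names and return values invented for illustration; these are not vLLM's actual classes):

    # Hypothetical, simplified classes showing where the checks now live.
    class Worker:
        def __init__(self, is_attention_free: bool):
            self.is_attention_free = is_attention_free

        def determine_available_memory(self) -> int:
            if self.is_attention_free:
                return 0  # nothing to reserve: no KV cache layers
            return 8 * 1024**3  # stand-in for the real CUDA memory probe

        def get_kv_cache_spec(self) -> dict:
            if self.is_attention_free:
                return {}  # empty dict; kv_cache_utils reads this as attention free
            return {"layers.0.attn": "spec"}  # stand-in for real KVCacheSpec objects

    class Executor:
        def __init__(self, workers):
            self.workers = workers

        # No attention-free branch here any more: just collect per-worker results.
        def determine_available_memory(self) -> list[int]:
            return [w.determine_available_memory() for w in self.workers]

        def get_kv_cache_specs(self) -> list[dict]:
            return [w.get_kv_cache_spec() for w in self.workers]

    executor = Executor([Worker(is_attention_free=True)])
    assert executor.determine_available_memory() == [0]
    assert executor.get_kv_cache_specs() == [{}]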
