Commit f4224b2

some minor edits after first review round
1 parent 2deb1e4 commit f4224b2

4 files changed, +6 -10 lines changed

vllm/v1/core/kv_cache_utils.py

Lines changed: 1 addition & 4 deletions
@@ -756,10 +756,7 @@ def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
-    if not kv_cache_spec:
-        return True
-
-    return False
+    return not kv_cache_spec
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
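
The change above collapses a four-line conditional into a single boolean expression. A minimal sketch of the resulting helper, reconstructed from the hunk (the KVCacheSpec import path is an assumption based on the rest of vLLM, and the neighbouring functions are elided):

    # Sketch reconstructed from the diff above; import path assumed.
    from vllm.v1.kv_cache_interface import KVCacheSpec

    def is_kv_cache_type_attention_free(
            kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
        # kv_cache_spec is an empty dict for attention free models,
        # so "no specs at all" means the model is attention free.
        return not kv_cache_spec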

vllm/v1/executor/abstract.py

Lines changed: 0 additions & 6 deletions
@@ -73,16 +73,10 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
-        if self.vllm_config.model_config.is_attention_free:
-            return [0]
-
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
-        if self.vllm_config.model_config.is_attention_free:
-            return [{}]
-
         output = self.collective_rpc("get_kv_cache_spec")
         return output
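
With the two early returns removed, the executor no longer special-cases attention-free models; it always fans the calls out to the workers over collective_rpc and lets them decide. A sketch of the resulting methods, reconstructed from the hunk (the Executor class scaffolding and the collective_rpc plumbing are assumed, since they are not part of this diff):

    class Executor:
        # Only the two methods touched by this commit are sketched here.

        def determine_available_memory(self) -> list[int]:  # in bytes
            # Always ask the workers; the attention-free shortcut now lives
            # in the worker itself (see gpu_worker.py below).
            output = self.collective_rpc("determine_available_memory")
            return output

        def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
            # Likewise, the model runner returns {} for attention-free
            # models (see gpu_model_runner.py below).
            output = self.collective_rpc("get_kv_cache_spec")
            return output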

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 0 deletions
@@ -2590,6 +2590,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla
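
The attention-free check now lives in the model runner: such models simply report an empty KV cache spec, which is exactly what the simplified is_kv_cache_type_attention_free() helper above tests for. A sketch of the method head after the change, reconstructed from the hunk (docstring abbreviated, the rest of the spec construction elided):

    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
        """Map layer names to their KV cache format; layers that do not
        need a KV cache are not included."""
        # New early return: attention-free models have no KV cache layers.
        if self.vllm_config.model_config.is_attention_free:
            return {}

        block_size = self.vllm_config.cache_config.block_size
        use_mla = self.vllm_config.model_config.use_mla
        # ... per-layer spec construction continues here ...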

vllm/v1/worker/gpu_worker.py

Lines changed: 3 additions & 0 deletions
@@ -209,6 +209,9 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return 0
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes
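
Symmetrically, the GPU worker now reports zero available KV cache memory for attention-free models before touching any CUDA memory statistics. A sketch of the method head after the change, reconstructed from the hunk (docstring abbreviated, the remaining profiling logic elided):

    def determine_available_memory(self) -> int:
        """Return available KV cache memory in bytes; usage can be limited
        via the `gpu_memory_utilization` parameter."""
        # Attention-free models allocate no KV cache, so there is nothing
        # to profile.
        if self.vllm_config.model_config.is_attention_free:
            return 0

        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        # ... remaining GPU memory profiling elided ...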
