Commit 4ffd963

[v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Parent commit: 56fe4be

File tree

3 files changed: +33 -3 lines

vllm/v1/core/kv_cache_manager.py

Lines changed: 6 additions & 1 deletion
@@ -78,7 +78,12 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len

+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
         self.enable_caching = enable_caching
+
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)

@@ -101,7 +106,7 @@ def __init__(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
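
The net effect of the first hunk is that prefix caching can never stay enabled for a model with no KV cache groups; the second hunk then threads the resolved self.enable_caching value, rather than the raw constructor argument, into the coordinator so both sides agree. A minimal sketch of the guard, using a hypothetical StubKVCacheConfig stand-in instead of vLLM's real KVCacheConfig:

from dataclasses import dataclass, field

@dataclass
class StubKVCacheConfig:
    # Hypothetical stand-in: only the field the guard inspects is modeled.
    kv_cache_groups: list = field(default_factory=list)

def resolve_enable_caching(kv_cache_config: StubKVCacheConfig,
                           enable_caching: bool) -> bool:
    # Mirrors the diff: zero KV cache groups marks an attention free
    # model, so prefix caching is forced off whatever the caller asked.
    if len(kv_cache_config.kv_cache_groups) == 0:
        return False
    return enable_caching

# Attention free: caching is disabled even when requested.
assert resolve_enable_caching(StubKVCacheConfig(), True) is False
# Regular model: the requested flag is kept.
assert resolve_enable_caching(StubKVCacheConfig(kv_cache_groups=["g0"]),
                              True) is True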

vllm/v1/core/kv_cache_utils.py

Lines changed: 20 additions & 1 deletion
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """

+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "

@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1


+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:

@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config


+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model

@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)

-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
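
Taken together, these hunks give get_kv_cache_config an early exit: an empty spec dict short-circuits to a placeholder config with one block and zero groups before any memory checks run. A minimal sketch of that dispatch, with StubKVCacheConfig as a hypothetical stand-in for KVCacheConfig and the uniform/hybrid branches elided:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class StubKVCacheConfig:
    # Hypothetical stand-in with the three fields the new helper sets.
    num_blocks: int
    kv_cache_tensors: list = field(default_factory=list)
    kv_cache_groups: list = field(default_factory=list)

def is_kv_cache_type_attention_free(kv_cache_spec: dict) -> bool:
    # kv_cache_spec is an empty dict for attention free models.
    return not kv_cache_spec

def get_kv_cache_config(kv_cache_spec: dict) -> Optional[StubKVCacheConfig]:
    if is_kv_cache_type_attention_free(kv_cache_spec):
        # 1 block, 0 groups: just enough state for the KV cache manager.
        return StubKVCacheConfig(num_blocks=1)
    return None  # uniform / hybrid paths elided in this sketch

cfg = get_kv_cache_config({})
assert cfg is not None and cfg.num_blocks == 1 and not cfg.kv_cache_groups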

vllm/v1/engine/core.py

Lines changed: 7 additions & 1 deletion
@@ -139,7 +139,13 @@ def _initialize_kv_caches(

         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)

         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size
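
Since kv_cache_specs holds one spec dict per worker and attention free models report empty dicts, the any(...) test is enough to decide whether profiling must run; the [0] * len(kv_cache_specs) fallback also keeps the length assertion above satisfied. A minimal sketch of the guard, with profiled_bytes standing in for the real determine_available_memory() result:

def available_kv_cache_memory(kv_cache_specs: list[dict],
                              profiled_bytes: int = 8 << 30) -> list[int]:
    # Mirrors the diff: any non-empty per-worker spec means the model
    # has KV cache, so the (expensive) profiling step runs.
    has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
    if has_kv_cache:
        # Stands in for self.model_executor.determine_available_memory().
        return [profiled_bytes] * len(kv_cache_specs)
    # Attention free models don't need memory for kv cache.
    return [0] * len(kv_cache_specs)

assert available_kv_cache_memory([{}, {}]) == [0, 0]
assert available_kv_cache_memory([{"layers.0": "full"}]) == [8 << 30]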
