[v1][core] Support for attention free models #20811

Merged
7 changes: 6 additions & 1 deletion vllm/v1/core/kv_cache_manager.py
@@ -78,7 +78,12 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len

+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention-free models have no KV cache,
+            # so they don't need prefix caching.
+            enable_caching = False
         self.enable_caching = enable_caching
+
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +106,7 @@ def __init__(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
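To illustrate the caching decision above in isolation, here is a minimal, runnable sketch. StubKVCacheConfig and resolve_enable_caching are made-up stand-ins for illustration only; the real KVCacheConfig and KVCacheManager take more arguments.

# Illustrative sketch only: StubKVCacheConfig is a hypothetical stand-in for
# vLLM's KVCacheConfig; it only mimics the kv_cache_groups attribute.
from dataclasses import dataclass, field


@dataclass
class StubKVCacheConfig:
    kv_cache_groups: list = field(default_factory=list)


def resolve_enable_caching(kv_cache_config, enable_caching: bool) -> bool:
    # Mirrors the __init__ logic above: with no KV cache groups there is
    # nothing to store in a prefix cache, so caching is forced off.
    if len(kv_cache_config.kv_cache_groups) == 0:
        enable_caching = False
    return enable_caching


# An attention-free model (no KV cache groups) never gets prefix caching,
# even if the caller asked for it.
assert resolve_enable_caching(StubKVCacheConfig(), True) is False
# A regular attention model keeps the caller's setting.
assert resolve_enable_caching(
    StubKVCacheConfig(kv_cache_groups=["group0"]), True) is True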
21 changes: 20 additions & 1 deletion vllm/v1/core/kv_cache_utils.py
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """

+    # No need to check the available memory if the kv_cache_spec is empty.
+    if not kv_cache_spec:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1


+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention-free models.
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config


+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)

-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # Return a KV cache config with 0 KV cache groups and 1 block
+        # so that the KV cache manager can handle attention-free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # The KV cache of all layers is the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
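Taken together, the kv_cache_utils.py changes give attention-free models (whose per-layer spec dict is empty) a placeholder KV cache config. A hedged usage sketch follows, assuming the helpers live in vllm.v1.core.kv_cache_utils as shown in this diff and that KVCacheConfig exposes the three fields constructed above; the underscore-prefixed helper is private, so this is illustration rather than a supported API.

# Hedged sketch: module path taken from this diff.
from vllm.v1.core.kv_cache_utils import (
    _get_kv_cache_config_attention_free, is_kv_cache_type_attention_free)

kv_cache_spec = {}  # attention-free models register no KV cache specs
assert is_kv_cache_type_attention_free(kv_cache_spec)

kv_cache_config = _get_kv_cache_config_attention_free()
# Placeholder config: a single block, no KV cache tensors, no groups, so the
# KV cache manager can run without allocating GPU memory for a KV cache.
assert kv_cache_config.num_blocks == 1
assert kv_cache_config.kv_cache_tensors == []
assert kv_cache_config.kv_cache_groups == []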
8 changes: 7 additions & 1 deletion vllm/v1/engine/core.py
@@ -139,7 +139,13 @@ def _initialize_kv_caches(

         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for the KV cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention-free models don't need memory for the KV cache.
+            available_gpu_memory = [0] * len(kv_cache_specs)

         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the KV cache tensor size
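The engine-side logic reduces to a simple per-worker decision. Below is a self-contained sketch; available_memory_per_worker is a hypothetical helper, with kv_cache_specs as a list of per-worker spec dicts as in _initialize_kv_caches above.

# Hypothetical helper mirroring the engine change: kv_cache_specs holds one
# spec dict per worker; attention-free models produce empty dicts.
def available_memory_per_worker(kv_cache_specs, profile_fn):
    has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
    if has_kv_cache:
        # Normal path: profile peak usage to find memory left for the KV cache.
        return profile_fn()
    # Attention-free path: skip profiling entirely; no KV cache memory needed.
    return [0] * len(kv_cache_specs)


# Two attention-free workers: profiling is skipped and each gets 0 bytes.
assert available_memory_per_worker([{}, {}], profile_fn=None) == [0, 0]

Because check_enough_kv_cache_memory now returns early for an empty spec, these zero-byte entries no longer trip the "No available memory for the cache blocks" error.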