diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index cbc787e8dd5..e820a0ad6d5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,7 +78,12 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len
 
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
         self.enable_caching = enable_caching
+
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +106,7 @@ def __init__(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 544b9f59932..6067a127e97 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1
 
 
+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config
 
 
+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e2fdf6f8a11..f5c59bef478 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,7 +139,13 @@ def _initialize_kv_caches(
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size
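Illustration (not part of the patch): a minimal standalone sketch of the control flow this diff introduces, where an empty kv_cache_spec marks the model as attention free. KVCacheConfig below is a simplified stand-in for vLLM's real dataclass, and build_kv_cache_config is a hypothetical wrapper standing in for get_kv_cache_config; only the attention-free branch is spelled out.

from dataclasses import dataclass, field


@dataclass
class KVCacheConfig:
    # Simplified stand-in for vLLM's KVCacheConfig.
    num_blocks: int
    kv_cache_tensors: list = field(default_factory=list)
    kv_cache_groups: list = field(default_factory=list)


def is_kv_cache_type_attention_free(kv_cache_spec: dict) -> bool:
    # Attention free models register no KV cache specs at all.
    return not kv_cache_spec


def build_kv_cache_config(kv_cache_spec: dict,
                          available_memory: int) -> KVCacheConfig:
    # Hypothetical wrapper mirroring the new branch in get_kv_cache_config():
    # attention free models skip the memory check and get a placeholder
    # config with one block and zero groups.
    if is_kv_cache_type_attention_free(kv_cache_spec):
        return KVCacheConfig(num_blocks=1)
    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks.")
    raise NotImplementedError("normal sizing logic for models with attention")


config = build_kv_cache_config({}, available_memory=0)
# With zero kv_cache_groups, the manager disables prefix caching.
enable_caching = len(config.kv_cache_groups) > 0
print(config, enable_caching)  # num_blocks=1, no tensors/groups, caching off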