From b764c9ddfffe04307a2a255fc3116c9554889e14 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 7 Jul 2025 14:46:11 +0000
Subject: [PATCH 01/10] Support for attention free models

Forces 0 KV Cache groups to disable KV Cache in attention free models

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py |  4 +++-
 vllm/v1/core/kv_cache_manager.py     |  2 +-
 vllm/v1/core/kv_cache_utils.py       | 21 ++++++++++++++++++++-
 vllm/v1/executor/abstract.py         |  6 ++++++
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index de72e60434a..7401c4b31e5 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -250,7 +250,9 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        self.verify_and_split_kv_cache_groups()
+        # attention free models are initialized with 0 kv_cache_groups
+        if len(self.kv_cache_config.kv_cache_groups) > 0:
+            self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index cbc787e8dd5..22e0341ebaf 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching:
+        if self.enable_caching and len(self.kv_cache_config.kv_cache_groups) > 0:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 544b9f59932..1fd888eb640 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
+    # No need to check for available memory if the model is attention free
+    if vllm_config.model_config.is_attention_free:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
@@ -748,6 +752,12 @@ def is_kv_cache_page_size_uniform(
     page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
     return len(page_sizes) == 1
 
+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    if not kv_cache_spec:
+        return True
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
@@ -891,6 +901,11 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config
 
 
+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1,
+                         kv_cache_tensors=[],
+                         kv_cache_groups=[])
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cahce config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 50b9634a49e..9a3aa9888ec 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -73,10 +73,16 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
+        if self.vllm_config.model_config.is_attention_free:
+            return [0]
+
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
+        if self.vllm_config.model_config.is_attention_free:
+            return [{}]
+
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 

From 5825ba45dcfcce3b6cfc94d92a084989e801f63f Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Fri, 11 Jul 2025 12:13:00 +0000
Subject: [PATCH 02/10] is_kv_cache_type_attention_free: return False if not
 attention free

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 1fd888eb640..38ad1cbafb9 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -759,6 +759,8 @@ def is_kv_cache_type_attention_free(
     if not kv_cache_spec:
         return True
 
+    return False
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:

From fc86350bbd9eccdaf40bf2257e4e6a6d4b3452dd Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Fri, 11 Jul 2025 12:57:52 +0000
Subject: [PATCH 03/10] some minor edits after first review round

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_utils.py     | 5 +----
 vllm/v1/executor/abstract.py       | 6 ------
 vllm/v1/worker/gpu_model_runner.py | 2 ++
 vllm/v1/worker/gpu_worker.py       | 3 +++
 4 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 38ad1cbafb9..be5e5071a3e 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -756,10 +756,7 @@ def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
-    if not kv_cache_spec:
-        return True
-
-    return False
+    return not kv_cache_spec
 
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 9a3aa9888ec..50b9634a49e 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -73,16 +73,10 @@ def register_failure_callback(self, callback: FailureCallback):
         pass
 
     def determine_available_memory(self) -> list[int]:  # in bytes
-        if self.vllm_config.model_config.is_attention_free:
-            return [0]
-
         output = self.collective_rpc("determine_available_memory")
         return output
 
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
-        if self.vllm_config.model_config.is_attention_free:
-            return [{}]
-
         output = self.collective_rpc("get_kv_cache_spec")
         return output
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4551cb2df98..2ac3c083f0a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2590,6 +2590,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 6458b55777a..dff59ea5fc4 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,6 +209,9 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
+        if self.vllm_config.model_config.is_attention_free:
+            return 0
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From 97c11e62169b4988303157ca6456f295b0463436 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 08:12:33 +0000
Subject: [PATCH 04/10] Rebase to current master

- Changes after #20661 merge
- Fixed one pre-commit error

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py | 8 ++++----
 vllm/v1/core/kv_cache_manager.py     | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 7401c4b31e5..4d8ff32e850 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -250,9 +250,7 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        # attention free models are initialized with 0 kv_cache_groups
-        if len(self.kv_cache_config.kv_cache_groups) > 0:
-            self.verify_and_split_kv_cache_groups()
+        self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
         """
@@ -390,7 +388,9 @@ def get_kv_cache_coordinator(
         kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
-    if not enable_caching:
+    if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
+        # We instantiate this coordinator also for attention free models that
+        # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                                use_eagle, caching_hash_fn,
                                                enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 22e0341ebaf..728becec74f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching and len(self.kv_cache_config.kv_cache_groups) > 0:
+        if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)

From 673aeb067a0f59db221e0c0d05e94e5b3418efcc Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 10:22:47 +0000
Subject: [PATCH 05/10] Make pre-commits pass

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py | 2 +-
 vllm/v1/core/kv_cache_utils.py       | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 4d8ff32e850..312d08119b2 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -390,7 +390,7 @@ def get_kv_cache_coordinator(
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
     if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
         # We instantiate this coordinator also for attention free models that
-        # have 0 kv_cache_groups
+        # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                                use_eagle, caching_hash_fn,
                                                enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index be5e5071a3e..04f25bf5c92 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -752,12 +752,14 @@ def is_kv_cache_page_size_uniform(
     page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()}
     return len(page_sizes) == 1
 
+
 def is_kv_cache_type_attention_free(
         kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
 
     # kv_cache_spec is an empty dict for attention free models
     return not kv_cache_spec
 
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
@@ -901,9 +903,8 @@ def _get_kv_cache_config_uniform_page_size(
 
 
 def _get_kv_cache_config_attention_free() -> KVCacheConfig:
-    return KVCacheConfig(num_blocks=1,
-                         kv_cache_tensors=[],
-                         kv_cache_groups=[])
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
 
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """

From fb3ecfbc6d2d8e6a03a755ade914d85296d7d679 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Mon, 14 Jul 2025 20:15:13 +0000
Subject: [PATCH 06/10] Disable chunk prefill and prefix caching when model is
 attention free

Signed-off-by: Christian Pinto
---
 vllm/config.py                       | 9 +++++++++
 vllm/v1/core/kv_cache_coordinator.py | 2 +-
 vllm/v1/core/kv_cache_manager.py     | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index d9f356c5c60..e42f4cb35ab 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4710,6 +4710,15 @@ def __post_init__(self):
                     "Only \"last\" pooling supports chunked "
                     "prefill and prefix caching; disabling both.")
 
+        if self.model_config.is_attention_free:
+            # If the model is not of pooling type and it is attention free,
+            # we make sure chunked prefill and prefix_caching are
+            # disabled so that the correct KVCacheCoordinator
+            # is loaded.
+            disable_chunked_prefill_reasons.append(
+                "This is an attention free model, "
+                "disabling chunked prefill and prefix caching.")
+
         if disable_chunked_prefill_reasons:
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 312d08119b2..a1dc2904a3c 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -388,7 +388,7 @@ def get_kv_cache_coordinator(
         kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
-    if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
+    if not enable_caching:
         # We instantiate this coordinator also for attention free models that
         # have 0 kv_cache_groups
         return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 728becec74f..cbc787e8dd5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -89,7 +89,7 @@ def __init__(
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
         self.block_size: Optional[int] = None
-        if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
+        if self.enable_caching:
             assert len(
                 set(g.kv_cache_spec.block_size
                     for g in kv_cache_config.kv_cache_groups)

From 8e5dbee2b72786e3e7282a24198d7cd02c56a923 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 09:55:59 +0000
Subject: [PATCH 07/10] reworked to allow for models like mamba to use the
 kv_cache for state retention

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_manager.py   | 6 ++++--
 vllm/v1/core/kv_cache_utils.py     | 6 +++---
 vllm/v1/engine/core.py             | 6 +++++-
 vllm/v1/worker/gpu_model_runner.py | 2 --
 vllm/v1/worker/gpu_worker.py       | 2 --
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index cbc787e8dd5..0f68a57a37f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,7 +78,9 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len
 
-        self.enable_caching = enable_caching
+        self.enable_caching = (enable_caching
+                               if len(kv_cache_config.kv_cache_groups) > 0
+                               else False)
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +103,7 @@ def __init__(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 04f25bf5c92..6067a127e97 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -563,8 +563,8 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
-    # No need to check for available memory if the model is attention free
-    if vllm_config.model_config.is_attention_free:
+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
         return
 
     if available_memory <= 0:
@@ -973,7 +973,7 @@ def get_kv_cache_config(
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
     if is_kv_cache_type_attention_free(kv_cache_spec):
-        # This returns a kv_cahce config with 0 kv_cache groups and 1 block
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
         # to allow for the KVCache manager to handle attention free models.
         return _get_kv_cache_config_attention_free()
     elif is_kv_cache_type_uniform(kv_cache_spec):
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e2fdf6f8a11..7568bd96f85 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,7 +139,11 @@ def _initialize_kv_caches(
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        check_available_memory = not(len(kv_cache_specs) == 1 and not kv_cache_specs[0])
+        available_gpu_memory = [0]
+        if check_available_memory:
+            available_gpu_memory = (
+                self.model_executor.determine_available_memory())
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2ac3c083f0a..4551cb2df98 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2590,8 +2590,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
             KVCacheSpec: A dictionary mapping layer names to their KV cache
             format. Layers that do not need KV cache are not included.
         """
-        if self.vllm_config.model_config.is_attention_free:
-            return {}
 
         block_size = self.vllm_config.cache_config.block_size
         use_mla = self.vllm_config.model_config.use_mla
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index dff59ea5fc4..3aec95a6388 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,8 +209,6 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        if self.vllm_config.model_config.is_attention_free:
-            return 0
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From 2ee7087c57ed779e2a586f904fdec485b665a0da Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 09:58:52 +0000
Subject: [PATCH 08/10] cleanup config.py

Signed-off-by: Christian Pinto
---
 vllm/config.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index e42f4cb35ab..d9f356c5c60 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4710,15 +4710,6 @@ def __post_init__(self):
                     "Only \"last\" pooling supports chunked "
                     "prefill and prefix caching; disabling both.")
 
-        if self.model_config.is_attention_free:
-            # If the model is not of pooling type and it is attention free,
-            # we make sure chunked prefill and prefix_caching are
-            # disabled so that the correct KVCacheCoordinator
-            # is loaded.
-            disable_chunked_prefill_reasons.append(
-                "This is an attention free model, "
-                "disabling chunked prefill and prefix caching.")
-
         if disable_chunked_prefill_reasons:
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)

From 19a7d7089503f2b07087f83606ff1ab7f5d0b6c0 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 10:01:29 +0000
Subject: [PATCH 09/10] cleanup gpu_worker.py

Signed-off-by: Christian Pinto
---
 vllm/v1/worker/gpu_worker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 3aec95a6388..6458b55777a 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,7 +209,6 @@ def determine_available_memory(self) -> int:
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
         GiB = lambda b: b / GiB_bytes

From b8f355e8e3c6ad68c7df20e559552bd597424129 Mon Sep 17 00:00:00 2001
From: Christian Pinto
Date: Tue, 15 Jul 2025 10:42:24 +0000
Subject: [PATCH 10/10] Edits after review

Signed-off-by: Christian Pinto
---
 vllm/v1/core/kv_cache_coordinator.py |  2 --
 vllm/v1/core/kv_cache_manager.py     |  9 ++++++---
 vllm/v1/engine/core.py               | 12 +++++++-----
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index a1dc2904a3c..de72e60434a 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -389,8 +389,6 @@ def get_kv_cache_coordinator(
         enable_caching: bool, caching_hash_fn: Callable,
         enable_kv_cache_events: bool) -> KVCacheCoordinator:
     if not enable_caching:
-        # We instantiate this coordinator also for attention free models that
-        # have 0 kv_cache_groups
        return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
                                               use_eagle, caching_hash_fn,
                                               enable_kv_cache_events)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 0f68a57a37f..e820a0ad6d5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -78,9 +78,12 @@ def __init__(
     ) -> None:
         self.max_model_len = max_model_len
 
-        self.enable_caching = (enable_caching
-                               if len(kv_cache_config.kv_cache_groups) > 0
-                               else False)
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
+        self.enable_caching = enable_caching
+
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 7568bd96f85..f5c59bef478 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -139,11 +139,13 @@ def _initialize_kv_caches(
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        check_available_memory = not(len(kv_cache_specs) == 1 and not kv_cache_specs[0])
-        available_gpu_memory = [0]
-        if check_available_memory:
-            available_gpu_memory = (
-                self.model_executor.determine_available_memory())
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size