@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1
 
 
+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention-free models
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
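The new predicate relies on dict truthiness: an empty dict is falsy, so `not kv_cache_spec` is `True` exactly when no layer registered a KV cache spec. A quick illustration, with a string key and `object()` value standing in for a real layer name and `KVCacheSpec` instance:

```python
def is_kv_cache_type_attention_free(kv_cache_spec: dict) -> bool:
    # kv_cache_spec is an empty dict for attention-free models
    return not kv_cache_spec

assert is_kv_cache_type_attention_free({})
assert not is_kv_cache_type_attention_free({"model.layers.0.attn": object()})
```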
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config
 
 
+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
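The shape of the placeholder config, sketched here with a local dataclass standing in for vLLM's `KVCacheConfig` (field names are taken from the diff above; the real class has more machinery): one block, no tensors, no groups, so the KV cache manager gets a valid object but never allocates real cache memory.

```python
from dataclasses import dataclass, field

@dataclass
class KVCacheConfig:  # local stand-in that mirrors the fields used above
    num_blocks: int
    kv_cache_tensors: list = field(default_factory=list)
    kv_cache_groups: list = field(default_factory=list)

def _get_kv_cache_config_attention_free() -> KVCacheConfig:
    # One placeholder block and zero groups: enough structure for the
    # manager to run, with no real cache memory behind it.
    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])

print(_get_kv_cache_config_attention_free())
# KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
```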
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
+        # to allow the KV cache manager to handle attention-free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
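The branch order is the important part of this hunk: the attention-free check must run before the uniformity checks, so an empty spec never reaches the memory-sizing paths. A simplified dispatch sketch (branch bodies return labels instead of configs, the uniformity helper is an illustrative stand-in, and the real function also consults `vllm_config` and `available_memory`):

```python
def is_uniform(spec: dict) -> bool:
    # Illustrative stand-in for is_kv_cache_type_uniform: here, "uniform"
    # means all layers share a single spec type.
    return len({type(s) for s in spec.values()}) == 1

def get_kv_cache_config_sketch(kv_cache_spec: dict) -> str:
    # Order matters: an empty spec must short-circuit before any
    # memory-sizing branch runs.
    if not kv_cache_spec:
        return "attention_free"    # placeholder config: 0 groups, 1 block
    if is_uniform(kv_cache_spec):
        return "uniform"           # same allocation for every layer
    return "uniform_page_size"     # hybrid models sharing one page size

assert get_kv_cache_config_sketch({}) == "attention_free"
```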