# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""KV-Cache Utilities."""

+ import copy
import os
from collections import defaultdict, deque
from collections.abc import Iterable, Sequence
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import GiB_bytes, cdiv, sha256_cbor_64bit
- from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                         FullAttentionSpec, KVCacheConfig,
+ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                          KVCacheGroupSpec, KVCacheSpec,
                                          KVCacheTensor, SlidingWindowSpec)
from vllm.v1.metrics.stats import PrefixCacheStats
from vllm.v1.request import Request
-
+ from vllm.v1.core.kv_cache_coordinator import HybridKVCacheCoordinator
+ from vllm.v1.core.block_hash import BlockHash, BlockHashWithGroupId

logger = init_logger(__name__)
- class BlockHash(NamedTuple):
-     """Hash value of a block (int), the token IDs in the block, and extra keys.
-     We keep a tuple of token IDs and extra keys to reduce the likelihood of
-     hash collisions when the hash value is the same. By using SHA256 however,
-     hash collisions are practically impossible.
-     """
-     # Hash value of the block in an integer.
-     hash_value: int
-     # Token IDs in the block.
-     token_ids: tuple[int, ...]
-     # Extra keys for the block.
-     extra_keys: Optional[Any] = None
-
-
- class BlockHashWithGroupId(NamedTuple):
-     # The hash value for the contents (e.g., token_ids) of a block without group
-     # ID. The value is the same for blocks representing the same tokens but for
-     # different groups.
-     block_hash: BlockHash
-     # The KV cache group ID.
-     group_id: int
-
-     def get_hash_value(self) -> int:
-         return self.block_hash.hash_value
-
-
# The hash seed for the first block of any prefix block sequence.
#
# We use a random value to avoid hash collisions or PYTHONHASHSEED environment
@@ -55,7 +30,7 @@ def get_hash_value(self) -> int:
# a random seed if PYTHONHASHSEED is not set.
#
# The function `init_none_hash` initializes this variable globally.
- NONE_HASH: int
+ NONE_HASH: int = 0  # Default value, will be overridden by init_none_hash


def init_none_hash(hash_fn: Callable):
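The body of `init_none_hash` is unchanged by this diff and not shown. For context, here is a minimal sketch of the seeding behavior the surrounding comment describes, assuming only what the comment states (use `PYTHONHASHSEED` when set, otherwise a random value); the name `init_none_hash_sketch` and its internals are invented for illustration:

```python
import os
from typing import Callable

NONE_HASH: int = 0

def init_none_hash_sketch(hash_fn: Callable) -> None:
    # Sketch only, not the module's actual implementation.
    global NONE_HASH
    hash_seed = os.environ.get("PYTHONHASHSEED")
    if hash_seed is None:
        # No fixed seed: draw random bytes, making collisions practically impossible.
        NONE_HASH = int.from_bytes(os.urandom(32), byteorder="big")
    else:
        # Fixed seed: derive the value deterministically so processes can agree on it.
        NONE_HASH = hash_fn(hash_seed)
```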
@@ -77,8 +52,8 @@ class PrefixCachingMetrics:
    """Metrics for prefix caching with a hit rate of the max recent N requests.

    Args:
-         max_recent_requests: The number of the max recent requests to aggregate.
-             Defaults to 1000.
+         max_recent_requests: The number of the max recent requests to
+             aggregate. Defaults to 1000.
    """

    def __init__(self, max_recent_requests: int = 1000):
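A quick usage sketch of the constructor shown above (the metrics-recording API is outside this hunk, so it is not assumed here):

```python
# Aggregate prefix-cache hit rate over the 500 most recent requests
# instead of the default 1000.
metrics = PrefixCachingMetrics(max_recent_requests=500)
```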
@@ -197,8 +172,8 @@ class FreeKVCacheBlockQueue:
    manipulating the linked list. Instead, this class manipulates the
    prev_free_block and next_free_block attributes of the given blocks.

-     The queue is ordered by block ID in the beginning. When a block is allocated
-     and then freed, it will be appended back with the eviction order:
+     The queue is ordered by block ID in the beginning. When a block is
+     allocated and then freed, it will be appended back with the eviction order:
    1. The least recent used block is at the front (LRU).
    2. If two blocks have the same last accessed time (allocated by the
       same sequence), the one with more hash tokens (the tail of a block
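To make the pointer manipulation concrete, here is a minimal, self-contained sketch of a free queue that pops the LRU block from the front and appends freed blocks to the back by rewiring per-block `prev_free_block`/`next_free_block` attributes. It illustrates the design described in the docstring, not the class's actual implementation:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:
    block_id: int
    prev_free_block: Optional["Block"] = None
    next_free_block: Optional["Block"] = None

class FreeQueueSketch:
    """Doubly-linked free list: pop LRU from the front, append freed to the back."""

    def __init__(self, blocks: list[Block]) -> None:
        self.head: Optional[Block] = None
        self.tail: Optional[Block] = None
        for block in blocks:  # initially ordered by block ID
            self.append(block)

    def append(self, block: Block) -> None:
        # Link the block in at the tail; freed blocks become the newest entries.
        block.prev_free_block = self.tail
        block.next_free_block = None
        if self.tail is not None:
            self.tail.next_free_block = block
        self.tail = block
        if self.head is None:
            self.head = block

    def popleft(self) -> Block:
        # Evict from the front: the least recently used free block.
        block = self.head
        assert block is not None, "no free blocks"
        self.head = block.next_free_block
        if self.head is not None:
            self.head.prev_free_block = None
        else:
            self.tail = None
        block.prev_free_block = block.next_free_block = None
        return block
```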
@@ -747,7 +722,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
    Returns:
        The generated KVCacheConfig
    """
-
+
    page_size = get_uniform_page_size(kv_cache_spec)
    num_blocks = get_num_blocks(vllm_config, len(kv_cache_spec),
                                available_memory, page_size)
@@ -762,7 +737,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
    kv_cache_tensors = [
        KVCacheTensor(size=per_layer_size, shared_by=[layer_name])
        for layer_name in kv_cache_spec
    ]
-
+
    kv_cache_config = KVCacheConfig(
        num_blocks=num_blocks,
        kv_cache_tensors=kv_cache_tensors,
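To make the sizing step concrete: with a uniform page size, every block costs `page_size` bytes in each layer, so the block count is bounded by available memory divided by the per-block cost across all layers. A hedged numeric sketch (the real arithmetic lives in `get_num_blocks` and `get_uniform_page_size`, which are outside this diff; all numbers below are hypothetical):

```python
# Hypothetical numbers for illustration only.
available_memory = 8 * 1024**3      # 8 GiB reserved for KV cache
block_size = 16                     # tokens per block
num_kv_heads, head_size = 8, 128
dtype_bytes = 2                     # fp16/bf16
num_layers = 32

# K and V entries for every token slot in one block, in one layer.
page_size = 2 * block_size * num_kv_heads * head_size * dtype_bytes
num_blocks = available_memory // (page_size * num_layers)
print(page_size, num_blocks)        # 65536 bytes/page, 4096 blocks
```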
@@ -949,6 +924,49 @@ def _get_kv_cache_config_attention_free() -> KVCacheConfig:
    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])


+ def _get_kv_cache_config_optimal_block_size(
+         vllm_config: VllmConfig,
+         kv_cache_spec: dict[str, KVCacheSpec],
+         available_memory: int) -> KVCacheConfig:
+     """Use the optimal block size for hybrid models.
+
+     Args:
+         vllm_config: The vLLM configuration.
+         kv_cache_spec: KV cache specifications for each cache type.
+         available_memory: Available memory in bytes.
+
+     Returns:
+         KV cache configuration with the optimal block size.
+     """
+     try:
+         # Import here to avoid a circular dependency.
+         from vllm.v1.core.kv_cache_coordinator import (
+             HybridKVCacheCoordinator)
+
+         optimal_block_size = (
+             HybridKVCacheCoordinator.calculate_optimal_block_size(
+                 kv_cache_spec))
+
+         # Update the specs with the optimal size. It is applied to all
+         # specs to ensure uniformity.
+         updated_specs = {}
+         for name, spec in kv_cache_spec.items():
+             new_spec = copy.deepcopy(spec)
+             new_spec.block_size = optimal_block_size
+             updated_specs[name] = new_spec
+
+         # Reuse the existing uniform-page-size logic.
+         return _get_kv_cache_config_uniform_page_size(vllm_config,
+                                                       updated_specs,
+                                                       available_memory)
+     except Exception as e:
+         logger.warning(
+             "Failed to calculate optimal block size: %s. "
+             "Falling back to uniform page size logic.",
+             e,
+             exc_info=True)
+         return _get_kv_cache_config_uniform_page_size(vllm_config,
+                                                       kv_cache_spec,
+                                                       available_memory)
+
+
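`HybridKVCacheCoordinator.calculate_optimal_block_size` is added elsewhere and is not shown in this diff. As a purely hypothetical illustration of the idea the name suggests (one block size that all heterogeneous specs can share uniformly), a least-common-multiple approach could look like the following; the function, its signature, and its logic are invented for the sketch:

```python
import math
from functools import reduce

def calculate_optimal_block_size_sketch(block_sizes: list[int]) -> int:
    # Hypothetical: the smallest block size divisible by every spec's
    # block size, so all groups can share one uniform page layout.
    return reduce(math.lcm, block_sizes, 1)

# e.g. specs with block sizes 16 and 48 share a block size of 48
assert calculate_optimal_block_size_sketch([16, 48]) == 48
```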
def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
    """
    This function tries to convert the KV cache specs to one type if the model
@@ -977,11 +995,7 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
        isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values())
    has_sliding_window = any(
        isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values())
-     has_chunked_local_attention = any(
-         isinstance(spec, ChunkedLocalAttentionSpec)
-         for spec in kv_cache_spec.values())
-     if has_full_attention and (has_sliding_window
-                                or has_chunked_local_attention):
+     if has_full_attention and has_sliding_window:
        for layer_name, spec in kv_cache_spec.items():
            if isinstance(spec, SlidingWindowSpec):
                kv_cache_spec[layer_name] = FullAttentionSpec(
@@ -992,15 +1006,6 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
                    use_mla=spec.use_mla,
                    sliding_window=spec.sliding_window,
                )
-             elif isinstance(spec, ChunkedLocalAttentionSpec):
-                 kv_cache_spec[layer_name] = FullAttentionSpec(
-                     block_size=spec.block_size,
-                     num_kv_heads=spec.num_kv_heads,
-                     head_size=spec.head_size,
-                     dtype=spec.dtype,
-                     use_mla=spec.use_mla,
-                     attention_chunk_size=spec.attention_chunk_size,
-                 )

    if is_hybrid(kv_cache_spec):
        raise ValueError("Hybrid KV cache manager is disabled but failed to "
@@ -1024,6 +1029,7 @@ def get_kv_cache_config(
        The generated KVCacheConfigs
    """
    check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
+
    if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
        unify_hybrid_kv_cache_specs(kv_cache_spec)
@@ -1046,8 +1052,10 @@ def get_kv_cache_config(
                                                      kv_cache_spec,
                                                      available_memory)

-     raise NotImplementedError
-
+     else:
+         return _get_kv_cache_config_optimal_block_size(vllm_config,
+                                                        kv_cache_spec,
+                                                        available_memory)


def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
    """