2
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
3
"""KV-Cache Utilities."""
4
4
5
+ import copy
5
6
import os
6
7
from collections import defaultdict , deque
7
8
from collections .abc import Iterable , Sequence
16
17
KVCacheTensor , SlidingWindowSpec )
17
18
from vllm .v1 .metrics .stats import PrefixCacheStats
18
19
from vllm .v1 .request import Request
20
+ from vllm .v1 .core .kv_cache_coordinator import calculate_optimal_block_size
19
21
20
22
logger = init_logger (__name__ )
21
23
@@ -35,9 +37,9 @@ class BlockHash(NamedTuple):
35
37
36
38
37
39
class BlockHashWithGroupId (NamedTuple ):
38
- # The hash value for the contents (e.g., token_ids) of a block without group
39
- # ID. The value is the same for blocks representing the same tokens but for
40
- # different groups.
40
+ # The hash value for the contents (e.g., token_ids) of a block without
41
+ # group ID. The value is the same for blocks representing the same tokens
42
+ # but for different groups.
41
43
block_hash : BlockHash
42
44
# The KV cache group ID.
43
45
group_id : int
@@ -54,7 +56,7 @@ def get_hash_value(self) -> int:
54
56
# a random seed if PYTHONHASHSEED is not set.
55
57
#
56
58
# The function `init_none_hash` initializes this variable globally.
57
- NONE_HASH : int
59
+ NONE_HASH : int = 0 # Default value, will be overridden by init_none_hash
58
60
59
61
60
62
def init_none_hash (hash_fn : Callable ):
@@ -76,8 +78,8 @@ class PrefixCachingMetrics:
76
78
"""Metrics for prefix caching with a hit rate of the max recent N requests.
77
79
78
80
Args:
79
- max_recent_requests: The number of the max recent requests to aggregate.
80
- Defaults to 1000.
81
+ max_recent_requests: The number of the max recent requests to
82
+ aggregate. Defaults to 1000.
81
83
"""
82
84
83
85
def __init__ (self , max_recent_requests : int = 1000 ):
@@ -196,8 +198,8 @@ class FreeKVCacheBlockQueue:
196
198
manipulating the linked list. Instead, this class manipulates the
197
199
prev_free_block and next_free_block attributes of the given blocks.
198
200
199
- The queue is ordered by block ID in the beginning. When a block is allocated
200
- and then freed, it will be appended back with the eviction order:
201
+ The queue is ordered by block ID in the beginning. When a block is
202
+ allocated and then freed, it will be appended back with the eviction order:
201
203
1. The least recently used block is at the front (LRU).
202
204
2. If two blocks have the same last accessed time (allocated by the
203
205
same sequence), the one with more hash tokens (the tail of a block
@@ -891,6 +893,24 @@ def _get_kv_cache_config_uniform_page_size(
891
893
return kv_cache_config
892
894
893
895
896
def _get_kv_cache_config_optimal_block_size(vllm_config, kv_cache_spec,
                                            available_memory):
    """Build the KV cache config for hybrid models using one block size.

    A single block size is obtained from ``calculate_optimal_block_size``
    and written onto a copy of every spec that exposes a ``block_size``
    attribute. The resulting mapping is then handed to
    ``_get_kv_cache_config_uniform_page_size``, whose result is returned
    unchanged.

    Args:
        vllm_config: Forwarded untouched to
            ``_get_kv_cache_config_uniform_page_size``.
        kv_cache_spec: Mapping from name to KV cache spec. The mapping and
            the spec objects it holds are not modified.
        available_memory: Forwarded untouched to
            ``_get_kv_cache_config_uniform_page_size``.

    Returns:
        Whatever ``_get_kv_cache_config_uniform_page_size`` returns for the
        re-sized specs.
    """
    target_block_size = calculate_optimal_block_size(kv_cache_spec)

    def _retarget(layer_spec):
        # Specs without a ``block_size`` attribute are passed through as-is
        # (the same object, not a copy).
        # NOTE(review): presumably only attention-style specs carry
        # ``block_size`` -- confirm against the spec class definitions.
        if not hasattr(layer_spec, 'block_size'):
            return layer_spec
        # Deep-copy first so the caller's spec objects keep their original
        # block size.
        resized = copy.deepcopy(layer_spec)
        resized.block_size = target_block_size
        return resized

    resized_specs = {
        layer_name: _retarget(layer_spec)
        for layer_name, layer_spec in kv_cache_spec.items()
    }

    # Delegate the actual config construction to the existing logic.
    return _get_kv_cache_config_uniform_page_size(vllm_config, resized_specs,
                                                  available_memory)
912
+
913
+
894
914
def unify_hybrid_kv_cache_specs (kv_cache_spec : dict [str , KVCacheSpec ]):
895
915
"""
896
916
This function tries to convert the KV cache specs to one type if the model
@@ -973,7 +993,10 @@ def get_kv_cache_config(
973
993
available_memory )
974
994
975
995
raise NotImplementedError
976
-
996
+ else :
997
+ return _get_kv_cache_config_optimal_block_size (vllm_config ,
998
+ kv_cache_spec ,
999
+ available_memory )
977
1000
978
1001
def unify_kv_cache_configs (kv_cache_configs : list [KVCacheConfig ]):
979
1002
"""
0 commit comments