
Commit e0dd56a

Support for attention free models revisited to reuse the existing KVCache manager.
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 6f4944a commit e0dd56a

6 files changed: 64 additions & 40 deletions

vllm/v1/core/block_pool.py

Lines changed: 5 additions & 2 deletions
@@ -36,7 +36,8 @@ def __init__(
         enable_caching: bool,
         enable_kv_cache_events: bool = False,
     ):
-        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
+        # num_gpu_blocks can be 0 for attention free models
+        assert isinstance(num_gpu_blocks, int)
         self.num_gpu_blocks = num_gpu_blocks
         self.enable_caching = enable_caching
         # All kv-cache blocks.
@@ -60,10 +61,12 @@ def __init__(
         self.cached_block_hash_to_block: dict[BlockHashWithGroupId, dict[
             int, KVCacheBlock]] = defaultdict(dict)

+        if not self.free_block_queue:
+            self
         # To represent a placeholder block with block_id=0.
         # The ref_cnt of null_block is not maintained, needs special care to
         # avoid freeing it.
-        self.null_block = self.free_block_queue.popleft()
+        self.null_block = self.free_block_queue.popleft() if self.free_block_queue else None
         self.null_block.is_null = True

         self.enable_kv_cache_events = enable_kv_cache_events
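
Taken together, the two hunks let a BlockPool be built with num_gpu_blocks == 0 and leave null_block as None when there is nothing to pop. A minimal standalone sketch of that guard logic (Block and ToyBlockPool are toy stand-ins, not vLLM's real classes, with a deque standing in for the free-block queue):

from collections import deque
from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:
    block_id: int
    is_null: bool = False

class ToyBlockPool:
    def __init__(self, num_gpu_blocks: int) -> None:
        # Relaxed invariant: zero blocks is legal for attention free models.
        assert isinstance(num_gpu_blocks, int)
        self.free_blocks = deque(Block(i) for i in range(num_gpu_blocks))
        # Only pop a null block when the free list is non-empty.
        self.null_block: Optional[Block] = (
            self.free_blocks.popleft() if self.free_blocks else None)
        if self.null_block is not None:
            self.null_block.is_null = True

print(ToyBlockPool(num_gpu_blocks=0).null_block)  # None
print(ToyBlockPool(num_gpu_blocks=4).null_block)  # Block(block_id=0, is_null=True)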

vllm/v1/core/kv_cache_coordinator.py

Lines changed: 3 additions & 1 deletion
@@ -219,7 +219,9 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        self.verify_and_split_kv_cache_groups()
+        # attention free models are initialized with 0 kv_cache_groups
+        if len(self.kv_cache_config.kv_cache_groups) > 0:
+            self.verify_and_split_kv_cache_groups()

     def verify_and_split_kv_cache_groups(self) -> None:
         """

vllm/v1/core/kv_cache_manager.py

Lines changed: 11 additions & 6 deletions
@@ -84,12 +84,17 @@ def __init__(
         self.log_stats = log_stats
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
-        assert len(
-            set(g.kv_cache_spec.block_size
-                for g in kv_cache_config.kv_cache_groups)
-        ) == 1, "Only one block size is supported for now"
-        self.block_size = kv_cache_config.kv_cache_groups[
-            0].kv_cache_spec.block_size
+
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # This is an attention free model that is started with 0 KVCache groups.
+            self.block_size = 0
+        else:
+            assert len(
+                set(g.kv_cache_spec.block_size
+                    for g in kv_cache_config.kv_cache_groups)
+            ) == 1, "Only one block size is supported for now"
+            self.block_size = kv_cache_config.kv_cache_groups[
+                0].kv_cache_spec.block_size

         self.coordinator = get_kv_cache_coordinator(
             kv_cache_config=kv_cache_config,
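
The block-size selection above distills to a small pure function. A sketch with toy types (ToyGroup and ToySpec are stand-ins for vLLM's KV cache group and spec classes):

from dataclasses import dataclass

@dataclass
class ToySpec:
    block_size: int

@dataclass
class ToyGroup:
    kv_cache_spec: ToySpec

def pick_block_size(kv_cache_groups: list[ToyGroup]) -> int:
    if len(kv_cache_groups) == 0:
        # Attention free model: no KV cache groups, so no meaningful block size.
        return 0
    sizes = {g.kv_cache_spec.block_size for g in kv_cache_groups}
    assert len(sizes) == 1, "Only one block size is supported for now"
    return kv_cache_groups[0].kv_cache_spec.block_size

print(pick_block_size([]))                                              # 0
print(pick_block_size([ToyGroup(ToySpec(16)), ToyGroup(ToySpec(16))]))  # 16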

vllm/v1/core/kv_cache_utils.py

Lines changed: 12 additions & 8 deletions
@@ -200,14 +200,18 @@ class FreeKVCacheBlockQueue:
     def __init__(self, blocks: list[KVCacheBlock]) -> None:
         self.num_free_blocks = len(blocks)

-        # Initialize the doubly linked list of free blocks.
-        self.free_list_head: Optional[KVCacheBlock] = blocks[0]
-        self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
-        for i in range(self.num_free_blocks):
-            if i > 0:
-                blocks[i].prev_free_block = blocks[i - 1]
-            if i < self.num_free_blocks - 1:
-                blocks[i].next_free_block = blocks[i + 1]
+        # This is 0 in attention free models
+        if self.num_free_blocks > 0:
+            # Initialize the doubly linked list of free blocks.
+            self.free_list_head: Optional[KVCacheBlock] = blocks[0]
+            self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
+            for i in range(self.num_free_blocks):
+                if i > 0:
+                    blocks[i].prev_free_block = blocks[i - 1]
+                if i < self.num_free_blocks - 1:
+                    blocks[i].next_free_block = blocks[i + 1]
+        else:
+            self.free_list_head = self.free_list_tail = KVCacheBlock(block_id=0, ref_cnt=0, is_null=True)

     def popleft(self) -> KVCacheBlock:
         """Pop the first free block and reduce num_free_blocks by 1.

vllm/v1/core/sched/scheduler.py

Lines changed: 30 additions & 22 deletions
@@ -92,7 +92,9 @@ def __init__(
         )

         num_gpu_blocks = self.cache_config.num_gpu_blocks
-        assert num_gpu_blocks is not None and num_gpu_blocks > 0
+
+        # num_gpu_blocks can be zero for attention free models
+        assert num_gpu_blocks is not None

         self.block_size = self.cache_config.block_size

@@ -246,6 +248,12 @@ def schedule(self) -> SchedulerOutput:
                 request.num_tokens, 0)

             while True:
+                # This model is attention free and we do not need to allocate KVCache blocks
+                # for serving requests.
+                if self.vllm_config.model_config.is_attention_free:
+                    can_schedule = True
+                    break
+
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
@@ -438,29 +446,29 @@ def schedule(self) -> SchedulerOutput:
                 if num_new_tokens == 0:
                     # The request cannot be scheduled.
                     break
-
-                new_blocks = self.kv_cache_manager.allocate_slots(
-                    request,
-                    num_new_tokens + num_external_computed_tokens,
-                    num_new_local_computed_tokens,
-                    new_computed_blocks,
-                    num_lookahead_tokens=self.num_lookahead_tokens,
-                    delay_cache_blocks=load_kv_async,
-                )
-                if new_blocks is None:
-                    # The request cannot be scheduled.
-                    break
-
-                # KVTransfer: the connector uses this info to determine
-                # if a load is needed. Note that
-                # This information is used to determine if a load is
-                # needed for this request.
-                if self.connector is not None:
-                    self.connector.update_state_after_alloc(
+                if not self.vllm_config.model_config.is_attention_free:
+                    new_blocks = self.kv_cache_manager.allocate_slots(
                         request,
-                        new_computed_blocks + new_blocks,
-                        num_external_computed_tokens,
+                        num_new_tokens + num_external_computed_tokens,
+                        num_new_local_computed_tokens,
+                        new_computed_blocks,
+                        num_lookahead_tokens=self.num_lookahead_tokens,
+                        delay_cache_blocks=load_kv_async,
                     )
+                    if new_blocks is None:
+                        # The request cannot be scheduled.
+                        break
+
+                    # KVTransfer: the connector uses this info to determine
+                    # if a load is needed. Note that
+                    # This information is used to determine if a load is
+                    # needed for this request.
+                    if self.connector is not None:
+                        self.connector.update_state_after_alloc(
+                            request,
+                            new_computed_blocks + new_blocks,
+                            num_external_computed_tokens,
+                        )

                 # Request was already popped from self.waiting
                 # unless it was re-added above due to new_blocks being None.
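
Boiled down, the scheduler now treats attention free requests as always schedulable and consults the KV cache manager only otherwise. A toy distillation of that control flow (try_schedule and the allocate callback are hypothetical helpers, not vLLM APIs):

from typing import Callable, Optional

def try_schedule(is_attention_free: bool,
                 allocate: Callable[[], Optional[list[int]]]) -> bool:
    # Attention free models need no KV cache blocks, so scheduling can
    # never fail for lack of them.
    if is_attention_free:
        return True
    # Otherwise the request is schedulable only if block allocation succeeds.
    return allocate() is not None

print(try_schedule(True, allocate=lambda: None))     # True: no blocks needed
print(try_schedule(False, allocate=lambda: None))    # False: allocation failed
print(try_schedule(False, allocate=lambda: [3, 7]))  # True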

vllm/v1/worker/gpu_input_batch.py

Lines changed: 3 additions & 1 deletion
@@ -295,7 +295,9 @@ def add_request(
         self.num_tokens_no_spec[req_index] = request.num_tokens

         self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
-        self.block_table.add_row(request.block_ids, req_index)
+        if request.block_ids:
+            # Attention free requests have no blocks assigned
+            self.block_table.add_row(request.block_ids, req_index)

         if sampling_params := request.sampling_params:
             if (self.is_spec_decode
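
And on the worker side, the same empty-check pattern in isolation (ToyBlockTable is a stand-in for vLLM's block table, add_request a hypothetical wrapper):

class ToyBlockTable:
    def __init__(self) -> None:
        self.rows: dict[int, list[int]] = {}

    def add_row(self, block_ids: list[int], req_index: int) -> None:
        self.rows[req_index] = block_ids

def add_request(table: ToyBlockTable, block_ids: list[int], req_index: int) -> None:
    # Attention free requests have no blocks assigned, so no row is added.
    if block_ids:
        table.add_row(block_ids, req_index)

table = ToyBlockTable()
add_request(table, [], req_index=0)     # skipped: attention free request
add_request(table, [3, 7], req_index=1)
print(table.rows)                       # {1: [3, 7]}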
