
Commit a79b122

[V1] Do not allocate beyond the max_model_len (#10730)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1 parent: d9b4b3f
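
In short: the scheduler now threads max_model_len into KVCacheManager, which derives a per-request block budget (max_num_blocks_per_req = cdiv(max_model_len, block_size)) and clamps every block grab against it, so speculative preallocation can no longer run past the end of the fixed-shape block table.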

File tree (3 files changed: +41, -15 lines):

  tests/v1/core/test_prefix_caching.py
  vllm/v1/core/kv_cache_manager.py
  vllm/v1/core/scheduler.py

tests/v1/core/test_prefix_caching.py (16 additions, 8 deletions)

```diff
@@ -23,7 +23,8 @@ def test_prefill():
     manager = KVCacheManager(
         block_size=16,
         num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=16,
     )
@@ -121,7 +122,8 @@ def test_decode():
     manager = KVCacheManager(
         block_size=16,
         num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=16,
     )
@@ -172,7 +174,8 @@ def test_evict():
     manager = KVCacheManager(
         block_size=16,
         num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=16,
     )
@@ -220,7 +223,8 @@ def test_hash_block_correct_reuse():
     manager = KVCacheManager(
         block_size=block_size,
         num_gpu_blocks=1,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=0,
     )
@@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted():
     manager = KVCacheManager(
         block_size=block_size,
         num_gpu_blocks=2,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=0,
     )
@@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled():
     manager = KVCacheManager(
         block_size=block_size,
         num_gpu_blocks=4,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=False,
         num_preallocate_tokens=0,
     )
@@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
     manager = KVCacheManager(
         block_size=block_size,
         num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=num_preallocate_tokens,
     )
@@ -370,7 +377,8 @@ def test_cache_blocks():
     manager = KVCacheManager(
         block_size=block_size,
         num_gpu_blocks=5,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
         enable_caching=True,
         num_preallocate_tokens=0,
     )
```
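
Two things are worth noting about these updated constructor calls: sliding_window=False becomes sliding_window=None, matching the parameter's Optional[int] annotation, and every test now supplies max_model_len=8192. With block_size=16 that yields a per-request cap of cdiv(8192, 16) = 512 blocks, far above the handful of GPU blocks these tests allocate, so the cap never interferes with the existing assertions. A minimal sketch of the arithmetic, with cdiv re-implemented locally for illustration (the diff imports it from the project's utilities):

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division: the smallest integer >= a / b."""
    return -(a // -b)

# Values used throughout these tests.
block_size = 16
max_model_len = 8192

max_num_blocks_per_req = cdiv(max_model_len, block_size)
assert max_num_blocks_per_req == 512  # well above num_gpu_blocks=10
```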

vllm/v1/core/kv_cache_manager.py (17 additions, 0 deletions)

```diff
@@ -17,12 +17,15 @@ def __init__(
         self,
         block_size: int,
         num_gpu_blocks: int,
+        max_model_len: int,
         sliding_window: Optional[int] = None,
         enable_caching: bool = True,
         num_preallocate_tokens: int = 64,
     ) -> None:
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
+        self.max_model_len = max_model_len
+        self.max_num_blocks_per_req = cdiv(max_model_len, block_size)
         self.sliding_window = sliding_window
         self.enable_caching = enable_caching
         # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
@@ -132,7 +135,14 @@ def append_slots(
         num_new_blocks = min(
             num_new_blocks + self.num_preallocate_blocks,
             self.free_block_queue.num_free_blocks,
+            # Should not exceed the maximum number of blocks per request.
+            # This is especially because the block table has the shape
+            # [..., max_num_blocks_per_req].
+            # TODO(woosuk): Check and reject requests if
+            # num_prompt_tokens + max_tokens > max_model_len.
+            self.max_num_blocks_per_req - len(req_blocks),
         )
+        assert num_new_blocks > 0

         new_blocks = self._get_new_blocks(num_new_blocks)
         req_blocks.extend(new_blocks)
@@ -212,7 +222,14 @@ def allocate_slots(
             num_required_blocks + self.num_preallocate_blocks,
             self.free_block_queue.num_free_blocks -
             num_evictable_computed_blocks,
+            # Should not exceed the maximum number of blocks per request.
+            # This is especially because the block table has the shape
+            # [..., max_num_blocks_per_req].
+            # TODO(woosuk): Check and reject requests if
+            # num_prompt_tokens + max_tokens > max_model_len.
+            self.max_num_blocks_per_req - len(computed_blocks),
         )
+        assert num_new_blocks > 0

         # Concatenate the computed block IDs and the new block IDs.
         new_blocks = self._get_new_blocks(num_new_blocks)
```
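
The heart of the fix is the new argument to min() in both append_slots and allocate_slots: besides "what we want" (requested plus preallocated blocks) and "what the pool has" (free blocks), the grab is now clamped to "what the block table row can still hold" (max_num_blocks_per_req minus the blocks the request already occupies). A self-contained sketch of that clamping, with hypothetical standalone parameters standing in for the manager's state:

```python
def cdiv(a: int, b: int) -> int:
    return -(a // -b)

def capped_new_blocks(num_requested: int, num_preallocate: int,
                      num_free: int, max_blocks_per_req: int,
                      num_owned: int) -> int:
    """Mirrors the min(...) the diff adds to append_slots/allocate_slots."""
    num_new_blocks = min(
        num_requested + num_preallocate,     # what we would like to grab
        num_free,                            # what the free pool can supply
        max_blocks_per_req - num_owned,      # room left in the block table row
    )
    assert num_new_blocks > 0
    return num_new_blocks

# max_model_len=8192 and block_size=16 give a 512-block budget per request.
# A request already holding 510 blocks that needs 1 more (plus 4 speculative
# preallocated blocks) has its grab trimmed from 5 down to 2.
print(capped_new_blocks(1, 4, 1000, cdiv(8192, 16), 510))  # prints 2
```

The trailing assert mirrors the diff: per the TODO, a request whose prompt plus max_tokens exceeds max_model_len could make the cap term non-positive, and the assert surfaces that case until such requests are rejected up front.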

vllm/v1/core/scheduler.py (8 additions, 7 deletions)

```diff
@@ -33,22 +33,23 @@ def __init__(
         # TODO: Support LoRA.
         assert lora_config is None, "V1 does not support LoRA yet."

+        # Scheduling constraints.
+        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+        self.max_num_scheduled_tokens = \
+            self.scheduler_config.max_num_batched_tokens
+        self.max_model_len = self.scheduler_config.max_model_len
+
         num_gpu_blocks = cache_config.num_gpu_blocks
         assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
-        # Create the block space manager.
+        # Create the KV cache manager.
         self.kv_cache_manager = KVCacheManager(
             block_size=self.cache_config.block_size,
             num_gpu_blocks=num_gpu_blocks,
+            max_model_len=self.max_model_len,
             sliding_window=self.cache_config.sliding_window,
             enable_caching=self.cache_config.enable_prefix_caching)
         self.block_size = self.cache_config.block_size

-        # Scheduling constraints.
-        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
-        self.max_num_scheduled_tokens = \
-            self.scheduler_config.max_num_batched_tokens
-        self.max_model_len = self.scheduler_config.max_model_len
-
         # req_id -> Request
         self.requests: Dict[str, Request] = {}
         # Priority queues for requests.
```
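
Note why the scheduling-constraints block moves up: self.max_model_len is now consumed by the KVCacheManager constructor, so it must be assigned before the manager is built rather than after it, as before. The comment fix from "block space manager" to "KV cache manager" matches the class actually being constructed.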
