@@ -130,14 +130,15 @@ def skip_cur_request():
130
130
131
131
assert num_new_tokens > 0
132
132
watermark = getattr (self .scheduler_config , "watermark" , 0.01 )
133
- if not self ._check_watermark_for_prefill (
134
- request , num_new_tokens , computed_blocks , watermark ):
133
+ if not self ._check_watermark_for_prefill (request , num_new_tokens ,
134
+ computed_blocks .blocks ,
135
+ watermark ):
135
136
# Scheduling would exceed watermark, skip.
136
137
skip_cur_request ()
137
138
continue
138
139
139
140
new_blocks = self .kv_cache_manager .allocate_slots (
140
- request , num_new_tokens , computed_blocks )
141
+ request , num_new_tokens , new_computed_blocks = computed_blocks )
141
142
if new_blocks is None :
142
143
# The request cannot be scheduled.
143
144
break
@@ -155,9 +156,8 @@ def skip_cur_request():
155
156
156
157
if self .lora_config and request .lora_request :
157
158
scheduled_loras .add (request .lora_request .lora_int_id )
158
- req_to_new_block_ids [request .request_id ] = [
159
- b .block_id for b in computed_blocks + new_blocks
160
- ]
159
+ req_to_new_block_ids [request .request_id ] = (
160
+ self .kv_cache_manager .get_block_ids (request .request_id ))
161
161
# Update request info.
162
162
num_scheduled_tokens [request .request_id ] = num_new_tokens
163
163
token_budget -= num_new_tokens
@@ -215,9 +215,8 @@ def skip_cur_request():
215
215
# Schedule the request.
216
216
scheduled_running_reqs .append (request )
217
217
self .scheduled_req_ids .add (request .request_id )
218
- req_to_new_block_ids [request .request_id ] = [
219
- b .block_id for b in new_blocks
220
- ]
218
+ req_to_new_block_ids [request .request_id ] = (
219
+ new_blocks .get_block_ids ())
221
220
num_scheduled_tokens [request .request_id ] = num_new_tokens
222
221
token_budget -= num_new_tokens
223
222
req_index += 1
@@ -326,7 +325,8 @@ def _check_watermark_for_prefill(self,
326
325
len (computed_blocks ) * self .block_size )
327
326
num_required_blocks = cdiv (num_new_tokens + num_computed_tokens ,
328
327
self .block_size )
329
- req_blocks = self .kv_cache_manager .req_to_blocks [request .request_id ]
328
+ req_blocks = self .kv_cache_manager .single_type_manager .req_to_blocks [
329
+ request .request_id ]
330
330
num_new_blocks = (num_required_blocks - len (req_blocks ) -
331
331
len (computed_blocks ))
332
332
num_evictable_computed_blocks = sum (1 for blk in computed_blocks
0 commit comments