@@ -132,7 +132,7 @@ def skip_cur_request():
132
132
continue
133
133
134
134
new_blocks = self .kv_cache_manager .allocate_slots (
135
- request , num_new_tokens , computed_blocks )
135
+ request , num_new_tokens , num_computed_tokens , computed_blocks )
136
136
if new_blocks is None :
137
137
# The request cannot be scheduled.
138
138
break
@@ -151,7 +151,7 @@ def skip_cur_request():
151
151
if self .lora_config and request .lora_request :
152
152
scheduled_loras .add (request .lora_request .lora_int_id )
153
153
req_to_new_block_ids [request .request_id ] = [
154
- b .block_id for b in computed_blocks + new_blocks
154
+ b .block_id for b in computed_blocks . blocks + new_blocks . blocks
155
155
]
156
156
# Update request info.
157
157
num_scheduled_tokens [request .request_id ] = num_new_tokens
@@ -211,7 +211,7 @@ def skip_cur_request():
211
211
scheduled_running_reqs .append (request )
212
212
self .scheduled_req_ids .add (request .request_id )
213
213
req_to_new_block_ids [request .request_id ] = [
214
- b .block_id for b in new_blocks
214
+ b .block_id for b in new_blocks . blocks
215
215
]
216
216
num_scheduled_tokens [request .request_id ] = num_new_tokens
217
217
token_budget -= num_new_tokens
@@ -307,7 +307,7 @@ def _check_watermark_for_prefill(self,
307
307
num_new_tokens ,
308
308
computed_blocks ,
309
309
watermark = 0.01 ):
310
- computed_blocks = computed_blocks or []
310
+ computed_blocks = computed_blocks . blocks or []
311
311
watermark_blocks = self .kv_cache_config .num_blocks * watermark
312
312
num_computed_tokens = (request .num_computed_tokens +
313
313
len (computed_blocks ) * self .block_size )
0 commit comments