@@ -239,16 +239,16 @@ def schedule(self) -> SchedulerOutput:
239
239
encoder_budget = new_encoder_budget
240
240
241
241
# Record the LoRAs in scheduled_running_reqs
242
- requested_loras : set [int ] = set ()
242
+ scheduled_loras : set [int ] = set ()
243
243
if self .lora_config :
244
- requested_loras = set (
244
+ scheduled_loras = set (
245
245
req .lora_request .lora_int_id for req in scheduled_running_reqs
246
246
if req .lora_request and req .lora_request .lora_int_id > 0 )
247
- assert len (requested_loras ) <= self .lora_config .max_loras
247
+ assert len (scheduled_loras ) <= self .lora_config .max_loras
248
248
249
249
# Use a temporary deque to collect requests that need to be skipped
250
250
# and put back at the head of the waiting queue later
251
- waiting_for_fsm : deque [Request ] = deque ()
251
+ skipped_waiting_requests : deque [Request ] = deque ()
252
252
253
253
# Next, schedule the WAITING requests.
254
254
if not preempted_reqs :
@@ -258,31 +258,30 @@ def schedule(self) -> SchedulerOutput:
258
258
259
259
request = self .waiting [0 ]
260
260
261
- if request .status == RequestStatus .WAITING_FOR_FSM :
261
+ # Waiting request skipping logic
262
+ is_skipped = False
263
+ # Skip request if the structured output request is still waiting
264
+ # for FSM.
265
+ if (not is_skipped
266
+ and request .status == RequestStatus .WAITING_FOR_FSM ):
262
267
structured_output_req = request .structured_output_request
263
- if structured_output_req and structured_output_req .grammar :
268
+ is_skipped = (not structured_output_req
269
+ or not structured_output_req .grammar )
270
+ if not is_skipped :
264
271
request .status = RequestStatus .WAITING
265
- else :
266
- waiting_structured_output_req = self .waiting .popleft ()
267
- waiting_for_fsm .appendleft (
268
- waiting_structured_output_req )
269
- continue
270
-
271
- # Check that adding the request still respects the max_loras
272
- # constraint.
273
- if self .lora_config and request .lora_request :
272
+
273
+ # Skip request if max_loras can't be honored.
274
+ if (not is_skipped and self .lora_config
275
+ and request .lora_request ):
274
276
req_lora_id = request .lora_request .lora_int_id
275
- if len (requested_loras ) == self .lora_config .max_loras and (
276
- req_lora_id not in requested_loras ):
277
- # Cannot schedule.
278
- # TODO (varun): This means all the other requests in
279
- # the WAITING queue will be blocked by this request,
280
- # even if,
281
- # 1. these other requests do not use LoRA, or,
282
- # 2. these other requests use the already requested
283
- # LoRAs.
284
- # This is too conservative and could be optimized.
285
- break
277
+ is_skipped = (len (scheduled_loras )
278
+ == self .lora_config .max_loras
279
+ and (req_lora_id not in scheduled_loras ))
280
+
281
+ if is_skipped :
282
+ skipped_waiting_requests .appendleft (request )
283
+ self .waiting .popleft ()
284
+ continue
286
285
287
286
# Get already-cached tokens.
288
287
computed_blocks , num_computed_tokens = \
@@ -344,7 +343,7 @@ def schedule(self) -> SchedulerOutput:
344
343
f"Invalid request status: { request .status } " )
345
344
346
345
if self .lora_config and request .lora_request :
347
- requested_loras .add (request .lora_request .lora_int_id )
346
+ scheduled_loras .add (request .lora_request .lora_int_id )
348
347
req_to_new_block_ids [request .request_id ] = [
349
348
b .block_id for b in computed_blocks + new_blocks
350
349
]
@@ -363,8 +362,8 @@ def schedule(self) -> SchedulerOutput:
363
362
encoder_budget = new_encoder_budget
364
363
365
364
# Put back any skipped requests at the head of the waiting queue
366
- if waiting_for_fsm :
367
- self .waiting .extendleft (waiting_for_fsm )
365
+ if skipped_waiting_requests :
366
+ self .waiting .extendleft (skipped_waiting_requests )
368
367
369
368
# Check if the scheduling constraints are satisfied.
370
369
total_num_scheduled_tokens = sum (num_scheduled_tokens .values ())
0 commit comments