Skip to content

Commit a5cfbab

Browse files
varun-sundar-rabindranath and Varun Sundar Rabindranath authored
[Core] LoRA: V1 Scheduler optimization (#15422)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
1 parent ac3cd6e commit a5cfbab

File tree

1 file changed

+28
-29
lines changed

1 file changed

+28
-29
lines changed

vllm/v1/core/sched/scheduler.py

Lines changed: 28 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -239,16 +239,16 @@ def schedule(self) -> SchedulerOutput:
239239
encoder_budget = new_encoder_budget
240240

241241
# Record the LoRAs in scheduled_running_reqs
242-
requested_loras: set[int] = set()
242+
scheduled_loras: set[int] = set()
243243
if self.lora_config:
244-
requested_loras = set(
244+
scheduled_loras = set(
245245
req.lora_request.lora_int_id for req in scheduled_running_reqs
246246
if req.lora_request and req.lora_request.lora_int_id > 0)
247-
assert len(requested_loras) <= self.lora_config.max_loras
247+
assert len(scheduled_loras) <= self.lora_config.max_loras
248248

249249
# Use a temporary deque to collect requests that need to be skipped
250250
# and put back at the head of the waiting queue later
251-
waiting_for_fsm: deque[Request] = deque()
251+
skipped_waiting_requests: deque[Request] = deque()
252252

253253
# Next, schedule the WAITING requests.
254254
if not preempted_reqs:
@@ -258,31 +258,30 @@ def schedule(self) -> SchedulerOutput:
258258

259259
request = self.waiting[0]
260260

261-
if request.status == RequestStatus.WAITING_FOR_FSM:
261+
# Waiting request skipping logic
262+
is_skipped = False
263+
# Skip request if the structured output request is still waiting
264+
# for FSM.
265+
if (not is_skipped
266+
and request.status == RequestStatus.WAITING_FOR_FSM):
262267
structured_output_req = request.structured_output_request
263-
if structured_output_req and structured_output_req.grammar:
268+
is_skipped = (not structured_output_req
269+
or not structured_output_req.grammar)
270+
if not is_skipped:
264271
request.status = RequestStatus.WAITING
265-
else:
266-
waiting_structured_output_req = self.waiting.popleft()
267-
waiting_for_fsm.appendleft(
268-
waiting_structured_output_req)
269-
continue
270-
271-
# Check that adding the request still respects the max_loras
272-
# constraint.
273-
if self.lora_config and request.lora_request:
272+
273+
# Skip request if max_loras can't be honored.
274+
if (not is_skipped and self.lora_config
275+
and request.lora_request):
274276
req_lora_id = request.lora_request.lora_int_id
275-
if len(requested_loras) == self.lora_config.max_loras and (
276-
req_lora_id not in requested_loras):
277-
# Cannot schedule.
278-
# TODO (varun): This means all the other requests in
279-
# the WAITING queue will be blocked by this request,
280-
# even if,
281-
# 1. these other requests do not use LoRA, or,
282-
# 2. these other requests use the already requested
283-
# LoRAs.
284-
# This is too conservative and could be optimized.
285-
break
277+
is_skipped = (len(scheduled_loras)
278+
== self.lora_config.max_loras
279+
and (req_lora_id not in scheduled_loras))
280+
281+
if is_skipped:
282+
skipped_waiting_requests.appendleft(request)
283+
self.waiting.popleft()
284+
continue
286285

287286
# Get already-cached tokens.
288287
computed_blocks, num_computed_tokens = \
@@ -344,7 +343,7 @@ def schedule(self) -> SchedulerOutput:
344343
f"Invalid request status: {request.status}")
345344

346345
if self.lora_config and request.lora_request:
347-
requested_loras.add(request.lora_request.lora_int_id)
346+
scheduled_loras.add(request.lora_request.lora_int_id)
348347
req_to_new_block_ids[request.request_id] = [
349348
b.block_id for b in computed_blocks + new_blocks
350349
]
@@ -363,8 +362,8 @@ def schedule(self) -> SchedulerOutput:
363362
encoder_budget = new_encoder_budget
364363

365364
# Put back any skipped requests at the head of the waiting queue
366-
if waiting_for_fsm:
367-
self.waiting.extendleft(waiting_for_fsm)
365+
if skipped_waiting_requests:
366+
self.waiting.extendleft(skipped_waiting_requests)
368367

369368
# Check if the scheduling constraints are satisfied.
370369
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())

0 commit comments

Comments
 (0)