Commit 19362e9 (2 parents: f743e6b + 52d9254)

Merge branch 'serve-v1' into lkchen-ray_data_llm

Signed-off-by: Linkun Chen <github@lkchen.net>

1 file changed (+56, -9)

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 56 additions & 9 deletions
@@ -305,12 +305,43 @@ async def start(self):
         logger.info("Started vLLM engine.")
 
     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, then it tries to use v1 engine. However,
+        #    if any feature specified in the engine config is not supported, then
+        #    it falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM will fall back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, then it will use v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, force using v0 engine.
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine. Note that we only use _get_async_engine_args
+        to get the engine args and don't use _get_vllm_engine_config, because
+        we integrate vLLM v1 using the highest-level async engine API.
+        TODO: Refactor vLLM v0 integration to use the same async engine API
+        to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient
 
         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
         engine_args, engine_config = _get_vllm_engine_config(self.llm_config)
 
-        if MQLLMEngineClient.is_unsupported_config(engine_args):
+        if MQLLMEngineClient.is_unsupported_config(engine_config):
             # If the engine is not supported, we fall back to the legacy async engine.
             #
             # Note (genesu): as of 2025-02-11, this code path is only triggered when
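
The dispatch added in this hunk hinges entirely on the VLLM_USE_V1 environment variable. Below is a minimal, hypothetical sketch of that selection logic in isolation: a plain os.environ lookup stands in for vllm.envs.VLLM_USE_V1 (assuming an unset variable reads as falsy, which is what the `if not envs.VLLM_USE_V1` guard implies), and the two coroutines are placeholders, not the real Serve methods.

import asyncio
import os


async def _start_engine_v0() -> str:
    # Placeholder for the real v0 startup path.
    return "v0 engine"


async def _start_engine_v1() -> str:
    # Placeholder for the real v1 startup path.
    return "v1 engine"


async def start_engine() -> str:
    # Stand-in for vllm.envs.VLLM_USE_V1: only an explicit "1" takes the
    # v1 path; unset or "0" falls through to v0, mirroring the guard above.
    use_v1 = os.environ.get("VLLM_USE_V1", "0") == "1"
    if not use_v1:
        return await _start_engine_v0()
    return await _start_engine_v1()


if __name__ == "__main__":
    os.environ["VLLM_USE_V1"] = "1"
    print(asyncio.run(start_engine()))  # -> v1 engine

Running it with VLLM_USE_V1=1 exercises the v1 branch; unsetting the variable (or setting it to 0) falls through to v0.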
@@ -502,20 +533,36 @@ async def _generate(
         )
 
         if request_output is not None:
-            time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
             total_request_time = time.perf_counter() - start
-            generation_time = (
-                total_request_time - request_output.metrics.time_in_queue
-            )
+            if request_output.metrics is None:
+                # vLLM V1 metrics are not included in the request output yet.
+                queue_time = "N/A"
+                generation_time_str = "N/A"
+                tokens_s = "N/A"
+                generated_tokens_s = "N/A"
+            else:
+                time_in_queue_histogram.observe(
+                    request_output.metrics.time_in_queue
+                )
+                queue_time = f"{request_output.metrics.time_in_queue}s"
+                generation_time = (
+                    total_request_time - request_output.metrics.time_in_queue
+                )
+                generation_time_str = f"{generation_time}s"
+                tokens_s = (
+                    num_input_tokens + all_tokens_collected
+                ) / generation_time
+                generated_tokens_s = all_tokens_collected / generation_time
+
             logger.info(
                 f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                 f"Total time: {total_request_time}s, "
-                f"Queue time: {request_output.metrics.time_in_queue}s, "
-                f"Generation+async time: {generation_time}s, "
+                f"Queue time: {queue_time}, "
+                f"Generation+async time: {generation_time_str}, "
                 f"Input tokens: {num_input_tokens}, "
                 f"Generated tokens: {all_tokens_collected}, "
-                f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                f"generated tokens/s: {all_tokens_collected / generation_time}."
+                f"tokens/s: {tokens_s}, "
+                f"generated tokens/s: {generated_tokens_s}."
             )
         else:
             logger.warning(
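
The point of this second hunk is that under vLLM v1, request_output.metrics can be None, so every derived value must be guarded before it reaches the log line. Below is a self-contained sketch of that guard; RequestMetrics and format_metrics are illustrative stand-ins for vLLM's metrics object and the inlined logging code above, and the histogram observation is omitted.

from dataclasses import dataclass
from typing import Optional


@dataclass
class RequestMetrics:
    # Seconds the request waited before being scheduled (stand-in field).
    time_in_queue: float


def format_metrics(
    metrics: Optional[RequestMetrics],
    total_request_time: float,
    num_input_tokens: int,
    tokens_generated: int,
) -> dict:
    if metrics is None:
        # vLLM v1 path: no per-request metrics, degrade to "N/A" strings.
        return {
            key: "N/A"
            for key in ("queue_time", "generation_time", "tokens_s", "generated_tokens_s")
        }
    # vLLM v0 path: generation time excludes time spent queued.
    generation_time = total_request_time - metrics.time_in_queue
    return {
        "queue_time": f"{metrics.time_in_queue}s",
        "generation_time": f"{generation_time}s",
        "tokens_s": (num_input_tokens + tokens_generated) / generation_time,
        "generated_tokens_s": tokens_generated / generation_time,
    }


print(format_metrics(None, 2.0, 16, 64))
print(format_metrics(RequestMetrics(time_in_queue=0.5), 2.0, 16, 64))

With metrics=None the fields degrade to "N/A" strings, exactly what the f-string log above interpolates; with real metrics the throughput figures divide by generation time rather than total time, so queueing is excluded from the tokens/s numbers.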
