Commit ebc4b15

merge #51490
Signed-off-by: Linkun Chen <github@lkchen.net>
1 parent f743e6b commit ebc4b15

File tree

1 file changed (+61, -9 lines)


python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

+61 -9
@@ -305,12 +305,48 @@ async def start(self):
         logger.info("Started vLLM engine.")
 
     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine the v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, vLLM tries to use the v1 engine. However,
+        #    if any feature specified in the engine config is not supported, it
+        #    falls back to v0. Note that launching vLLM on a non-main thread is
+        #    an experimental feature, so vLLM falls back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, vLLM uses the v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, vLLM is forced to use the v0 engine.
+        if not envs.is_set("VLLM_USE_V1"):
+            raise AssertionError(
+                "Starting from Ray 2.45, the VLLM_USE_V1 environment variable "
+                "must be set to prevent undetermined behavior"
+            )
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine. Note that we only use _get_async_engine_args
+        to get the engine args and don't use _get_vllm_engine_config, because
+        we integrate vLLM v1 using the highest-level async engine API.
+        TODO: Refactor the vLLM v0 integration to use the same async engine API
+        to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient
 
         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
         engine_args, engine_config = _get_vllm_engine_config(self.llm_config)
 
-        if MQLLMEngineClient.is_unsupported_config(engine_args):
+        if MQLLMEngineClient.is_unsupported_config(engine_config):
             # If the engine is not supported, we fall back to the legacy async engine.
             #
             # Note (genesu): as of 2025-02-11, this code path is only triggered when
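Editor's note: the comment block above is the contract this commit enforces — the deployer, not vLLM's fallback logic, decides the engine version. A minimal standalone sketch of the same three-way dispatch, assuming only that VLLM_USE_V1 is read from the process environment as "0"/"1" (the function name here is illustrative, not part of the Ray or vLLM API):

import os


def resolve_engine_version() -> str:
    """Illustrative mirror of _start_engine's dispatch on VLLM_USE_V1."""
    raw = os.environ.get("VLLM_USE_V1")
    if raw is None:
        # Mirrors the new assertion: refuse to let vLLM silently pick
        # (and possibly fall back from) an engine version.
        raise AssertionError(
            "Starting from Ray 2.45, the VLLM_USE_V1 environment variable "
            "must be set to prevent undetermined behavior"
        )
    return "v1" if bool(int(raw)) else "v0"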
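For context on the new _start_engine_v1 path: AsyncLLMEngine.from_engine_args is vLLM's top-level async entry point, which is why this path skips the lower-level engine config entirely. A rough standalone sketch of that API usage, outside Ray (the model name is a placeholder, and constructing AsyncEngineArgs directly is an assumption — in this file the args come from _get_async_engine_args):

from vllm import AsyncEngineArgs, AsyncLLMEngine

# Placeholder model id; any Hugging Face model would do here.
engine_args = AsyncEngineArgs(model="facebook/opt-125m")
engine = AsyncLLMEngine.from_engine_args(engine_args=engine_args)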
@@ -502,20 +538,36 @@ async def _generate(
             )
 
             if request_output is not None:
-                time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
                 total_request_time = time.perf_counter() - start
-                generation_time = (
-                    total_request_time - request_output.metrics.time_in_queue
-                )
+                if request_output.metrics is None:
+                    # vLLM V1 metrics are not included in the request output yet.
+                    queue_time = "N/A"
+                    generation_time_str = "N/A"
+                    tokens_s = "N/A"
+                    generated_tokens_s = "N/A"
+                else:
+                    time_in_queue_histogram.observe(
+                        request_output.metrics.time_in_queue
+                    )
+                    queue_time = f"{request_output.metrics.time_in_queue}s"
+                    generation_time = (
+                        total_request_time - request_output.metrics.time_in_queue
+                    )
+                    generation_time_str = f"{generation_time}s"
+                    tokens_s = (
+                        num_input_tokens + all_tokens_collected
+                    ) / generation_time
+                    generated_tokens_s = all_tokens_collected / generation_time
+
                 logger.info(
                     f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                     f"Total time: {total_request_time}s, "
-                    f"Queue time: {request_output.metrics.time_in_queue}s, "
-                    f"Generation+async time: {generation_time}s, "
+                    f"Queue time: {queue_time}, "
+                    f"Generation+async time: {generation_time_str}, "
                     f"Input tokens: {num_input_tokens}, "
                     f"Generated tokens: {all_tokens_collected}, "
-                    f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                    f"generated tokens/s: {all_tokens_collected / generation_time}."
+                    f"tokens/s: {tokens_s}, "
+                    f"generated tokens/s: {generated_tokens_s}."
                 )
             else:
                 logger.warning(