
Commit 1fa4607

done
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
1 parent dbfa2d1 commit 1fa4607

File tree

1 file changed: +55 additions, -8 deletions


python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 55 additions & 8 deletions
@@ -301,6 +301,37 @@ async def start(self):
         logger.info("Started vLLM engine.")
 
     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, then it tries to use v1 engine. However,
+        #    if any feature specified in the engine config is not supported, then
+        #    it falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM will fall back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, then it will use v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, force using v0 engine.
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine. Note that we only use _get_async_engine_args
+        to get the engine args and don't use _get_vllm_engine_config, because
+        we integrate vLLM v1 using the highest-level async engine API.
+        TODO: Refactor vLLM v0 integration to use the same async engine API
+        to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient
 
         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
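
The dispatch in the hunk above keys off vLLM's VLLM_USE_V1 environment variable, read through vllm.envs, and the v1 path builds the engine through the top-level AsyncLLMEngine.from_engine_args API. Below is a minimal standalone sketch of the same pattern, not part of this commit; the model name, prompt, and sampling parameters are illustrative, and exact behavior depends on the installed vLLM version.

    # Illustrative sketch, not part of this commit. Forces an engine version via
    # VLLM_USE_V1 and builds an async engine the way _start_engine_v1 does.
    import asyncio
    import os

    os.environ["VLLM_USE_V1"] = "1"  # "0" would force the v0 engine instead

    from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


    async def main() -> None:
        # In the Ray code, _get_async_engine_args derives these from llm_config;
        # here they are constructed directly with an illustrative model.
        engine_args = AsyncEngineArgs(model="facebook/opt-125m")
        engine = AsyncLLMEngine.from_engine_args(engine_args=engine_args)

        # Stream a single request; the last RequestOutput carries the full text.
        final_output = None
        async for output in engine.generate(
            "Hello, my name is",
            SamplingParams(max_tokens=16),
            request_id="example-0",
        ):
            final_output = output
        print(final_output.outputs[0].text)


    asyncio.run(main())

Setting VLLM_USE_V1 explicitly avoids the silent fallback to v0 that the comment block above describes for unsupported features.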
@@ -498,20 +529,36 @@ async def _generate(
             )
 
             if request_output is not None:
-                time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
                 total_request_time = time.perf_counter() - start
-                generation_time = (
-                    total_request_time - request_output.metrics.time_in_queue
-                )
+                if request_output.metrics is None:
+                    # vLLM V1 metrics are not included in the request output yet.
+                    queue_time = "N/A"
+                    generation_time_str = "N/A"
+                    tokens_s = "N/A"
+                    generated_tokens_s = "N/A"
+                else:
+                    time_in_queue_histogram.observe(
+                        request_output.metrics.time_in_queue
+                    )
+                    queue_time = f"{request_output.metrics.time_in_queue}s"
+                    generation_time = (
+                        total_request_time - request_output.metrics.time_in_queue
+                    )
+                    generation_time_str = f"{generation_time}s"
+                    tokens_s = (
+                        num_input_tokens + all_tokens_collected
+                    ) / generation_time
+                    generated_tokens_s = all_tokens_collected / generation_time
+
                 logger.info(
                     f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                     f"Total time: {total_request_time}s, "
-                    f"Queue time: {request_output.metrics.time_in_queue}s, "
-                    f"Generation+async time: {generation_time}s, "
+                    f"Queue time: {queue_time}, "
+                    f"Generation+async time: {generation_time_str}, "
                     f"Input tokens: {num_input_tokens}, "
                     f"Generated tokens: {all_tokens_collected}, "
-                    f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                    f"generated tokens/s: {all_tokens_collected / generation_time}."
+                    f"tokens/s: {tokens_s}, "
+                    f"generated tokens/s: {generated_tokens_s}."
                 )
             else:
                 logger.warning(
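
The guarded handling above exists because the v1 engine does not yet attach per-request metrics to the request output, so request_output.metrics can be None. As a rough illustration with made-up numbers, not taken from the commit, this is what the throughput fields in the log line work out to when metrics are present, and how they degrade to "N/A" when they are not:

    # Illustrative numbers only; mirrors the fallback logic in the hunk above.
    total_request_time = 2.5    # seconds of wall-clock time for the whole request
    time_in_queue = 0.5         # seconds reported by request_output.metrics (v0 only)
    num_input_tokens = 100
    all_tokens_collected = 200  # tokens generated for the request

    metrics_available = True    # False models the vLLM v1 case (metrics is None)
    if not metrics_available:
        queue_time = generation_time_str = tokens_s = generated_tokens_s = "N/A"
    else:
        generation_time = total_request_time - time_in_queue                    # 2.0 s
        queue_time = f"{time_in_queue}s"
        generation_time_str = f"{generation_time}s"
        tokens_s = (num_input_tokens + all_tokens_collected) / generation_time  # 150.0
        generated_tokens_s = all_tokens_collected / generation_time             # 100.0

    print(
        f"Queue time: {queue_time}, "
        f"Generation+async time: {generation_time_str}, "
        f"tokens/s: {tokens_s}, "
        f"generated tokens/s: {generated_tokens_s}."
    )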
