@@ -305,12 +305,48 @@ async def start(self):
305
305
logger .info ("Started vLLM engine." )
306
306
307
307
async def _start_engine(self) -> "EngineClient":
    """Start the vLLM engine, dispatching to the v0 or v1 implementation.

    Since vLLM 0.8.0, the logic to determine the v0/v1 engine is:
    1. If VLLM_USE_V1 is not set, vLLM tries to use the v1 engine, but
       falls back to v0 if any feature specified in the engine config is
       not supported. Launching vLLM on a non-main thread is an
       experimental feature, so vLLM falls back to v0 in that case.
    2. If VLLM_USE_V1 is set to 1, the v1 engine is used even with
       experimental features (such as launching on a non-main thread).
    3. If VLLM_USE_V1 is set to 0, the v0 engine is forced.

    Returns:
        The started vLLM engine client.

    Raises:
        AssertionError: If the VLLM_USE_V1 environment variable is not
            set, since vLLM's implicit v1->v0 fallback would otherwise
            make the selected engine undetermined.
    """
    from vllm import envs

    # Require an explicit engine choice so vLLM's silent fallback logic
    # (see docstring) cannot change behavior between deployments.
    if not envs.is_set("VLLM_USE_V1"):
        raise AssertionError(
            "Starting ray 2.45, VLLM_USE_V1 environment variable must be set "
            "to prevent undetermined behavior"
        )
    if not envs.VLLM_USE_V1:
        return await self._start_engine_v0()
    return await self._start_engine_v1()
326
+
327
async def _start_engine_v1(self) -> "EngineClient":
    """Start the vLLM v1 engine.

    Note that we only use _get_async_engine_args to get the engine args
    and don't use _get_vllm_engine_config, because we integrate vLLM v1
    using the highest-level async engine API.
    TODO: Refactor vLLM v0 integration to use the same async engine API
    to simplify the code.
    """
    from vllm import AsyncLLMEngine

    # Prepare the node (downloads, placement, env setup) before the
    # engine itself is constructed.
    await self.initialize_node(self.llm_config)

    return AsyncLLMEngine.from_engine_args(
        engine_args=_get_async_engine_args(self.llm_config),
    )
342
+
343
+ async def _start_engine_v0 (self ) -> "EngineClient" :
308
344
from vllm .engine .multiprocessing .client import MQLLMEngineClient
309
345
310
346
args : InitializeNodeOutput = await self .initialize_node (self .llm_config )
311
347
engine_args , engine_config = _get_vllm_engine_config (self .llm_config )
312
348
313
- if MQLLMEngineClient .is_unsupported_config (engine_args ):
349
+ if MQLLMEngineClient .is_unsupported_config (engine_config ):
314
350
# If the engine is not supported, we fall back to the legacy async engine.
315
351
#
316
352
# Note (genesu): as of 2025-02-11, this code path is only triggered when
@@ -502,20 +538,36 @@ async def _generate(
502
538
)
503
539
504
540
if request_output is not None :
505
- time_in_queue_histogram .observe (request_output .metrics .time_in_queue )
506
541
total_request_time = time .perf_counter () - start
507
- generation_time = (
508
- total_request_time - request_output .metrics .time_in_queue
509
- )
542
+ if request_output .metrics is None :
543
+ # vLLM V1 metrics are not included in the request output yet.
544
+ queue_time = "N/A"
545
+ generation_time_str = "N/A"
546
+ tokens_s = "N/A"
547
+ generated_tokens_s = "N/A"
548
+ else :
549
+ time_in_queue_histogram .observe (
550
+ request_output .metrics .time_in_queue
551
+ )
552
+ queue_time = f"{ request_output .metrics .time_in_queue } s"
553
+ generation_time = (
554
+ total_request_time - request_output .metrics .time_in_queue
555
+ )
556
+ generation_time_str = f"{ generation_time } s"
557
+ tokens_s = (
558
+ num_input_tokens + all_tokens_collected
559
+ ) / generation_time
560
+ generated_tokens_s = all_tokens_collected / generation_time
561
+
510
562
logger .info (
511
563
f"Request { vllm_generation_request .request_id } finished ({ finish_reason } ). "
512
564
f"Total time: { total_request_time } s, "
513
- f"Queue time: { request_output . metrics . time_in_queue } s , "
514
- f"Generation+async time: { generation_time } s , "
565
+ f"Queue time: { queue_time } , "
566
+ f"Generation+async time: { generation_time_str } , "
515
567
f"Input tokens: { num_input_tokens } , "
516
568
f"Generated tokens: { all_tokens_collected } , "
517
- f"tokens/s: { ( num_input_tokens + all_tokens_collected ) / generation_time } , "
518
- f"generated tokens/s: { all_tokens_collected / generation_time } ."
569
+ f"tokens/s: { tokens_s } , "
570
+ f"generated tokens/s: { generated_tokens_s } ."
519
571
)
520
572
else :
521
573
logger .warning (
0 commit comments