@@ -305,12 +305,43 @@ async def start(self):
305
305
logger .info ("Started vLLM engine." )
306
306
307
307
async def _start_engine(self) -> "EngineClient":
    """Start the vLLM engine, dispatching to the v0 or v1 code path.

    Since vLLM 0.8.0, the v0/v1 engine selection works as follows:

    1. If ``VLLM_USE_V1`` is unset, vLLM tries the v1 engine, but falls
       back to v0 if any feature in the engine config is unsupported.
       Launching vLLM on a non-main thread is an experimental feature,
       so vLLM falls back to v0 in that case.
    2. If ``VLLM_USE_V1=1``, the v1 engine is used even with experimental
       features (such as launching vLLM on a non-main thread).
    3. If ``VLLM_USE_V1=0``, the v0 engine is forced.

    Returns:
        The started vLLM ``EngineClient``.
    """
    from vllm import envs

    # envs.VLLM_USE_V1 reflects the resolved engine choice described above;
    # branch on it to pick the matching startup routine.
    if envs.VLLM_USE_V1:
        return await self._start_engine_v1()
    return await self._start_engine_v0()
322
async def _start_engine_v1(self) -> "EngineClient":
    """Start the vLLM v1 engine.

    Note that we only use ``_get_async_engine_args`` to get the engine
    args and don't use ``_get_vllm_engine_config``, because we integrate
    vLLM v1 using the highest-level async engine API.
    TODO: Refactor the vLLM v0 integration to use the same async engine
    API to simplify the code.

    Returns:
        The started vLLM ``EngineClient``.
    """
    from vllm import AsyncLLMEngine

    # Node-level setup (placement, env, downloads) must complete before
    # the engine process is created.
    await self.initialize_node(self.llm_config)

    args = _get_async_engine_args(self.llm_config)
    engine_client = AsyncLLMEngine.from_engine_args(engine_args=args)
    return engine_client
338
+ async def _start_engine_v0 (self ) -> "EngineClient" :
308
339
from vllm .engine .multiprocessing .client import MQLLMEngineClient
309
340
310
341
args : InitializeNodeOutput = await self .initialize_node (self .llm_config )
311
342
engine_args , engine_config = _get_vllm_engine_config (self .llm_config )
312
343
313
- if MQLLMEngineClient .is_unsupported_config (engine_args ):
344
+ if MQLLMEngineClient .is_unsupported_config (engine_config ):
314
345
# If the engine is not supported, we fall back to the legacy async engine.
315
346
#
316
347
# Note (genesu): as of 2025-02-11, this code path is only triggered when
@@ -502,20 +533,36 @@ async def _generate(
502
533
)
503
534
504
535
if request_output is not None :
505
- time_in_queue_histogram .observe (request_output .metrics .time_in_queue )
506
536
total_request_time = time .perf_counter () - start
507
- generation_time = (
508
- total_request_time - request_output .metrics .time_in_queue
509
- )
537
+ if request_output .metrics is None :
538
+ # vLLM V1 metrics are not included in the request output yet.
539
+ queue_time = "N/A"
540
+ generation_time_str = "N/A"
541
+ tokens_s = "N/A"
542
+ generated_tokens_s = "N/A"
543
+ else :
544
+ time_in_queue_histogram .observe (
545
+ request_output .metrics .time_in_queue
546
+ )
547
+ queue_time = f"{ request_output .metrics .time_in_queue } s"
548
+ generation_time = (
549
+ total_request_time - request_output .metrics .time_in_queue
550
+ )
551
+ generation_time_str = f"{ generation_time } s"
552
+ tokens_s = (
553
+ num_input_tokens + all_tokens_collected
554
+ ) / generation_time
555
+ generated_tokens_s = all_tokens_collected / generation_time
556
+
510
557
logger .info (
511
558
f"Request { vllm_generation_request .request_id } finished ({ finish_reason } ). "
512
559
f"Total time: { total_request_time } s, "
513
- f"Queue time: { request_output . metrics . time_in_queue } s , "
514
- f"Generation+async time: { generation_time } s , "
560
+ f"Queue time: { queue_time } , "
561
+ f"Generation+async time: { generation_time_str } , "
515
562
f"Input tokens: { num_input_tokens } , "
516
563
f"Generated tokens: { all_tokens_collected } , "
517
- f"tokens/s: { ( num_input_tokens + all_tokens_collected ) / generation_time } , "
518
- f"generated tokens/s: { all_tokens_collected / generation_time } ."
564
+ f"tokens/s: { tokens_s } , "
565
+ f"generated tokens/s: { generated_tokens_s } ."
519
566
)
520
567
else :
521
568
logger .warning (
0 commit comments