@@ -301,6 +301,37 @@ async def start(self):
         logger.info("Started vLLM engine.")
 
     async def _start_engine(self) -> "EngineClient":
+        from vllm import envs
+
+        # Since vLLM 0.8.0, the logic to determine v0/v1 engine is as follows:
+        # 1. If VLLM_USE_V1 is not set, then it tries to use the v1 engine. However,
+        #    if any feature specified in the engine config is not supported, then
+        #    it falls back to v0. Note that launching vLLM on a non-main thread
+        #    is an experimental feature, so vLLM will fall back to v0 in this case.
+        # 2. If VLLM_USE_V1 is set to 1, then it will use the v1 engine even with
+        #    experimental features (such as launching vLLM on a non-main thread).
+        # 3. If VLLM_USE_V1 is set to 0, force the v0 engine.
+        if not envs.VLLM_USE_V1:
+            return await self._start_engine_v0()
+        return await self._start_engine_v1()
+
+    async def _start_engine_v1(self) -> "EngineClient":
+        """Start the vLLM v1 engine. Note that we only use _get_async_engine_args
+        to get the engine args and don't use _get_vllm_engine_config, because
+        we integrate vLLM v1 using the highest-level async engine API.
+        TODO: Refactor the vLLM v0 integration to use the same async engine API
+        to simplify the code.
+        """
+        from vllm import AsyncLLMEngine
+
+        await self.initialize_node(self.llm_config)
+        engine_args = _get_async_engine_args(self.llm_config)
+
+        return AsyncLLMEngine.from_engine_args(
+            engine_args=engine_args,
+        )
+
+    async def _start_engine_v0(self) -> "EngineClient":
         from vllm.engine.multiprocessing.client import MQLLMEngineClient
 
         args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
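
For context on the v1 path added above, here is a minimal standalone sketch (not part of this diff) that forces the v1 engine via VLLM_USE_V1 and starts it through the same high-level AsyncLLMEngine API that _start_engine_v1 uses. The model name, prompt, and request id are placeholders; the Ray integration instead derives its engine args from llm_config via the internal _get_async_engine_args helper.

# Hedged sketch, assuming vLLM >= 0.8.0; not the PR's code.
import asyncio
import os

os.environ["VLLM_USE_V1"] = "1"  # "0" would force the v0 engine instead

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


async def main() -> None:
    # Placeholder model; the PR builds these args from llm_config instead.
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")
    engine = AsyncLLMEngine.from_engine_args(engine_args=engine_args)

    # Stream RequestOutput objects until the request finishes.
    final = None
    async for request_output in engine.generate(
        "Hello, my name is",
        SamplingParams(max_tokens=16),
        request_id="example-0",
    ):
        final = request_output
    print(final.outputs[0].text)


if __name__ == "__main__":
    asyncio.run(main())
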
@@ -498,20 +529,36 @@ async def _generate(
             )
 
             if request_output is not None:
-                time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
                 total_request_time = time.perf_counter() - start
-                generation_time = (
-                    total_request_time - request_output.metrics.time_in_queue
-                )
+                if request_output.metrics is None:
+                    # vLLM V1 metrics are not included in the request output yet.
+                    queue_time = "N/A"
+                    generation_time_str = "N/A"
+                    tokens_s = "N/A"
+                    generated_tokens_s = "N/A"
+                else:
+                    time_in_queue_histogram.observe(
+                        request_output.metrics.time_in_queue
+                    )
+                    queue_time = f"{request_output.metrics.time_in_queue}s"
+                    generation_time = (
+                        total_request_time - request_output.metrics.time_in_queue
+                    )
+                    generation_time_str = f"{generation_time}s"
+                    tokens_s = (
+                        num_input_tokens + all_tokens_collected
+                    ) / generation_time
+                    generated_tokens_s = all_tokens_collected / generation_time
+
                 logger.info(
                     f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
                     f"Total time: {total_request_time}s, "
-                    f"Queue time: {request_output.metrics.time_in_queue}s, "
-                    f"Generation+async time: {generation_time}s, "
+                    f"Queue time: {queue_time}, "
+                    f"Generation+async time: {generation_time_str}, "
                     f"Input tokens: {num_input_tokens}, "
                     f"Generated tokens: {all_tokens_collected}, "
-                    f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
-                    f"generated tokens/s: {all_tokens_collected / generation_time}."
+                    f"tokens/s: {tokens_s}, "
+                    f"generated tokens/s: {generated_tokens_s}."
                 )
             else:
                 logger.warning(
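
A small self-contained sketch of the guard introduced in the hunk above: per-request metrics are treated as optional because vLLM v1 does not attach them to the request output yet, so the queue/throughput figures fall back to "N/A" instead of raising on a None metrics object. The _Metrics stand-in and the throughput_fields helper below are illustrative, not the PR's types or functions.

# Hedged sketch, not from this PR: derive the log fields only when
# per-request metrics are present, mirroring the None-guard above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Metrics:
    # Illustrative stand-in for the metrics object on vLLM's RequestOutput.
    time_in_queue: float


def throughput_fields(
    metrics: Optional[_Metrics],
    total_request_time: float,
    num_input_tokens: int,
    num_generated_tokens: int,
) -> dict:
    if metrics is None:
        # vLLM v1: no metrics yet, so report placeholders instead of
        # failing on metrics.time_in_queue.
        return {
            "queue_time": "N/A",
            "generation_time": "N/A",
            "tokens_s": "N/A",
            "generated_tokens_s": "N/A",
        }
    generation_time = total_request_time - metrics.time_in_queue
    return {
        "queue_time": f"{metrics.time_in_queue}s",
        "generation_time": f"{generation_time}s",
        "tokens_s": (num_input_tokens + num_generated_tokens) / generation_time,
        "generated_tokens_s": num_generated_tokens / generation_time,
    }


print(throughput_fields(None, 1.2, 32, 64))                         # v1 today
print(throughput_fields(_Metrics(time_in_queue=0.2), 1.2, 32, 64))  # v0
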