Skip to content

Commit 4b97d68

Browse files
committed
bump vLLM to 0.8.2
Signed-off-by: Linkun Chen <github@lkchen.net>
1 parent 75941e7 commit 4b97d68

26 files changed

+1476
-738
lines changed

doc/source/serve/llm/serving-llms.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Requirements
1818

1919
.. code-block:: bash
2020
21-
pip install ray[serve,llm]>=2.43.0 vllm>=0.7.2
21+
pip install ray[serve,llm]>=2.43.0 vllm>=0.8.2
2222
2323
# Suggested dependencies when using vllm 0.7.2:
2424
pip install xgrammar==0.1.11 pynvml==12.0.0

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -305,12 +305,49 @@ async def start(self):
305305
logger.info("Started vLLM engine.")
306306

307307
async def _start_engine(self) -> "EngineClient":
    """Start the vLLM engine selected by the ``VLLM_USE_V1`` environment variable.

    Since vLLM 0.8.0 the v0/v1 engine is chosen as follows:

    1. ``VLLM_USE_V1`` unset: vLLM tries the v1 engine, but falls back to
       v0 if any feature in the engine config is unsupported. Launching
       vLLM on a non-main thread is an experimental feature, so vLLM
       falls back to v0 in that case.
    2. ``VLLM_USE_V1=1``: the v1 engine is used even with experimental
       features (such as launching vLLM on a non-main thread).
    3. ``VLLM_USE_V1=0``: the v0 engine is forced.

    Ray Serve LLM forbids case 1 because it has to know exactly which
    engine is in use.

    Returns:
        Whatever ``_start_engine_v1`` or ``_start_engine_v0`` returns.

    Raises:
        AssertionError: If ``VLLM_USE_V1`` is not set.
    """
    from vllm import envs

    if envs.is_set("VLLM_USE_V1"):
        # The variable is explicit, so the choice of engine is deterministic.
        if envs.VLLM_USE_V1:
            return await self._start_engine_v1()
        return await self._start_engine_v0()

    # Unset variable: vLLM would pick an engine on its own, so refuse to start.
    raise AssertionError(
        "Starting from Ray 2.45, VLLM_USE_V1 environment variable must be "
        "set to prevent undetermined behavior"
    )
327+
328+
async def _start_engine_v1(self) -> "EngineClient":
    """Start the vLLM v1 engine through the highest-level async engine API.

    Only ``_get_async_engine_args`` is used to build the engine arguments;
    ``_get_vllm_engine_config`` is intentionally not called on this path.

    TODO: Refactor the vLLM v0 integration to use the same async engine API
    to simplify the code.

    Returns:
        The engine created by ``AsyncLLMEngine.from_engine_args``.
    """
    from vllm import AsyncLLMEngine

    # Node initialization is awaited before the engine args are derived;
    # its result is not used on the v1 path.
    await self.initialize_node(self.llm_config)

    async_args = _get_async_engine_args(self.llm_config)
    return AsyncLLMEngine.from_engine_args(engine_args=async_args)
343+
344+
async def _start_engine_v0(self) -> "EngineClient":
308345
from vllm.engine.multiprocessing.client import MQLLMEngineClient
309346

310347
args: InitializeNodeOutput = await self.initialize_node(self.llm_config)
311348
engine_args, engine_config = _get_vllm_engine_config(self.llm_config)
312349

313-
if MQLLMEngineClient.is_unsupported_config(engine_args):
350+
if MQLLMEngineClient.is_unsupported_config(engine_config):
314351
# If the engine is not supported, we fall back to the legacy async engine.
315352
#
316353
# Note (genesu): as of 2025-02-11, this code path is only triggered when
@@ -502,20 +539,36 @@ async def _generate(
502539
)
503540

504541
if request_output is not None:
505-
time_in_queue_histogram.observe(request_output.metrics.time_in_queue)
506542
total_request_time = time.perf_counter() - start
507-
generation_time = (
508-
total_request_time - request_output.metrics.time_in_queue
509-
)
543+
if request_output.metrics is None:
544+
# vLLM V1 metrics are not included in the request output yet.
545+
queue_time = "N/A"
546+
generation_time_str = "N/A"
547+
tokens_s = "N/A"
548+
generated_tokens_s = "N/A"
549+
else:
550+
time_in_queue_histogram.observe(
551+
request_output.metrics.time_in_queue
552+
)
553+
queue_time = f"{request_output.metrics.time_in_queue}s"
554+
generation_time = (
555+
total_request_time - request_output.metrics.time_in_queue
556+
)
557+
generation_time_str = f"{generation_time}s"
558+
tokens_s = (
559+
num_input_tokens + all_tokens_collected
560+
) / generation_time
561+
generated_tokens_s = all_tokens_collected / generation_time
562+
510563
logger.info(
511564
f"Request {vllm_generation_request.request_id} finished ({finish_reason}). "
512565
f"Total time: {total_request_time}s, "
513-
f"Queue time: {request_output.metrics.time_in_queue}s, "
514-
f"Generation+async time: {generation_time}s, "
566+
f"Queue time: {queue_time}, "
567+
f"Generation+async time: {generation_time_str}, "
515568
f"Input tokens: {num_input_tokens}, "
516569
f"Generated tokens: {all_tokens_collected}, "
517-
f"tokens/s: {(num_input_tokens + all_tokens_collected) / generation_time}, "
518-
f"generated tokens/s: {all_tokens_collected / generation_time}."
570+
f"tokens/s: {tokens_s}, "
571+
f"generated tokens/s: {generated_tokens_s}."
519572
)
520573
else:
521574
logger.warning(

python/requirements/llm/llm-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Keep this in sync with the definition in setup.py for ray[llm]
2-
vllm>=0.7.2
2+
vllm>=0.8.2
33
# For json mode
44
jsonref>=1.1.0
55
jsonschema

python/requirements/test-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ boto3==1.26.76
1515
cloudpickle==2.2.0
1616
cryptography==42.0.5
1717
cython==0.29.37
18-
fastapi==0.109.2
18+
fastapi>=0.115.0
1919
feather-format==0.4.1
2020
# Keep compatible with Werkzeug
2121
flask==2.1.3

python/requirements_compiled.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ fairscale==0.4.6
485485
# via -r python/requirements/ml/tune-test-requirements.txt
486486
farama-notifications==0.0.4
487487
# via gymnasium
488-
fastapi==0.109.2
488+
fastapi==0.115.0
489489
# via
490490
# -r python/requirements.txt
491491
# -r python/requirements/test-requirements.txt
@@ -2140,7 +2140,7 @@ stack-data==0.6.3
21402140
# via ipython
21412141
stanio==0.3.0
21422142
# via cmdstanpy
2143-
starlette==0.36.3
2143+
starlette==0.37.2
21442144
# via
21452145
# -r python/requirements.txt
21462146
# fastapi

python/requirements_compiled_ray_py311_cpu.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cpu.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cpu.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cpu.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cpu.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_py311_cu121.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cu121.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cu121.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cu121.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cu121.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_py311_cu124.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,9 @@ farama-notifications==0.0.4 \
439439
# via
440440
# -c python/requirements_compiled_ray_test_py311_cu124.txt
441441
# gymnasium
442-
fastapi==0.109.2 \
443-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
444-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
442+
fastapi==0.115.0 \
443+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
444+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
445445
# via
446446
# -c python/requirements_compiled_ray_test_py311_cu124.txt
447447
# -r python/requirements.txt
@@ -1889,9 +1889,9 @@ sniffio==1.3.1 \
18891889
# via
18901890
# -c python/requirements_compiled_ray_test_py311_cu124.txt
18911891
# anyio
1892-
starlette==0.36.3 \
1893-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
1894-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
1892+
starlette==0.37.2 \
1893+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
1894+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
18951895
# via
18961896
# -c python/requirements_compiled_ray_test_py311_cu124.txt
18971897
# -r python/requirements.txt

python/requirements_compiled_ray_test_py311_cpu.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -601,9 +601,9 @@ farama-notifications==0.0.4 \
601601
# via
602602
# -c /tmp/ray-deps/requirements_compiled.txt
603603
# gymnasium
604-
fastapi==0.109.2 \
605-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
606-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
604+
fastapi==0.115.0 \
605+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
606+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
607607
# via
608608
# -c /tmp/ray-deps/requirements_compiled.txt
609609
# -r python/requirements.txt
@@ -2866,9 +2866,9 @@ stack-data==0.6.3 \
28662866
# via
28672867
# -c /tmp/ray-deps/requirements_compiled.txt
28682868
# ipython
2869-
starlette==0.36.3 \
2870-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
2871-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
2869+
starlette==0.37.2 \
2870+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
2871+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
28722872
# via
28732873
# -c /tmp/ray-deps/requirements_compiled.txt
28742874
# -r python/requirements.txt

python/requirements_compiled_ray_test_py311_cu121.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -601,9 +601,9 @@ farama-notifications==0.0.4 \
601601
# via
602602
# -c /tmp/ray-deps/requirements_compiled.txt
603603
# gymnasium
604-
fastapi==0.109.2 \
605-
--hash=sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d \
606-
--hash=sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73
604+
fastapi==0.115.0 \
605+
--hash=sha256:17ea427674467486e997206a5ab25760f6b09e069f099b96f5b55a32fb6f1631 \
606+
--hash=sha256:f93b4ca3529a8ebc6fc3fcf710e5efa8de3df9b41570958abf1d97d843138004
607607
# via
608608
# -c /tmp/ray-deps/requirements_compiled.txt
609609
# -r python/requirements.txt
@@ -2866,9 +2866,9 @@ stack-data==0.6.3 \
28662866
# via
28672867
# -c /tmp/ray-deps/requirements_compiled.txt
28682868
# ipython
2869-
starlette==0.36.3 \
2870-
--hash=sha256:13d429aa93a61dc40bf503e8c801db1f1bca3dc706b10ef2434a36123568f044 \
2871-
--hash=sha256:90a671733cfb35771d8cc605e0b679d23b992f8dcfad48cc60b38cb29aeb7080
2869+
starlette==0.37.2 \
2870+
--hash=sha256:6fe59f29268538e5d0d182f2791a479a0c64638e6935d1c6989e63fb2699c6ee \
2871+
--hash=sha256:9af890290133b79fc3db55474ade20f6220a364a0402e0b556e7cd5e1e093823
28722872
# via
28732873
# -c /tmp/ray-deps/requirements_compiled.txt
28742874
# -r python/requirements.txt

0 commit comments

Comments
 (0)