Skip to content

Commit 12575cf

Browse files
[Bugfix] fix RAY_CGRAPH_get_timeout is not set successfully (#19725)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
1 parent 8b6e1d6 commit 12575cf

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

vllm/executor/ray_distributed_executor.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -557,8 +557,17 @@ def _check_ray_cgraph_installation(self):
557557
def _compiled_ray_dag(self, enable_asyncio: bool):
558558
assert self.parallel_config.use_ray
559559
self._check_ray_cgraph_installation()
560+
# Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
561+
# (it is 10 seconds by default). This is a Ray environment variable to
562+
# control the timeout of getting result from a compiled graph execution,
563+
# i.e., the distributed execution that includes model forward runs and
564+
# intermediate tensor communications, in the case of vllm.
565+
# Note: we should set this env var before importing
566+
# ray.dag, otherwise it will not take effect.
567+
os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112
560568
from ray.dag import InputNode, MultiOutputNode
561-
569+
logger.info("RAY_CGRAPH_get_timeout is set to %s",
570+
os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112
562571
logger.info("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE = %s",
563572
envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE)
564573
logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
@@ -570,15 +579,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
570579
"Invalid value for VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: "
571580
f"{channel_type}. Valid values are: 'auto', 'nccl', or 'shm'.")
572581

573-
# Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
574-
# (it is 10 seconds by default). This is a Ray environment variable to
575-
# control the timeout of getting result from a compiled graph execution,
576-
# i.e., the distributed execution that includes model forward runs and
577-
# intermediate tensor communications, in the case of vllm.
578-
os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112
579-
logger.info("RAY_CGRAPH_get_timeout is set to %s",
580-
os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112
581-
582582
with InputNode() as input_data:
583583
# Example DAG: PP=2, TP=4
584584
#

0 commit comments

Comments
 (0)