Skip to content

Commit 977180c

Browse files
authored
[Docs] Improve documentation for multi-node service helper script (#20600)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
1 parent c40784c commit 977180c

File tree

1 file changed

+32
-7
lines changed

1 file changed

+32
-7
lines changed

examples/online_serving/multi-node-serving.sh

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,35 @@
11
#!/bin/bash
2+
#
3+
# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
4+
# This script is first executed on the head node, and then on each worker node with the IP address
5+
# of the head node.
6+
#
7+
# Subcommands:
8+
# leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
9+
# worker: Starts a worker node that connects to an existing Ray head node.
10+
#
11+
# Example usage:
12+
# On the head node machine, start the Ray head node process and run a vLLM server.
13+
# ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
14+
# python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2
15+
#
16+
# On each worker node, start the Ray worker node process.
17+
# ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
18+
#
19+
# About Ray:
20+
# Ray is an open-source distributed execution framework that simplifies
21+
# distributed computing. Learn more:
22+
# https://ray.io/
223

3-
subcommand=$1
4-
shift
524

6-
ray_port=6379
7-
ray_init_timeout=300
8-
declare -a start_params
25+
subcommand=$1 # Either "leader" or "worker".
26+
shift # Remove the subcommand from the argument list.
927

28+
ray_port=6379 # Port used by the Ray head node.
29+
ray_init_timeout=300 # Seconds to wait before timing out.
30+
declare -a start_params # Parameters forwarded to the underlying 'ray start' command.
31+
32+
# Dispatch on the subcommand; the worker branch is handled first.
1033
case "$subcommand" in
1134
worker)
1235
ray_address=""
@@ -32,6 +55,7 @@ case "$subcommand" in
3255
exit 1
3356
fi
3457

58+
# Retry until the worker node connects to the head node or the timeout expires.
3559
for (( i=0; i < $ray_init_timeout; i+=5 )); do
3660
ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
3761
if [ $? -eq 0 ]; then
@@ -45,6 +69,7 @@ case "$subcommand" in
4569
exit 1
4670
;;
4771

72+
# Handle the leader subcommand.
4873
leader)
4974
ray_cluster_size=""
5075
while [ $# -gt 0 ]; do
@@ -69,10 +94,10 @@ case "$subcommand" in
6994
exit 1
7095
fi
7196

72-
# start the ray daemon
97+
# Start the Ray head node.
7398
ray start --head --port=$ray_port "${start_params[@]}"
7499

75-
# wait until all workers are active
100+
# Poll Ray until the number of alive nodes (head included) equals the expected cluster size.
76101
for (( i=0; i < $ray_init_timeout; i+=5 )); do
77102
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
78103
if [ $active_nodes -eq $ray_cluster_size ]; then

0 commit comments

Comments
 (0)