1
1
#! /bin/bash
2
+ #
3
+ # Helper script to manually start or join a Ray cluster for online serving of vLLM models.
4
+ # This script is first executed on the head node, and then on each worker node with the IP address
5
+ # of the head node.
6
+ #
7
+ # Subcommands:
8
+ # leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
9
+ # worker: Starts a worker node that connects to an existing Ray head node.
10
+ #
11
+ # Example usage:
12
+ # On the head node machine, start the Ray head node process and run a vLLM server.
13
+ # ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
14
+ # python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2
15
+ #
16
+ # On each worker node, start the Ray worker node process.
17
+ # ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
18
+ #
19
+ # About Ray:
20
+ # Ray is an open-source distributed execution framework that simplifies
21
+ # distributed computing. Learn more:
22
+ # https://ray.io/
2
23
3
- subcommand=$1
4
- shift
5
24
6
- ray_port=6379
7
- ray_init_timeout=300
8
- declare -a start_params
25
+ subcommand=$1 # Either "leader" or "worker".
26
+ shift # Remove the subcommand from the argument list.
9
27
28
+ ray_port=6379 # Port used by the Ray head node.
29
+ ray_init_timeout=300 # Seconds to wait before timing out.
30
+ declare -a start_params # Parameters forwarded to the underlying 'ray start' command.
31
+
32
+ # Dispatch on the subcommand; the first branch handles the worker subcommand.
10
33
case " $subcommand " in
11
34
worker)
12
35
ray_address=" "
@@ -32,6 +55,7 @@ case "$subcommand" in
32
55
exit 1
33
56
fi
34
57
58
+ # Retry until the worker node connects to the head node or the timeout expires.
35
59
for (( i= 0 ; i < $ray_init_timeout ; i+= 5 )) ; do
36
60
ray start --address=$ray_address :$ray_port --block " ${start_params[@]} "
37
61
if [ $? -eq 0 ]; then
@@ -45,6 +69,7 @@ case "$subcommand" in
45
69
exit 1
46
70
;;
47
71
72
+ # Handle the leader subcommand.
48
73
leader)
49
74
ray_cluster_size=" "
50
75
while [ $# -gt 0 ]; do
@@ -69,10 +94,10 @@ case "$subcommand" in
69
94
exit 1
70
95
fi
71
96
72
- # start the ray daemon
97
+ # Start the Ray head node.
73
98
ray start --head --port=$ray_port " ${start_params[@]} "
74
99
75
- # wait until all workers are active
100
+ # Poll Ray until the number of alive nodes (head + workers) equals the expected cluster size.
76
101
for (( i= 0 ; i < $ray_init_timeout ; i+= 5 )) ; do
77
102
active_nodes=` python3 -c ' import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))' `
78
103
if [ $active_nodes -eq $ray_cluster_size ]; then
0 commit comments