# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
- a simple demonstration to show how to co-locate
- vLLM worker with training actors on the same GPUs,
- for RLHF-like applications.
- The key points:
- - Control the placement of the vLLM workers with Ray, by setting
-   VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
- - Use cuda-ipc to pass tensors, since NCCL does not work when we have
-   multiple processes on the same GPU.
+ Demonstrates how to co-locate a vLLM inference worker and training
+ actors on the same set of GPUs for reinforcement learning from human feedback
+ (RLHF) workloads.
+
+ Ray serves as the distributed execution framework in this example. Ray
+ placement groups allocate both training actors and vLLM workers to the
+ same GPU bundles, enabling fast, in-GPU communication between the two
+ components.
+
+ The script shows how to do the following:
+
+ * Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
+   `VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
+   devices.
+ * Exchange tensors between processes by means of CUDA inter-process
+   communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
+   when multiple processes share a single GPU.
+
+ Note that this example assumes a single-node cluster with four GPUs, but Ray
+ supports multi-node clusters. vLLM expects exclusive use of the GPUs during
+ its initialization for memory profiling. Residual GPU activity interferes
+ with vLLM memory profiling and causes unexpected behavior.
+
+ Learn more about Ray placement groups:
+ https://docs.ray.io/en/latest/placement-groups.html
"""

import os


class MyLLM(LLM):
-     def __init__(self, *args, bundle_indices: list, **kwargs):
-         # a hack to make the script work.
-         # stop ray from manipulating CUDA_VISIBLE_DEVICES
-         # at the top-level
+     """Configure the vLLM worker for Ray placement group execution.
+
+     The constructor sets environment variables that allow multiple vLLM
+     workers to share a single physical GPU and that encode the bundle
+     indices assigned by the placement group.
+
+     Args:
+         *args: Positional arguments forwarded to `vllm.LLM`.
+         bundle_indices (list[int]): Placement-group bundle indices
+             assigned to this worker.
+         **kwargs: Keyword arguments forwarded to `vllm.LLM`.
+     """
+
+     def __init__(self, *args, bundle_indices: list[int], **kwargs):
+         # Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES
+         # variable so that vLLM can manage its own device placement inside
+         # the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-         # every worker will use 0.4 GPU, so that we can schedule
-         # 2 instances on the same GPUs.
+         # Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
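        # For example, bundle_indices=[2, 3] yields VLLM_RAY_BUNDLE_INDICES="2,3",
        # pinning this engine's Ray workers to placement-group bundles 2 and 3.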
        print(f"creating LLM with bundle_indices={bundle_indices}")
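        # The environment variables above must be set before calling the parent
        # constructor, because vLLM reads them while creating its Ray workers.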
        super().__init__(*args, **kwargs)


class RayTrainingActor:
+     """Training actor that hosts a Facebook OPT-125M model from Hugging Face.
+
+     The model is loaded onto the first GPU assigned to this actor, and the
+     actor exposes CUDA IPC handles so that co-located vLLM workers can map
+     tensors directly.
+     """
+
    def __init__(self):
-         # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
+         # Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
        from transformers import AutoModelForCausalLM

        self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        self.model.to("cuda:0")
+         # Zero out all the parameters.
        for name, p in self.model.named_parameters():
            p.data.zero_()
        torch.cuda.synchronize()
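        # Zeroing gives the weights a value that clearly differs from the
        # pretrained checkpoint, which makes the final weight-update check at
        # the bottom of the script meaningful.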
-         # the argument for get_device_uuid is the index
-         # of the GPU in the visible devices.
+         # The argument for `get_device_uuid` is the index of the GPU in the
+         # list of visible devices.
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(0)
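        # The device UUID identifies the physical GPU independently of each
        # process's CUDA_VISIBLE_DEVICES mapping, so the script later uses it
        # to match training actors with the vLLM workers sharing the same GPU.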
@@ -59,23 +95,23 @@ def get_weight_ipc_handles(self):

        data = {}
        for name, p in self.model.named_parameters():
-             # the training actor might only have a subset of the weights
-             # and need to all-gather the weights from all the actors.
-             # for demonstration, here we assume all training actors have
-             # the full weights.
+             # A training actor might hold only a subset of the weights and may
+             # need to gather weights from other actors. For demonstration
+             # purposes, each training actor owns the full weight set.
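            # `reduce_tensor` (from torch.multiprocessing.reductions) serializes
            # a CUDA tensor into a rebuild function plus arguments carrying the
            # CUDA IPC handle, letting another process on the same GPU map the
            # memory without copying.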
            data[name] = reduce_tensor(p.detach())
        return {self.device_uuid: data}


- # ray manages 4 GPUs
+ # Ray manages four GPUs.
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

- # we want to co-locate vLLM instance and the training actor
- # on the same set of GPUs.
- # the placement plan is as follows:
- # GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
- # GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
+ # Co-locate vLLM instances and training actors on the same set of GPUs:
+ # * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
+ #   (tensor parallelism = 2).
+ # * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
+ #   (tensor parallelism = 2).

pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
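# Each of the four bundles reserves exactly one GPU and no CPUs. Pinning
# actors and vLLM workers to specific bundles is what keeps them on the same
# physical devices.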
ray.get(pg.ready())
@@ -104,10 +140,8 @@ def get_weight_ipc_handles(self):
    training_actor_device_ids.append(device_id)

for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
-     # IMPORTANT: when creating vLLM instances, we need to
-     # make sure there are no GPU activities on the target GPUs,
-     # otherwise, they will interfere with the vLLM memory profiling,
-     # and cause unexpected behaviors.
+     # Use the following syntax instead of the @ray.remote decorator so that
+     # the placement group is customized for each bundle.
    llm = ray.remote(
        num_cpus=0,
        num_gpus=0,
@@ -125,35 +159,34 @@ def get_weight_ipc_handles(self):
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
-     # don't call any method on the inference engine here,
-     # otherwise it will block until the vLLM instance is created.
+     # Do not call any method on the inference engine at this point; the call
+     # blocks until the vLLM instance finishes initialization.

for i, llm in enumerate(inference_engines):
    inference_engine_device_ids.append(
        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
    )
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
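    # `collective_rpc` returns one result per worker, so with tensor parallel
    # size 2 each entry of `inference_engine_device_ids` holds two device
    # UUIDs, lining up with the two training actors placed on the same GPUs.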

- # check the placement
- # the first two training actors should be
- # on the same GPUs as the first inference engine
+ # Verify placement: the first two training actors share the same GPUs as
+ # the first inference engine.
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
- # the last two training actors should be
- # on the same GPUs as the second inference engine
+ # Verify placement: the last two training actors share the same GPUs as
+ # the second inference engine.
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]

- print("gather all the IPC handles from the training actors")
+ print("Gather all the IPC handles from the training actors.")
ipc_handles = {}
for actor in training_actors:
    ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
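# The merged dictionary maps each GPU's UUID to that GPU's parameter IPC
# handles, so each co-located vLLM worker can look up the entry for the
# device it runs on.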

- print("update the weights of the inference engines")
+ print("Update the weights of the inference engines.")
for llm in inference_engines:
    ray.get(
        llm.collective_rpc.remote(
            "update_weights_from_ipc_handles", args=(ipc_handles,)
        )
    )
- print("check if the weights are updated")
+ print("Check if the weights are updated.")
for llm in inference_engines:
    assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))