Commit 853487b

crypdick and hmellor authored
[Docs] Improve docs for RLHF co-location example (#20599)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent 9ff2af6 commit 853487b

1 file changed (+74, -41 lines)

examples/offline_inference/rlhf_colocate.py

Lines changed: 74 additions & 41 deletions
@@ -1,14 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-a simple demonstration to show how to co-locate
-vLLM worker with training actors on the same GPUs,
-for RLHF-like applications.
-The key points:
-- Control the placement of the vLLM workers with Ray, by setting
-  VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
-- Use cuda-ipc to pass tensors, since NCCL does not work when we have
-  multiple processes on the same GPU.
+Demonstrates how to co-locate a vLLM inference worker and training
+actors on the same set of GPUs for reinforcement learning from human feedback
+(RLHF) workloads.
+
+Ray serves as the distributed execution framework in this example. Ray
+placement groups allocate both training actors and vLLM workers to the
+same GPU bundles, enabling fast, in-GPU communication between the two
+components.
+
+The script shows how to do the following:
+
+* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
+  `VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
+  devices.
+* Exchange tensors between processes by means of CUDA inter-process
+  communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
+  when multiple processes share a single GPU.
+
+Note that this example assumes a single-node cluster with four GPUs, but Ray
+supports multi-node clusters. vLLM expects exclusive use of the GPUs during
+its initialization for memory profiling. Residual GPU activity interferes
+with vLLM memory profiling and causes unexpected behavior.
+
+Learn more about Ray placement groups:
+https://docs.ray.io/en/latest/placement-groups.html
 """
 
 import os
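
[Editor's note] For readers new to Ray placement groups, the following standalone sketch (an editorial illustration, not part of the commit) shows the core pattern the new docstring describes: reserve one GPU bundle per device, then pin an actor that requests a fractional GPU to a specific bundle. It assumes a node with four GPUs, mirroring the example; the `Probe` actor is a hypothetical name, while `placement_group` and `PlacementGroupSchedulingStrategy` are real Ray APIs.

import os

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init()

# Reserve one bundle per GPU. Several actors that each request a fraction
# of a GPU (e.g. 0.4 + 0.4) can later share a single bundle.
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())


@ray.remote(num_gpus=0.4)
class Probe:
    def visible_devices(self) -> str:
        # Ray sets CUDA_VISIBLE_DEVICES for the actor based on its bundle.
        return os.environ.get("CUDA_VISIBLE_DEVICES", "")


# Pin the actor to bundle 0 of the placement group.
probe = Probe.options(
    scheduling_strategy=PlacementGroupSchedulingStrategy(
        placement_group=pg, placement_group_bundle_index=0
    )
).remote()
print(ray.get(probe.visible_devices.remote()))
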
@@ -22,31 +39,50 @@
 
 
 class MyLLM(LLM):
-    def __init__(self, *args, bundle_indices: list, **kwargs):
-        # a hack to make the script work.
-        # stop ray from manipulating CUDA_VISIBLE_DEVICES
-        # at the top-level
+    """Configure the vLLM worker for Ray placement group execution.
+
+    The constructor sets environment variables that allow multiple vLLM
+    workers to share a single physical GPU and that encode the bundle
+    indices assigned by the placement group.
+
+    Args:
+        *args: Positional arguments forwarded to `vllm.LLM`.
+        bundle_indices (list[int]): Placement-group bundle indices
+            assigned to this worker.
+        **kwargs: Keyword arguments forwarded to `vllm.LLM`.
+    """
+
+    def __init__(self, *args, bundle_indices: list[int], **kwargs):
+        # Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
+        # so that vLLM can manage its own device placement inside the worker.
         os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-        # every worker will use 0.4 GPU, so that we can schedule
-        # 2 instances on the same GPUs.
+        # Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
         os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
         print(f"creating LLM with bundle_indices={bundle_indices}")
         super().__init__(*args, **kwargs)
 
 
 class RayTrainingActor:
+    """Training actor that hosts a Facebook OPT-125M model from Hugging Face.
+
+    The model is loaded onto the first GPU assigned to this actor, and the
+    actor exposes CUDA IPC handles so that co-located vLLM workers can map
+    its tensors directly.
+    """
+
     def __init__(self):
-        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
+        # Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
         from transformers import AutoModelForCausalLM
 
         self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
         self.model.to("cuda:0")
+        # Zero out all the parameters.
         for name, p in self.model.named_parameters():
             p.data.zero_()
         torch.cuda.synchronize()
-        # the argument for get_device_uuid is the index
-        # of the GPU in the visible devices.
+        # The argument for `get_device_uuid` is the index of the GPU in the
+        # list of visible devices.
         from vllm.platforms import current_platform
 
         self.device_uuid = current_platform.get_device_uuid(0)
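
[Editor's note] As background for the IPC handles mentioned in the new docstring, here is a minimal sketch (editorial, not part of the commit; assumes a single visible CUDA device) of what `reduce_tensor` produces: a `(rebuild_fn, args)` pair in which `args` carries a CUDA IPC handle instead of a copy of the data.

import torch
from torch.multiprocessing.reductions import reduce_tensor

weight = torch.zeros(8, device="cuda:0")

# reduce_tensor() serializes the CUDA tensor into a rebuild function plus
# its arguments; the arguments embed a CUDA IPC handle, not the data itself.
rebuild_fn, args = reduce_tensor(weight.detach())

# A consumer process that can see the same physical GPU would receive this
# pair (for example through a Ray object) and call rebuild_fn(*args) there
# to map the producer's GPU memory directly, with no device-to-device copy.
ipc_payload = (rebuild_fn, args)
print(f"rebuild with {rebuild_fn.__name__} from {len(args)} IPC arguments")
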
@@ -59,23 +95,23 @@ def get_weight_ipc_handles(self):
 
         data = {}
         for name, p in self.model.named_parameters():
-            # the training actor might only have a subset of the weights
-            # and need to all-gather the weights from all the actors.
-            # for demonstration, here we assume all training actors have
-            # the full weights.
+            # A training actor might hold only a subset of the weights and may
+            # need to gather weights from other actors. For demonstration
+            # purposes, each training actor owns the full weight set.
             data[name] = reduce_tensor(p.detach())
         return {self.device_uuid: data}
 
 
-# ray manages 4 GPUs
+# Ray manages four GPUs.
+
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 ray.init()
 
-# we want to co-locate vLLM instance and the training actor
-# on the same set of GPUs.
-# the placement plan is as follows:
-# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
-# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
+# Co-locate vLLM instances and training actors on the same set of GPUs:
+# * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
+#   (tensor parallelism = 2).
+# * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
+#   (tensor parallelism = 2).
 
 pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
 ray.get(pg.ready())
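
[Editor's note] The code that creates the four training actors sits between this hunk and the next and is not shown in the diff. Purely as an editorial sketch (the variable names follow the surrounding example, and `PlacementGroupSchedulingStrategy` is assumed to be imported from `ray.util.scheduling_strategies`), the pattern is to pin one RayTrainingActor per bundle with a fractional GPU request so that a vLLM worker can later share the same device:

from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

for bundle_index in range(4):
    # num_gpus=0.4 leaves room in the bundle for a co-located vLLM worker.
    actor = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_bundle_index=bundle_index,
        ),
    )(RayTrainingActor).remote()
    training_actors.append(actor)
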
@@ -104,10 +140,8 @@ def get_weight_ipc_handles(self):
     training_actor_device_ids.append(device_id)
 
 for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
-    # IMPORTANT: when creating vLLM instances, we need to
-    # make sure there are no GPU activities on the target GPUs,
-    # otherwise, they will interfere with the vLLM memory profiling,
-    # and cause unexpected behaviors.
+    # Use the following syntax instead of the @ray.remote decorator so that
+    # the placement group is customized for each bundle.
     llm = ray.remote(
         num_cpus=0,
         num_gpus=0,
@@ -125,35 +159,34 @@ def get_weight_ipc_handles(self):
         bundle_indices=bundle_indices,
     )
     inference_engines.append(llm)
-    # don't call any method on the inference engine here,
-    # otherwise it will block until the vLLM instance is created.
+    # Do not call any method on the inference engine at this point; the call
+    # blocks until the vLLM instance finishes initialization.
 
 for i, llm in enumerate(inference_engines):
     inference_engine_device_ids.append(
         ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
     )
     print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
 
-# check the placement
-# the first two training actors should be
-# on the same GPUs as the first inference engine
+# Verify placement: the first two training actors share the same GPUs as
+# the first inference engine.
 assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
-# the last two training actors should be
-# on the same GPUs as the second inference engine
+# Verify placement: the last two training actors share the same GPUs as
+# the second inference engine.
 assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
 
-print("gather all the IPC handles from the training actors")
+print("Gather all the IPC handles from the training actors.")
 ipc_handles = {}
 for actor in training_actors:
     ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
 
-print("update the weights of the inference engines")
+print("Update the weights of the inference engines.")
 for llm in inference_engines:
     ray.get(
         llm.collective_rpc.remote(
             "update_weights_from_ipc_handles", args=(ipc_handles,)
         )
     )
-print("check if the weights are updated")
+print("Check if the weights are updated.")
 for llm in inference_engines:
     assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
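
[Editor's note] The worker-side methods invoked through `collective_rpc` (`report_device_id`, `update_weights_from_ipc_handles`, `check_weights_changed`) live in a worker extension that is not part of this diff. Purely as an illustrative sketch, with `self.device_uuid` and `self.model_runner.model` as assumed attribute names, such an update method could rebuild each tensor from its IPC handle and load it into the engine's weights:

import torch


class ColocateWorkerExtensionSketch:
    """Illustrative only: assumes self.device_uuid and self.model_runner exist."""

    def update_weights_from_ipc_handles(self, ipc_handles):
        # Select the handles exported by the training actor on this worker's GPU.
        handles = ipc_handles[self.device_uuid]
        weights = []
        for name, (rebuild_fn, args) in handles.items():
            # Rebuilding maps the training actor's GPU memory into this process;
            # if the two processes order visible devices differently, the device
            # index inside `args` may need remapping first.
            weights.append((name, rebuild_fn(*args)))
        self.model_runner.model.load_weights(weights=weights)
        torch.cuda.synchronize()
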
