@@ -47,7 +47,7 @@ def ascend_destroy_model_parallel():
     destory_ascend_model_parallel()
 
 
-def ascend_stateless_init_torch_distributed_process_group(
+def stateless_init_torch_distributed_process_group(
         host: str, port: int, rank: int, world_size: int,
         backend: str) -> ProcessGroup:
     """
@@ -96,10 +96,16 @@ def ascend_stateless_init_torch_distributed_process_group(
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
 
+    # TODO(Yizhou): The reason we need to set options while vllm does not
+    # seems to be related to the version of PyTorch. In the latest version,
+    # there is no need to set options. While in the older version, 2.5.1
+    # specifically, we need to set options.
+    options = ProcessGroup.Options(backend=backend)
     pg: ProcessGroup = ProcessGroup(
         prefix_store,
         group_rank,
         group_size,
+        options,
     )
     if backend == "gloo":
         from torch.distributed.distributed_c10d import ProcessGroupGloo
@@ -136,7 +142,10 @@ def ascend_stateless_init_torch_distributed_process_group(
     else:
         raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
 
-    pg._set_default_backend(backend_type)
+    # TODO(Yizhou): Like we mentioned above, _set_default_backend is not
+    # implemented in the 2.5.1 version of PyTorch. But we need to set it
+    # after the latest version is released.
+    # pg._set_default_backend(backend_type)
     backend_class._set_sequence_number_for_group()
 
     pg._register_backend(device, backend_type, backend_class)
@@ -163,20 +172,21 @@ def parallel_config_get_dp_port(self) -> int:
 
 
 def ascend_stateless_init_dp_group(self) -> "ProcessGroup":
-    from vllm.distributed.utils import \
-        stateless_init_torch_distributed_process_group
-
+    # TODO(Yizhou): Currently we have to set the backend to gloo
+    # because in vllm.config.ParallelConfig.has_unfinished_dp the
+    # device is set to cpu. We need to fix this in the future.
+    # We need to compare the performance of gloo and hccl and then
+    # decide which one to use.
     dp_group = stateless_init_torch_distributed_process_group(
         self.data_parallel_master_ip,
         self.get_next_dp_init_port(),
         self.data_parallel_rank,
         self.data_parallel_size,
-        backend="hccl")
+        backend="gloo")
 
     return dp_group
 
 
 vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
-vllm.distributed.stateless_init_torch_distributed_process_group = ascend_stateless_init_torch_distributed_process_group
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
 ParallelConfig.stateless_init_dp_group = ascend_stateless_init_dp_group
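
The two TODOs above trace back to the same PyTorch version gap (2.5.1 vs. newer releases), so one way to keep a single code path would be to branch on the installed torch version. Below is a minimal, hypothetical sketch of that idea; it reuses only the calls that already appear in this diff (ProcessGroup.Options and pg._set_default_backend), and the helper names _pg_takes_options and build_process_group are made up for illustration and do not exist in vllm or vllm-ascend.

# Hypothetical sketch: pick the 2.5.1-style or newer-style ProcessGroup
# construction based on the installed PyTorch version.
import torch
from torch.distributed import ProcessGroup


def _pg_takes_options() -> bool:
    # PyTorch 2.5.x expects an explicit Options argument; newer releases
    # drop it and expose _set_default_backend instead (see the TODOs above).
    major, minor = (int(part) for part in torch.__version__.split(".")[:2])
    return (major, minor) <= (2, 5)


def build_process_group(prefix_store, group_rank, group_size,
                        backend, backend_type) -> ProcessGroup:
    if _pg_takes_options():
        # Older path: pass Options at construction time.
        options = ProcessGroup.Options(backend=backend)
        pg = ProcessGroup(prefix_store, group_rank, group_size, options)
    else:
        # Newer path: construct without options, then set the default backend.
        pg = ProcessGroup(prefix_store, group_rank, group_size)
        pg._set_default_backend(backend_type)
    return pg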