Add dp initialize patch with hccl backend (#626)

ganyi1996ppo · web-flow · commit e74331a1ede3 · 2025-04-23T15:47:51.000+08:00
### What this PR does / why we need it?  Add dp stateless process group initialization path with hccl backend as vllm-ascend patch. ### Does this PR introduce _any_ user-facing change?  ### How was this patch tested?  --------- Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
@@ -87,7 +87,15 @@
 #    Future Plan:
 #       Its a workaround in vllm-ascend to enable multi-node dp inference, maybe removed if vllm have better plan
 #       on multi-node dp inference implementation
-#
+#   4. `ParallelConfig.stateless_init_dp_group`
+#    Why:
+#       vLLM uses the gloo backend by default to initialize the stateless dp process group, but we want to use hccl
+#       here to get better performance
+#    How：
+#       adopt the hccl backend to init the process group
+#    Related PR (if no, explain why): no related PR, we want to add this ability to vllm
+#    Future Plan:
+#       Remove this patch once vllm merges the equivalent change
 # * Worker Patch:
 # ===============
 # ** File: worker/patch_0_8_4/patch_metrics.py **
diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py
@@ -152,6 +152,21 @@ def parallel_config_get_dp_port(self) -> int:
     return port
 
 
+def ascend_stateless_init_dp_group(self) -> "ProcessGroup":
+    """Initialize and return the data-parallel process group on the hccl backend."""
+    # NOTE(review): helper comes from .utils, not the vllm.distributed name patched below -- confirm it accepts "hccl".
+    from vllm.distributed.utils import \
+        stateless_init_torch_distributed_process_group
+
+    return stateless_init_torch_distributed_process_group(
+        self.data_parallel_master_ip,
+        self.get_next_dp_init_port(),
+        self.data_parallel_rank,
+        self.data_parallel_size,
+        backend="hccl")
+
+
 vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
 vllm.distributed.stateless_init_torch_distributed_process_group = ascend_stateless_init_torch_distributed_process_group
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
+ParallelConfig.stateless_init_dp_group = ascend_stateless_init_dp_group  # dp group init now requests the hccl backend