Commit 2c79dd9

One shot all reduce & symm mem sync
stack-info: PR: #245, branch: joydddd/stack/12
1 parent dbb9579 commit 2c79dd9

2 files changed: 75 additions & 1 deletion

helion/language/signal_wait.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ def _(
             f"Invalid memory semantic '{sem}'. Must be one of {valid_sems}."
         )

-    if op == "atomic_cas" and not update:
+    if op == "atomic_cas" and update is None:
         raise ValueError(
             f"{op} without an update value. Do you want to use 'ld' instead? "
         )
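
This guard appears to live in the wait-op argument validation. The change matters because update=0 is a legitimate compare-and-swap value but is falsy in Python, so the old `not update` check rejected it; `update is None` only rejects a truly missing value. A minimal standalone sketch of the pitfall (the check() helper below is hypothetical and only mirrors the shape of the validation; it is not helion API):

from __future__ import annotations

def check(op: str, update: int | None) -> str:
    # Mirrors the fixed validation: only a *missing* update is an error.
    if op == "atomic_cas" and update is None:
        raise ValueError(f"{op} without an update value. Do you want to use 'ld' instead? ")
    return "ok"

assert check("atomic_cas", update=0) == "ok"  # 0 is a valid update; `not update` would have rejected it
try:
    check("atomic_cas", update=None)          # a genuinely missing update still raises
except ValueError:
    pass

The test added below exercises exactly this case: it calls hl.wait(..., update=0, op="atomic_cas").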

test/test_distributed.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
from __future__ import annotations

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
from torch.testing._internal.common_distributed import MultiProcessTestCase
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import instantiate_parametrized_tests
from torch.testing._internal.common_utils import run_tests

import helion
import helion.language as hl

from helion._testing import code_and_output


@helion.jit
def symm_mem_sync_kernel(
    remote_signal_pad_pointer: torch.Tensor,  # shape[world_size]
    local_signal_pad: torch.Tensor,
    rank: int,
) -> None:
    N, world_size = local_signal_pad.size()
    for n in hl.grid(N):
        for tile in hl.tile(world_size, block_size=world_size):
            peer_bars = remote_signal_pad_pointer[tile] + n * world_size + rank
            hl.signal(peer_bars, [tile], signal=1, scope="sys", skip_sync=True)
            hl.wait(local_signal_pad, [n, tile], signal=1, update=0, scope="sys", op="atomic_cas")


@instantiate_parametrized_tests
class SymmMemBarrier(MultiProcessTestCase):
    def setUp(self) -> None:
        super().setUp()
        self._spawn_processes()

    @property
    def world_size(self) -> int:
        # world_size > 2 is needed to verify accumulation order
        return 4

    @property
    def device(self) -> torch.device:
        return torch.device(f"cuda:{self.rank}")

    def _init_process(self):
        torch.cuda.set_device(self.device)
        store = dist.FileStore(self.file_name, self.world_size)
        dist.init_process_group(
            backend="nccl",
            world_size=self.world_size,
            rank=self.rank,
            store=store,
        )
        torch.manual_seed(42 + self.rank)

    @skip_if_lt_x_gpu(4)
    def test_symm_mem_barrier(self):
        self._init_process()
        t = symm_mem.empty(4096, device=self.device)
        symm_mem_hdl = symm_mem.rendezvous(t, group=dist.group.WORLD)
        local_signal_pad_t = symm_mem_hdl.get_buffer(symm_mem_hdl.rank, (32, symm_mem_hdl.world_size), dtype=torch.int32)
        signal_pad_pointers_t = torch.as_tensor(symm_mem_hdl.signal_pad_ptrs, dtype=torch.uint64).to(self.device)

        code, result = code_and_output(symm_mem_sync_kernel, (signal_pad_pointers_t, local_signal_pad_t, symm_mem_hdl.rank))

        signal_pad = symm_mem_hdl.get_signal_pad(symm_mem_hdl.rank)
        assert signal_pad.eq(0).all().item()

        dist.destroy_process_group()


if __name__ == "__main__":
    run_tests()
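
For context, the synchronization the Helion kernel implements is the device-side counterpart of the barrier the symmetric-memory handle already exposes in eager mode: each rank sets a slot in every peer's signal pad, then waits on (and clears, via atomic_cas with update=0) its own slots. A minimal eager-mode sketch for comparison, assuming a NCCL process group is already initialized on every rank and that the installed PyTorch build exposes the handle's barrier() method:

import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

# Assumes dist.init_process_group("nccl", ...) has already run and
# torch.cuda.set_device(...) points at this rank's GPU.
t = symm_mem.empty(4096, device=f"cuda:{torch.cuda.current_device()}")
hdl = symm_mem.rendezvous(t, group=dist.group.WORLD)
hdl.barrier(channel=0)  # assumed available on this build; every rank blocks until all peers have signaled

Because the test class derives from MultiProcessTestCase and spawns its own workers in setUp(), it can be run directly (python test/test_distributed.py); skip_if_lt_x_gpu(4) skips it on hosts with fewer than four GPUs.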
