
Commit 690b9a2

Add hl.signal
stack-info: PR: #233, branch: joydddd/stack/8
1 parent 90baafc commit 690b9a2

File tree

6 files changed, +275 -1 lines changed


helion/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 from .scan_ops import associative_scan as associative_scan
 from .scan_ops import cumprod as cumprod
 from .scan_ops import cumsum as cumsum
+from .signal_wait import signal as signal
 from .signal_wait import wait as wait
 from .tile_ops import tile_begin as tile_begin
 from .tile_ops import tile_block_size as tile_block_size

helion/language/signal_wait.py

Lines changed: 108 additions & 0 deletions
@@ -6,13 +6,16 @@
 from torch.fx import has_side_effect
 
 from .. import exc
+from .._compiler.indexing_strategy import SubscriptIndexing
 from . import _decorators
 
 if TYPE_CHECKING:
     import ast
 
     from .._compiler.inductor_lowering import CodegenState
 
+__all__ = ["signal", "wait"]
+
 
 @has_side_effect
 @_decorators.api(tiles_as_sizes=True)
@@ -153,3 +156,108 @@ def _(state: CodegenState) -> ast.AST:
         signal=signal_expr,
         update=update_expr,
     )
+
+
+@has_side_effect
+@_decorators.api(tiles_as_sizes=True)
+def signal(
+    signal_pad: torch.Tensor,
+    index: list[object],
+    signal: int = 1,
+    op: str = "atomic_xchg",
+    sem: str = "release",
+    scope: str = "gpu",
+    skip_sync: bool = False,
+) -> torch.Tensor:
+    """Set the signal_pad slice to the signal value.
+    Args:
+        signal_pad: The signal pad to signal
+        index: Indices to index into the signal_pad tensor
+        signal: the value to send
+        op: The atomic memory op used to send the signal (default: 'atomic_xchg')
+        sem: The memory semantics of the atomic op (default: 'release')
+        scope: The scope of the atomic op (default: 'gpu')
+        skip_sync: Skip the syncthreads before sending the signal (default: False)
+    """
+    raise exc.NotInsideKernel
+
+
+@_decorators.prepare_args(signal)
+def _(
+    signal_pad: torch.Tensor,
+    index: list[object],
+    signal: int = 1,
+    op: str = "atomic_xchg",
+    sem: str = "release",
+    scope: str = "gpu",
+    skip_sync: bool = False,
+) -> tuple[torch.Tensor, object, int, str, str, str, bool]:
+    from helion.language.tile_proxy import Tile
+
+    valid_ops = {"atomic_add", "atomic_xchg"}
+    valid_sems = {"relaxed", "release", "acq_rel"}
+    valid_scopes = {"sys", "gpu"}
+
+    if op not in valid_ops:
+        raise ValueError(f"Invalid signal op '{op}'. Must be one of {valid_ops}.")
+
+    if sem not in valid_sems:
+        raise ValueError(
+            f"Invalid memory semantic '{sem}'. Must be one of {valid_sems}."
+        )
+
+    if scope not in valid_scopes:
+        raise ValueError(f"Invalid scope '{scope}'. Must be one of {valid_scopes}.")
+
+    index = Tile._prepare_index(index)
+    index = Tile._tiles_to_sizes(index)
+
+    return (signal_pad, index, signal, op, sem, scope, skip_sync)
+
+
+@_decorators.register_fake(signal)
+def _(
+    signal_pad: torch.Tensor,
+    index: list[object],
+    signal: int = 1,
+    op: str = "atomic_xchg",
+    sem: str = "release",
+    scope: str = "gpu",
+    skip_sync: bool = False,
+) -> torch.Tensor:
+    return signal_pad.new_empty(SubscriptIndexing.compute_shape(signal_pad, index))
+
+
+@_decorators.codegen(signal)
+def _(state: CodegenState) -> ast.AST:
+    import ast
+
+    from .._compiler.ast_extension import expr_from_string
+    from .._compiler.indexing_strategy import SubscriptIndexing
+
+    signal_pad = state.proxy_arg(0)
+    index = state.proxy_arg(1)
+    signal = state.proxy_arg(2)
+    op = state.proxy_arg(3)
+    sem = state.proxy_arg(4)
+    scope = state.proxy_arg(5)
+    skip_sync = state.proxy_arg(6)
+
+    assert isinstance(signal_pad, torch.Tensor)
+    assert isinstance(index, list)
+
+    indices = SubscriptIndexing.create(state, signal_pad, index)
+    signal_pad_name = state.device_function.tensor_arg(signal_pad).name
+
+    signal_expr = ast.Constant(value=signal)
+    assert type(op) is str
+    assert type(sem) is str
+    assert type(scope) is str
+
+    hl_ext_call_triton_send_signal = f"helion.runtime.triton_send_signal(addr={signal_pad_name} + offset, update=signal, sem='{sem}', scope='{scope}', op='{op}', skip_sync={skip_sync})"
+
+    return expr_from_string(
+        hl_ext_call_triton_send_signal,
+        offset=indices.index_expr,
+        signal=signal_expr,
+    )
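
For orientation, here is a minimal sketch of how the new hl.signal device op is meant to be called, modeled directly on the tests added later in this commit; the kernel name, pad size, and the "cuda" device string are illustrative assumptions, not part of the API:

import torch
import helion
import helion.language as hl

@helion.kernel
def send_signals(signal_pad: torch.Tensor) -> torch.Tensor:
    (n,) = signal_pad.shape
    for i in hl.grid(n):
        # Each program atomically sets its slot of the pad to 1
        # (atomic_xchg with release semantics at gpu scope by default).
        hl.signal(signal_pad, [i], signal=1)
    return signal_pad

# One int32 slot per program; after the launch every slot should read 1.
signal_pad = torch.zeros(4, device="cuda", dtype=torch.int32)
send_signals(signal_pad)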

helion/runtime/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 from .config import Config as Config
 from .kernel import Kernel as Kernel
 from .kernel import kernel as kernel
+from .triton_helpers import triton_send_signal as triton_send_signal
 from .triton_helpers import triton_wait_signal as triton_wait_signal
 
 
helion/runtime/triton_helpers.py

Lines changed: 61 additions & 1 deletion
@@ -3,7 +3,53 @@
 import triton
 import triton.language as tl
 
-__all__ = ["triton_wait_signal"]
+__all__ = ["triton_send_signal", "triton_wait_multiple_signal", "triton_wait_signal"]
+
+
+@triton.jit
+def triton_send_signal(
+    addr: tl.tensor,
+    update: tl.constexpr,
+    sem: tl.constexpr,
+    scope: tl.constexpr,
+    op: tl.constexpr,
+    skip_sync: tl.constexpr,
+) -> None:
+    """
+    Signal global memory barrier(s).
+
+    This function atomically sets global memory barriers to an update value,
+    signaling to other CTAs waiting on the barrier(s).
+
+    Args:
+        addr: Memory address of the barrier(s) to signal
+        update: Value to set the barrier(s) to
+        sem: Memory semantics for the atomic operation. Options: "release", "relaxed".
+        scope: Scope of the atomic operation. Options: "gpu", "sys"
+        op: Atomic operation type: "atomic_xchg", "atomic_add"
+        skip_sync: Skip CTA synchronization before setting the barrier. (default: False)
+    """
+    if not skip_sync:
+        tl.inline_asm_elementwise(
+            "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
+        )
+
+    tl.static_assert(
+        sem == "release" or sem == "relaxed",
+        "Invalid memory semantic. Options: 'release', 'relaxed'.",
+    )
+    tl.static_assert(
+        scope == "gpu" or scope == "sys", "Invalid scope. Options: 'gpu', 'sys'."
+    )
+
+    if op == "atomic_xchg":
+        tl.atomic_xchg(addr, update, sem=sem, scope=scope)
+    elif op == "atomic_add":
+        tl.atomic_add(addr, update, sem=sem, scope=scope)
+    else:
+        raise NotImplementedError(
+            f"Unsupported op '{op}' for sending a signal on a gmem barrier."
+        )
 
 
 @triton.jit
753

854

955
@triton.jit
@@ -71,3 +117,17 @@ def triton_wait_signal(
             "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
         )
     # tl.debug_barrier() cause significant performance loss. (Perhaps breaks triton prefetching?)
+
+
+@triton.jit
+def triton_wait_multiple_signal(
+    addr: tl.tensor,
+    expect: tl.constexpr,  # wait until the lock is set to expect
+    update: tl.constexpr,  # update the lock once it is acquired.
+    sem: tl.constexpr,
+    scope: tl.constexpr,
+    op: tl.constexpr,
+    skip_sync: tl.constexpr,
+) -> None:
+    raise NotImplementedError("Waiting on multiple barriers is not implemented yet.")
+    # TODO(joydddd): wait on multiple barriers at the same time, where each thread waits on a different barrier
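
To make the lowering concrete, here is a hand-written Triton kernel calling the new helper directly, mirroring the call that Helion's codegen emits in the expected outputs below; the kernel name and launch configuration are illustrative assumptions:

import torch
import triton
import triton.language as tl
from helion.runtime import triton_send_signal

@triton.jit
def send_one_signal_kernel(signal_pad, signal_pad_stride_0):
    pid_0 = tl.program_id(0)
    # Atomically exchange slot pid_0 of the pad to 1 with release semantics at gpu scope,
    # syncing the CTA first (skip_sync=False), as the generated Helion code does.
    triton_send_signal(addr=signal_pad + pid_0 * signal_pad_stride_0, update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=False)

signal_pad = torch.zeros(4, device="cuda", dtype=torch.int32)
send_one_signal_kernel[(4,)](signal_pad, signal_pad.stride(0))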

test/test_signal_wait.expected

Lines changed: 51 additions & 0 deletions
@@ -1,6 +1,57 @@
 This file is automatically generated by assertExpectedJournal calls in test_signal_wait.py.
 Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
 
+--- assertExpectedJournal(TestWait.test_signal_basic)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+@triton.jit
+def _gmem_signal_scalar_bar_kernel_kernel(signal_pad, signal_pad_stride_0):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    helion.runtime.triton_send_signal(addr=signal_pad + offset_0 * signal_pad_stride_0, update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=False)
+
+def gmem_signal_scalar_bar_kernel(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    _gmem_signal_scalar_bar_kernel_kernel[n,](signal_pad, signal_pad.stride(0), num_warps=4, num_stages=3)
+    return signal_pad
+
+def _gmem_signal_scalar_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_signal_scalar_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), num_warps=4, num_stages=3)
+
+--- assertExpectedJournal(TestWait.test_signal_multiple)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+@triton.jit
+def _gmem_signal_tensor_bar_kernel_kernel(signal_pad, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    helion.runtime.triton_send_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=False)
+
+def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    _BLOCK_SIZE_0 = 4
+    _gmem_signal_tensor_bar_kernel_kernel[triton.cdiv(n, _BLOCK_SIZE_0),](signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return signal_pad
+
+def _gmem_signal_tensor_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_signal_tensor_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+
 --- assertExpectedJournal(TestWait.test_wait_2d_tile)
 from __future__ import annotations
 
test/test_signal_wait.py

Lines changed: 53 additions & 0 deletions
@@ -54,6 +54,59 @@ def wait_for_2d_tile_kernel(
         torch.testing.assert_close(result, x)
         self.assertExpectedJournal(code)
 
+    def test_signal_basic(self):
+        @helion.kernel
+        def gmem_signal_scalar_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
+            (n,) = signal_pad.shape
+            for i in hl.grid(n):
+                hl.signal(signal_pad, [i], signal=1)
+            return signal_pad
+
+        signal_pad = torch.zeros(4, device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(gmem_signal_scalar_bar_kernel, (signal_pad,))
+        torch.testing.assert_close(
+            result, torch.ones(4, device=DEVICE, dtype=torch.int32)
+        )
+        self.assertExpectedJournal(code)
+
+    def test_signal_multiple(self):
+        @helion.kernel
+        def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
+            (n,) = signal_pad.shape
+            for tile in hl.tile(n):
+                hl.signal(signal_pad, [tile], signal=1)
+            return signal_pad
+
+        signal_pad = torch.zeros(16, device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(
+            gmem_signal_tensor_bar_kernel,
+            (signal_pad,),
+            block_size=[4],
+        )
+        torch.testing.assert_close(
+            result, torch.ones(16, device=DEVICE, dtype=torch.int32)
+        )
+        self.assertExpectedJournal(code)
+
+    def test_sent_recieve_cta(self):
+        @helion.kernel
+        def gmem_signal_n_wait_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
+            (n,) = signal_pad.shape
+            for i in hl.grid(n):  # first N CTAs send the signal
+                hl.signal(signal_pad, [i], signal=1)
+            for i in hl.grid(n):  # last N CTAs wait for the signal
+                hl.wait(signal_pad, [i], signal=1)
+            return signal_pad
+
+        signal_pad = torch.zeros(4, device=DEVICE, dtype=torch.int32)
+
+        code, result = code_and_output(gmem_signal_n_wait_kernel, (signal_pad,))
+        torch.testing.assert_close(
+            result, torch.ones(4, device=DEVICE, dtype=torch.int32)
+        )
+        self.assertIn("helion.runtime.triton_send_signal", code)
+        self.assertIn("helion.runtime.triton_wait_signal", code)
+
 
 if __name__ == "__main__":
     unittest.main()

0 commit comments