Add examples/segment_reduction.py (#300)

jansel · web-flow · commit 25b3ab9db1fd · 2025-07-13T08:53:38.000-07:00
diff --git a/examples/segment_reduction.py b/examples/segment_reduction.py
@@ -0,0 +1,151 @@
+# Code based on https://github.com/pytorch-labs/helion/issues/237
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+import helion
+from helion._testing import DEVICE
+from helion._testing import run_example
+import helion.language as hl
+
+
+def combine_fn_helion(
+    left_values: torch.Tensor,
+    left_indices: torch.Tensor,
+    right_values: torch.Tensor,
+    right_indices: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    combined_values = torch.where(
+        left_indices == right_indices, left_values + right_values, right_values
+    )
+    return combined_values, right_indices
+
+
+@helion.kernel()
+def segmented_reduction_helion(
+    indices: torch.Tensor, input_data: torch.Tensor, num_nodes: int
+) -> torch.Tensor:
+    num_elements, num_features = input_data.shape
+    output = torch.zeros(
+        (num_nodes, num_features), dtype=input_data.dtype, device=input_data.device
+    )
+    for tile_e, tile_f in hl.tile([num_elements, num_features]):
+        vals = input_data[tile_e, tile_f]
+        idxs = indices[tile_e]
+        idxs_next = hl.load(
+            indices, [tile_e.index + 1], extra_mask=tile_e.index < num_elements - 1
+        )
+        tuple_in = (vals, idxs.float().unsqueeze(1).expand_as(vals))
+        out_vals, _ = hl.associative_scan(combine_fn_helion, tuple_in, dim=0)
+        mask = (idxs != idxs_next) | (
+            tile_e.index % tile_e.block_size == tile_e.block_size - 1
+        )
+        segment_vals = torch.where(mask.unsqueeze(1), out_vals, 0.0)
+        hl.atomic_add(output, [idxs, tile_f], segment_vals)
+    return output
+
+
+@triton.jit
+def combine_fn_triton(left_values, left_indices, right_values, right_indices):
+    same_segment = left_indices == right_indices
+    combined_values = tl.where(same_segment, left_values + right_values, right_values)
+    combined_indices = right_indices
+    return combined_values, combined_indices
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {"BLOCK_SIZE": bs},
+        )
+        for bs in [8, 16, 32, 64, 128]
+    ],
+    key=["C"],
+    restore_value=["out_ptr"],
+)
+@triton.jit
+def _segmented_reduction_triton(
+    index,  # the input index tensor
+    in_ptr,  # the input tensor
+    out_ptr,  # the output value tensor
+    E: tl.constexpr,  # Number of elements in the input tensor (1d)
+    C: tl.constexpr,  # Number of features in the input tensor (2d)
+    BLOCK_SIZE: tl.constexpr,  # Block size for the scan
+):
+    # Triton version adapted from
+    # https://github.com/fishmingyu/GeoT/blob/main/geot/triton/seg_reduction.py
+    pid = tl.program_id(axis=0)
+    offset_pid = pid // C
+    feature_id = pid % C
+    offsets = offset_pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < E
+
+    # Load input data
+    vals = tl.load(in_ptr + offsets * C + feature_id, mask=mask)
+    idxs = tl.load(index + offsets, mask=mask)
+    idxs_next = tl.load(index + offsets + 1, offsets < E - 1)
+
+    # Perform an inclusive scan using tl.associative_scan
+    result_values, _ = tl.associative_scan(
+        (
+            vals,
+            idxs,
+        ),
+        axis=0,
+        combine_fn=combine_fn_triton,
+    )
+    # Marks each segment's last element and the block's last lane (flush points)
+    segment_start = (idxs != idxs_next) | (offsets % BLOCK_SIZE == BLOCK_SIZE - 1)
+    tl.atomic_add(out_ptr + idxs * C + feature_id, result_values, mask & segment_start)
+
+
+def segmented_reduction_triton(indices, input_data, num_nodes):
+    E, C = input_data.shape
+    output = torch.zeros(
+        (num_nodes, C), dtype=input_data.dtype, device=input_data.device
+    )
+
+    def grid(META):
+        return (triton.cdiv(E, META["BLOCK_SIZE"]) * C,)
+
+    _segmented_reduction_triton[grid](indices, input_data, output, E, C)
+    return output
+
+
+def segmented_reduction_pytorch(indices, input_data, num_nodes):
+    # Run PyTorch reference (scatter_add equivalent)
+    num_features = input_data.size(1)
+    pytorch_output = torch.zeros(
+        num_nodes, num_features, device=input_data.device, dtype=input_data.dtype
+    )
+    pytorch_output.scatter_add_(
+        0, indices.unsqueeze(1).expand(-1, num_features), input_data
+    )
+    return pytorch_output
+
+
+def main():
+    num_nodes = 100
+    num_edges = 2000
+    num_features = 128
+
+    dtype = torch.float32
+
+    # Create sorted indices for segmented reduction
+    indices = torch.randint(0, num_nodes, (num_edges,), device=DEVICE).sort()[0]
+    input_data = torch.randn(num_edges, num_features, device=DEVICE, dtype=dtype)
+
+    run_example(
+        segmented_reduction_helion,
+        {
+            "triton": segmented_reduction_triton,
+            "pytorch": segmented_reduction_pytorch,
+        },
+        (indices, input_data, num_nodes),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/test_examples.expected b/test/test_examples.expected
@@ -1373,6 +1373,84 @@ def _rms_norm_make_precompiler(x: torch.Tensor, weight: torch.Tensor, eps: float
     from helion.runtime.precompile_shim import make_precompiler
     return make_precompiler(_rms_norm_kernel)(x, weight, out, eps, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
 
+--- assertExpectedJournal(TestExamples.test_segment_reduction)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_compat import libdevice
+
+import helion._testing.segment_reduction as _source_module
+
+@triton.jit
+def helper_function_0(param_0, param_1, param_2, param_3):
+    v_0 = param_1 == param_3
+    v_1 = param_0 + param_2
+    v_2 = tl.where(v_0, v_1, param_2)
+    return (v_2, param_3)
+
+@triton.jit
+def _segmented_reduction_helion_kernel(input_data, indices, output, indices_stride_0, input_data_stride_0, input_data_stride_1, output_stride_0, output_stride_1, num_elements, num_features, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(num_elements, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < num_elements
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < num_features
+    vals = tl.load(input_data + (indices_0[:, None] * input_data_stride_0 + indices_1[None, :] * input_data_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    idxs = tl.load(indices + indices_0 * indices_stride_0, mask_0, other=0)
+    v_0 = tl.full([], 1, tl.int32)
+    v_1 = indices_0 + v_0
+    sub = -1 + num_elements
+    v_2 = sub.to(tl.int32)
+    v_3 = indices_0 < v_2
+    idxs_next = tl.load(indices + v_1 * indices_stride_0, mask_0 & v_3, other=0)
+    v_4 = idxs.to(tl.float32)
+    unsqueeze = v_4[:, None]
+    expand = tl.broadcast_to(unsqueeze, [_BLOCK_SIZE_0, _BLOCK_SIZE_1])
+    out_vals = tl.associative_scan((vals, expand), 0, helper_function_0)[0]
+    v_5 = idxs != idxs_next
+    _BLOCK_SIZE_0_ = _BLOCK_SIZE_0
+    v_6 = _BLOCK_SIZE_0_.to(tl.int32)
+    v_7 = indices_0 % v_6
+    v_8 = tl.full([], 0, tl.int32)
+    v_9 = v_7 != v_8
+    v_10 = libdevice.signbit(v_7) != 0 if v_7.dtype is tl.float32 else v_7 < 0
+    v_11 = libdevice.signbit(v_6) != 0 if v_6.dtype is tl.float32 else v_6 < 0
+    v_12 = v_10 != v_11
+    v_13 = v_9 & v_12
+    v_14 = v_7 + v_6
+    v_15 = tl.where(v_13, v_14, v_7)
+    sub_1 = -1 + _BLOCK_SIZE_0
+    v_16 = sub_1.to(tl.int32)
+    v_17 = v_15 == v_16
+    v_18 = v_5 | v_17
+    unsqueeze_1 = v_18[:, None]
+    v_19 = 0.0
+    v_20 = v_19[None, None]
+    v_21 = tl.where(unsqueeze_1, out_vals, v_20)
+    tl.atomic_add(output + (idxs[:, None] * output_stride_0 + indices_1[None, :] * output_stride_1), v_21, mask=mask_0[:, None] & mask_1[None, :], sem='relaxed')
+
+def segmented_reduction_helion(indices: torch.Tensor, input_data: torch.Tensor, num_nodes: int):
+    num_elements, num_features = input_data.shape
+    output = torch.zeros((num_nodes, num_features), dtype=input_data.dtype, device=input_data.device)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _segmented_reduction_helion_kernel[triton.cdiv(num_elements, _BLOCK_SIZE_0) * triton.cdiv(num_features, _BLOCK_SIZE_1),](input_data, indices, output, indices.stride(0), input_data.stride(0), input_data.stride(1), output.stride(0), output.stride(1), num_elements, num_features, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return output
+
+def _segmented_reduction_helion_make_precompiler(indices: torch.Tensor, input_data: torch.Tensor, num_nodes: int):
+    num_elements, num_features = input_data.shape
+    output = torch.zeros((num_nodes, num_features), dtype=input_data.dtype, device=input_data.device)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_segmented_reduction_helion_kernel)(input_data, indices, output, indices.stride(0), input_data.stride(0), input_data.stride(1), output.stride(0), output.stride(1), num_elements, num_features, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+
 --- assertExpectedJournal(TestExamples.test_softmax)
 from __future__ import annotations
 
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -498,6 +498,31 @@ def test_jagged_mean(self):
             )
         )
 
+    def test_segment_reduction(self):
+        num_nodes = 100
+        num_edges = 1000
+        num_features = 32
+        dtype = torch.float32
+
+        # Create sorted indices for segmented reduction
+        indices = torch.randint(0, num_nodes, (num_edges,), device=DEVICE).sort()[0]
+        input_data = torch.randn(num_edges, num_features, device=DEVICE, dtype=dtype)
+
+        args = (indices, input_data, num_nodes)
+
+        # Import and use the reference implementation
+        mod = import_path(EXAMPLES_DIR / "segment_reduction.py")
+        expected = mod.segmented_reduction_pytorch(*args)
+
+        self.assertExpectedJournal(
+            check_example(
+                "segment_reduction",
+                args,
+                expected,
+                fn_name="segmented_reduction_helion",
+            )
+        )
+
 
 if __name__ == "__main__":
     unittest.main()