Codegen `if tl.sum(one_elem_tensor):` instead of `if one_elem_tensor:` (#158)

yf225 · web-flow · commit 46b617d3d186 · 2025-06-14T15:57:24.000+02:00
diff --git a/helion/_compiler/device_ir.py b/helion/_compiler/device_ir.py
@@ -35,6 +35,7 @@
 from .ast_extension import LoopType
 from .ast_extension import NodeVisitor
 from .ast_extension import create
+from .ast_extension import expr_from_string
 from .ast_read_writes import ReadWrites
 from .compile_environment import CompileEnvironment
 from .host_function import HostFunction
@@ -232,6 +233,14 @@ def name(self) -> str:
 
     def codegen(self, state: CodegenState) -> list[object]:
         test = state.ast_arg(0)
+
+        test_proxy = state.proxy_arg(0)
+        if isinstance(test_proxy, torch.Tensor) and test_proxy.numel() == 1:
+            # Triton does not support `if one_elem_tensor:` but supports `if scalar:`,
+            # so wrap the test in tl.sum to reduce the one-element tensor to a scalar.
+            test_code = ast.unparse(test)
+            test = expr_from_string(f"tl.sum({test_code})")
+
         args = state.ast_args[2]
         assert isinstance(args, list)
         assert all(isinstance(x, ast.AST) for x in args)
diff --git a/test/test_control_flow.py b/test/test_control_flow.py
@@ -86,6 +86,70 @@ def _fn_make_precompiler(x, v):
     return make_precompiler(_fn_kernel)(x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), v, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)""",
         )
 
+    def test_if_arg_one_element_tensor(self):
+        @helion.kernel
+        def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            output = torch.zeros_like(x)
+
+            for idx in hl.grid(x.shape[0]):
+                # Since `y[idx]` is a one-element tensor, comparing it against 0 will also create a one-element tensor.
+                if y[idx] != 0:
+                    output[idx] = x[idx] * 2
+                if (
+                    y[idx] == 0
+                ):  # TODO(yf225): `else:` raises MLIR error in Triton, so we use a second if.
+                    output[idx] = x[idx]
+
+            return output
+
+        x = torch.tensor([1.0, 2.0, 3.0, 4.0], device=DEVICE)
+        y = torch.tensor([0, 1, 0, 1], device=DEVICE, dtype=torch.int32)
+        expected = torch.tensor([1.0, 4.0, 3.0, 8.0], device=DEVICE)
+        code, result = code_and_output(
+            fn,
+            (x, y),
+        )
+        torch.testing.assert_close(result, expected)
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _fn_kernel(x, y, output, output_stride_0, x_stride_0, y_stride_0):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    indices_0 = offset_0 + tl.zeros([1], tl.int32)
+    load = tl.load(y + indices_0 * y_stride_0, None)
+    v_0 = tl.full([], 0, tl.int32)
+    v_1 = load != v_0
+    if tl.sum(v_1):
+        load_1 = tl.load(x + indices_0 * x_stride_0, None)
+        v_2 = 2.0
+        v_3 = load_1 * v_2
+        tl.store(output + indices_0 * output_stride_0, v_3, None)
+    load_2 = tl.load(y + indices_0 * y_stride_0, None)
+    v_4 = tl.full([], 0, tl.int32)
+    v_5 = load_2 == v_4
+    if tl.sum(v_5):
+        load_3 = tl.load(x + indices_0 * x_stride_0, None)
+        tl.store(output + indices_0 * output_stride_0, load_3, None)
+
+def fn(x: torch.Tensor, y: torch.Tensor):
+    output = torch.zeros_like(x)
+    _fn_kernel[x.size(0),](x, y, output, output.stride(0), x.stride(0), y.stride(0), num_warps=4, num_stages=3)
+    return output
+
+def _fn_make_precompiler(x: torch.Tensor, y: torch.Tensor):
+    output = torch.zeros_like(x)
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_fn_kernel)(x, y, output, output.stride(0), x.stride(0), y.stride(0), num_warps=4, num_stages=3)""",
+        )
+
     def test_constant_true(self):
         @helion.kernel(
             config={
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -6,7 +6,6 @@
 from expecttest import TestCase
 from packaging import version
 import torch
-from torch._environment import is_fbcode
 
 from helion._testing import DEVICE
 from helion._testing import code_and_output
@@ -1627,11 +1626,6 @@ def _jagged_dense_add_2d_make_precompiler(x_data: torch.Tensor, x_offsets: torch
     return make_precompiler(_jagged_dense_add_2d_kernel)(x_offsets, x_data, y, out, out.size(0), out.size(1), x_offsets.size(0), y.size(0), y.size(1), out.stride(0), out.stride(1), x_data.stride(0), x_offsets.stride(0), y.stride(0), y.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=8, num_stages=4)""",
         )
 
-    @unittest.skipIf(
-        "RTX 30" in torch.cuda.get_device_name(0),
-        "Triton internal error on RTX 30XX series",
-    )
-    @unittest.skipIf(is_fbcode(), "Triton internal error on fbcode Triton pin")
     def test_moe_matmul_ogs(self):
         mod = import_path(examples_dir / "moe_matmul_ogs.py")
 
@@ -1670,7 +1664,7 @@ def _moe_matmul_ogs_kernel(expert_token_offsets, expert_token_counts, sorted_to_
     num_tokens = tl.load(expert_token_counts + indices_0 * expert_token_counts_stride_0, None)
     v_0 = tl.full([], 0, tl.int32)
     v_1 = num_tokens != v_0
-    if v_1:
+    if tl.sum(v_1):
         num_tokens_copy = num_tokens
         start_copy = start
         for offset_1 in range(0, max_T_per_expert.to(tl.int32), _BLOCK_SIZE_1):