@@ -200,6 +200,141 @@ def _fn_make_precompiler(x):
    return make_precompiler(_fn_kernel)(x, out, out.size(0), out.size(1), x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)""",
        )

+    def test_error_in_non_taken_branch(self):
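+        # Reference implementation: gradients of relu(x * y[:, None]) computed via autograd.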
+        def mul_relu_block_back_spec(x, y, dz):
+            z = torch.relu(x * y[:, None])
+            grad_x, grad_y = torch.autograd.grad(z, [x, y], dz, retain_graph=True)
+            return grad_x, grad_y
+
+        @helion.kernel(config=helion.Config(block_sizes=[32, 32]))
+        def mul_relu_block_backward_kernel(
+            x: torch.Tensor,
+            y: torch.Tensor,
+            dz: torch.Tensor,
+            use_atomics: hl.constexpr = False,
+        ):
+            # Get tensor sizes
+            m, n = x.shape
+            # Create output tensor for gradients
+            dx = torch.empty_like(x)
+
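+            # use_atomics is an hl.constexpr, so this branch is decided at compile
+            # time and the generated code is specialized for the chosen value.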
+            if use_atomics:
+                dy = torch.zeros_like(y)
+            else:
+                dy = torch.empty_like(x)
+
+            # Use Helion to tile the computation
+            for tile_i, tile_j in hl.tile([m, n]):
+                # Get input tiles
+                x_tile = x[tile_i, tile_j]
+                y_tile = y[tile_i]
+                dz_tile = dz[tile_i, tile_j]
+
+                # For ReLU, gradient is 1 where input > 0, 0 otherwise
+                relu_mask = (x_tile * y_tile[:, None]) > 0
+                # Chain rule: dx = dz * relu_grad * y
+                relu_grad = torch.where(relu_mask, 1, 0)
+                dx[tile_i, tile_j] = dz_tile * relu_grad * y_tile[:, None]
+
+                # Chain rule: dy = dz * relu_grad * x -> backwards of broadcast(sum)
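+                # With atomics, partial row sums are accumulated directly into the
+                # 1-D dy; otherwise the full 2-D contribution is stored and summed
+                # over the last dim after the loop.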
+                if use_atomics:
+                    local_dy_grad = torch.sum(dz_tile * relu_grad * x_tile, dim=1)
+                    hl.atomic_add(dy, [tile_i], local_dy_grad)
+                else:
+                    local_dy_grad = dz_tile * relu_grad * x_tile
+                    dy[tile_i, tile_j] = local_dy_grad
+
+            if use_atomics:
+                return dx, dy
+            return dx, dy.sum(axis=-1)
+
+        x = torch.randn(512, 1024, device="cuda", requires_grad=True)
+        y = torch.randn(512, device="cuda", requires_grad=True)
+        dz = torch.randn(512, 1024, device="cuda")
+        expected = mul_relu_block_back_spec(x, y, dz)
+        torch.testing.assert_close(
+            mul_relu_block_backward_kernel(x, y, dz, False),
+            expected,
+        )
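+        # Compile the use_atomics=True specialization and capture both the
+        # generated Triton source and the kernel output.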
+        code, output = code_and_output(
+            mul_relu_block_backward_kernel,
+            (x, y, dz, True),
+        )
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import helion.language as hl
+import triton
+import triton.language as tl
+
+@triton.jit
+def _mul_relu_block_backward_kernel_kernel(x, y, dz, dx, dy, dx_stride_0, dx_stride_1, dy_stride_0, dz_stride_0, dz_stride_1, x_stride_0, x_stride_1, y_stride_0, m, n, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(m, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < n
+    x_tile = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    y_tile = tl.load(y + indices_0 * y_stride_0, mask_0, other=0)
+    dz_tile = tl.load(dz + (indices_0[:, None] * dz_stride_0 + indices_1[None, :] * dz_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    subscript = y_tile[:, None]
+    v_0 = x_tile * subscript
+    v_1 = 0.0
+    v_2 = v_0 > v_1
+    v_3 = tl.full([], 0, tl.int64)
+    v_4 = tl.full([], 1, tl.int64)
+    v_5 = v_4[None, None]
+    v_6 = v_3[None, None]
+    v_7 = tl.where(v_2, v_5, v_6)
+    v_8 = v_7.to(tl.float32)
+    v_9 = dz_tile * v_8
+    subscript_1 = y_tile[:, None]
+    v_10 = v_9 * subscript_1
+    tl.store(dx + (indices_0[:, None] * dx_stride_0 + indices_1[None, :] * dx_stride_1), v_10, mask_0[:, None] & mask_1[None, :])
+    v_11 = v_7.to(tl.float32)
+    v_12 = dz_tile * v_11
+    v_13 = v_12 * x_tile
+    local_dy_grad = tl.sum(v_13, 1)
+    tl.atomic_add(dy + indices_0 * dy_stride_0, local_dy_grad, mask=mask_0, sem='relaxed')
+
+def mul_relu_block_backward_kernel(x: torch.Tensor, y: torch.Tensor, dz: torch.Tensor, use_atomics: hl.constexpr=False):
+    m, n = x.shape
+    dx = torch.empty_like(x)
+    if True:
+        dy = torch.zeros_like(y)
+    else:
+        dy = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _mul_relu_block_backward_kernel_kernel[triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1),](x, y, dz, dx, dy, dx.stride(0), dx.stride(1), dy.stride(0), dz.stride(0), dz.stride(1), x.stride(0), x.stride(1), y.stride(0), m, n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    if True:
+        return (dx, dy)
+    return (dx, dy.sum(axis=-1))
+
+def _mul_relu_block_backward_kernel_make_precompiler(x: torch.Tensor, y: torch.Tensor, dz: torch.Tensor, use_atomics: hl.constexpr=False):
+    m, n = x.shape
+    dx = torch.empty_like(x)
+    if True:
+        dy = torch.zeros_like(y)
+    else:
+        dy = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_mul_relu_block_backward_kernel_kernel)(x, y, dz, dx, dy, dx.stride(0), dx.stride(1), dy.stride(0), dz.stride(0), dz.stride(1), x.stride(0), x.stride(1), y.stride(0), m, n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)""",
+        )
+        torch.testing.assert_close(
+            output,
+            expected,
+        )
+

if __name__ == "__main__":
    unittest.main()