tensor[tile] when tile size is 1 returns a 1D tensor, instead of a scalar #275

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion examples/template_via_closure.py
@@ -60,7 +60,7 @@ def check(n: int, k: int, m: int) -> None:

def epilogue(acc: torch.Tensor, tile: list[torch.Tensor]) -> torch.Tensor:
# The epilogue can use the captured bias tensor that is implicitly lifted to a kernel arg
-return torch.relu(acc + bias[tile])
+return torch.relu(acc + bias[0, tile[1]])

def kernel_wrapper(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return matmul_with_epilogue(x, y, epilogue)
5 changes: 2 additions & 3 deletions helion/_compiler/indexing_strategy.py
@@ -378,9 +378,8 @@ def create(
assert len(index_values) == fake_value.ndim
index_expr = []
for i, idx in enumerate(index_values):
-if fake_value.size(i) != 1:
-    stride = state.device_function.tensor_stride(fake_value, i).name
-    index_expr.append(f"{idx} * {stride}")
+stride = state.device_function.tensor_stride(fake_value, i).name
+index_expr.append(f"{idx} * {stride}")
Comment on lines -381 to +382

Contributor

What is the reason for this change?

Contributor Author

Something like this:

N = x.size(0)
for tile in hl.tile(N):
    x_tile = x[tile]

When block_size=1, the if statement evaluates to False, so the indexing ignores the N dimension and generates

x_tile = tl.load(tile + tl.zeros([1], ...))

I'll add a test case for this.
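Roughly the shape I have in mind, as a minimal sketch (the `helion.kernel` / `helion.Config(block_sizes=[1])` spelling is borrowed from the existing tests and may need adjusting):

```python
import torch
import helion
import helion.language as hl


# Force a block size of 1 so the size-1 indexing path is exercised.
@helion.kernel(config=helion.Config(block_sizes=[1]))
def copy_1d(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(x.size(0)):
        # Before this fix, block_size=1 dropped the N dimension from the index
        # expression, so the load here ignored the tile offset and read element 0
        # on every iteration.
        out[tile] = x[tile]
    return out


x = torch.randn(8, device="cuda")
torch.testing.assert_close(copy_1d(x), x)
```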

Contributor

Isn't this checking the tensor size, not the block size?

if not index_expr:
shape_str = tile_strategy.shape_str(output_size)
index_expr.append(f"tl.zeros({shape_str}, {dtype})")
18 changes: 13 additions & 5 deletions helion/_compiler/tile_strategy.py
@@ -670,11 +670,19 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
type_comment=None,
)
assert for_node.body is body
-extra_body = [
-    statement_from_string(
-        f"{index_var} = {offset_var} + tl.arange(0, ({block_size_var})).to({dtype})"
-    ),
-]
+extra_body = []
+if block_size == 1:
+    extra_body.append(
+        statement_from_string(
+            f"{index_var} = {offset_var} + tl.zeros([1], {dtype})"
Contributor

Doesn't this do the same thing as arange? I'd expect we would need shape=[] or even just offset_var directly?

Contributor Author

Yes, this does the same thing. We don't need to make this change to fix tile indexing when block_size=1.
However, why does grid_codegen handle block_size == 1 differently, with tl.zeros instead of tl.arange?
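
Just to spell out the equivalence for anyone reading along, a standalone sketch (not part of this PR) checking that the two spellings give the same index when the block size is 1:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def _compare_index_forms(out_ptr):
    offset = tl.program_id(0)
    idx_arange = offset + tl.arange(0, 1).to(tl.int32)  # the tl.arange(0, 1) spelling
    idx_zeros = offset + tl.zeros([1], tl.int32)        # the tl.zeros([1]) spelling
    # The difference should be zero for every program id if the two forms agree.
    tl.store(out_ptr + offset, tl.sum(idx_arange - idx_zeros))


out = torch.zeros(4, dtype=torch.int32, device="cuda")
_compare_index_forms[(4,)](out)
assert int(out.abs().sum().item()) == 0
```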

+        ),
+    )
+else:
+    extra_body.append(
+        statement_from_string(
+            f"{index_var} = {offset_var} + tl.arange(0, ({block_size_var})).to({dtype})"
+        ),
+    )
mask_statement = self._setup_mask( # pyright: ignore[reportAttributeAccessIssue]
state, block_idx, block_size, index_var, end
)
144 changes: 87 additions & 57 deletions test/test_associative_scan.expected

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions test/test_examples.expected
@@ -1384,13 +1384,14 @@ def _matmul_with_epilogue_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: t
load = tl.load(x + (indices_0[:, None] * 1024 + indices_2[None, :] * 1), None)
load_1 = tl.load(y + (indices_2[:, None] * 1024 + indices_1[None, :] * 1), None)
acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
-load_2 = tl.load(epilogue_closure_0 + indices_1[None, :] * 1, None)
-v_0 = load_2.to(tl.float32)
-v_1 = acc + v_0
-v_2 = tl.full([], 0, tl.int32)
-v_3 = triton_helpers.maximum(v_2, v_1)
-v_4 = v_3.to(tl.float16)
-tl.store(out + (indices_0[:, None] * 1024 + indices_1[None, :] * 1), v_4, None)
+load_2 = tl.load(epilogue_closure_0 + (0 * 1024 + indices_1 * 1), None)
+v_0 = load_2[None, :]
+v_1 = v_0.to(tl.float32)
+v_2 = acc + v_1
+v_3 = tl.full([], 0, tl.int32)
+v_4 = triton_helpers.maximum(v_3, v_2)
+v_5 = v_4.to(tl.float16)
+tl.store(out + (indices_0[:, None] * 1024 + indices_1[None, :] * 1), v_5, None)

def matmul_with_epilogue(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]):
m, k = x.size()
@@ -1528,4 +1529,3 @@ def _matmul_with_epilogue_make_precompiler(x: Tensor, y: Tensor, epilogue: Calla
_BLOCK_SIZE_2 = 16
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_matmul_with_epilogue_kernel)(x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)

2 changes: 1 addition & 1 deletion test/test_examples.py
@@ -109,7 +109,7 @@ def test_template_via_closure0(self):
args = (
torch.randn([1024, 1024], device=DEVICE, dtype=torch.float16),
torch.randn([1024, 1024], device=DEVICE, dtype=torch.float16),
-lambda acc, tile: torch.relu(acc + bias[tile]),
+lambda acc, tile: torch.relu(acc + bias[0, tile[1]]),
)
self.assertExpectedJournal(
check_example(
50 changes: 25 additions & 25 deletions test/test_loops.expected
@@ -66,7 +66,7 @@ def _device_loop_3d_kernel(x, out, out_stride_0, out_stride_1, out_stride_2, out
indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
mask_1 = indices_1 < b
for offset_3 in tl.range(0, d.to(tl.int32), step=1):
indices_3 = offset_3 + tl.arange(0, 1).to(tl.int32)
indices_3 = offset_3 + tl.zeros([1], tl.int32)
load = tl.load(x + (indices_0[:, None, None, None] * x_stride_0 + indices_1[None, :, None, None] * x_stride_1 + indices_2[None, None, :, None] * x_stride_2 + indices_3[None, None, None, :] * x_stride_3), mask_0[:, None, None, None] & mask_1[None, :, None, None] & mask_2[None, None, :, None], other=0)
v_0 = tl_math.sin(load)
tl.store(out + (indices_0[:, None, None, None] * out_stride_0 + indices_1[None, :, None, None] * out_stride_1 + indices_2[None, None, :, None] * out_stride_2 + indices_3[None, None, None, :] * out_stride_3), v_0, mask_0[:, None, None, None] & mask_1[None, :, None, None] & mask_2[None, None, :, None])
@@ -197,7 +197,7 @@ def _chebyshev_kernel_kernel(x, w, out, out_stride_0, out_stride_1, w_stride_0,
v_3 = 2.0
v_4 = in_x * v_3
for offset_2 in tl.range(2, 5, step=1):
indices_2 = offset_2 + tl.arange(0, 1).to(tl.int32)
indices_2 = offset_2 + tl.zeros([1], tl.int32)
v_4_copy = v_4
in_x_0_copy = in_x_0
T0_copy = T0
@@ -245,13 +245,13 @@ import triton
import triton.language as tl

@triton.jit
def _fn_kernel(x, end, out, x_size_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
def _fn_kernel(x, end, out, x_size_0, end_stride_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
pid_0 = tl.program_id(0)
offset_1 = pid_0 * _BLOCK_SIZE_1
indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
mask_1 = indices_1 < x_size_0
acc = tl.full([_BLOCK_SIZE_1, _BLOCK_SIZE_0], 0.0, tl.float32)
load = tl.load(end + tl.zeros([], tl.int32), None)
load = tl.load(end + 0 * end_stride_0, None)
for offset_0 in tl.range(0, load.to(tl.int32), step=_BLOCK_SIZE_0):
indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
mask_0 = indices_0 < load
@@ -267,7 +267,7 @@ def fn(x: torch.Tensor, end: torch.Tensor):
bs = 32
_BLOCK_SIZE_1 = 32
_BLOCK_SIZE_0 = 32
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_1),](x, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_1),](x, end, out, x.size(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
return out

def _fn_make_precompiler(x: torch.Tensor, end: torch.Tensor):
@@ -276,7 +276,7 @@ def _fn_make_precompiler(x: torch.Tensor, end: torch.Tensor):
_BLOCK_SIZE_1 = 32
_BLOCK_SIZE_0 = 32
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_fn_kernel)(x, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
return make_precompiler(_fn_kernel)(x, end, out, x.size(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)

--- assertExpectedJournal(TestLoops.test_data_dependent_bounds2)
from __future__ import annotations
@@ -286,13 +286,13 @@ import triton
import triton.language as tl

@triton.jit
def _fn_kernel(x, end, out, out_size_0, x_size_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
def _fn_kernel(x, end, out, out_size_0, x_size_0, end_stride_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
pid_0 = tl.program_id(0)
offset_0 = pid_0 * _BLOCK_SIZE_0
indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
mask_0 = indices_0 < x_size_0
acc = tl.full([_BLOCK_SIZE_0], 0.0, tl.float32)
load = tl.load(end + tl.zeros([], tl.int32), None)
load = tl.load(end + 0 * end_stride_0, None)
for offset_1 in tl.range(0, load.to(tl.int32), step=_BLOCK_SIZE_1):
indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
mask_1 = indices_1 < load
@@ -307,15 +307,15 @@ def fn(x: torch.Tensor, end: torch.Tensor):
out = x.new_empty([x.size(0)])
_BLOCK_SIZE_0 = 32
_BLOCK_SIZE_1 = 32
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, end, out, out.size(0), x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, end, out, out.size(0), x.size(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return out

def _fn_make_precompiler(x: torch.Tensor, end: torch.Tensor):
out = x.new_empty([x.size(0)])
_BLOCK_SIZE_0 = 32
_BLOCK_SIZE_1 = 32
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_fn_kernel)(x, end, out, out.size(0), x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return make_precompiler(_fn_kernel)(x, end, out, out.size(0), x.size(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)

--- assertExpectedJournal(TestLoops.test_data_dependent_bounds3)
from __future__ import annotations
@@ -325,14 +325,14 @@ import triton
import triton.language as tl

@triton.jit
def _fn_kernel(x, end0, end1, out, x_size_0, out_stride_0, x_stride_0, x_stride_1, x_stride_2, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
def _fn_kernel(x, end0, end1, out, x_size_0, end0_stride_0, end1_stride_0, out_stride_0, x_stride_0, x_stride_1, x_stride_2, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
pid_0 = tl.program_id(0)
offset_0 = pid_0 * _BLOCK_SIZE_0
indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
mask_0 = indices_0 < x_size_0
acc = tl.full([_BLOCK_SIZE_0], 0.0, tl.float64)
load = tl.load(end0 + tl.zeros([], tl.int32), None)
load_1 = tl.load(end1 + tl.zeros([], tl.int32), None)
load = tl.load(end0 + 0 * end0_stride_0, None)
load_1 = tl.load(end1 + 0 * end1_stride_0, None)
for offset_1 in tl.range(0, load.to(tl.int32), step=_BLOCK_SIZE_1):
indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
mask_1 = indices_1 < load
@@ -352,7 +352,7 @@ def fn(x: torch.Tensor, end0: torch.Tensor, end1: torch.Tensor):
_BLOCK_SIZE_0 = 32
_BLOCK_SIZE_2 = 32
_BLOCK_SIZE_1 = 32
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, end0, end1, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), x.stride(2), _BLOCK_SIZE_0, _BLOCK_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, end0, end1, out, x.size(0), end0.stride(0), end1.stride(0), out.stride(0), x.stride(0), x.stride(1), x.stride(2), _BLOCK_SIZE_0, _BLOCK_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return out

def _fn_make_precompiler(x: torch.Tensor, end0: torch.Tensor, end1: torch.Tensor):
@@ -361,7 +361,7 @@ def _fn_make_precompiler(x: torch.Tensor, end0: torch.Tensor, end1: torch.Tensor
_BLOCK_SIZE_2 = 32
_BLOCK_SIZE_1 = 32
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_fn_kernel)(x, end0, end1, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), x.stride(2), _BLOCK_SIZE_0, _BLOCK_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return make_precompiler(_fn_kernel)(x, end0, end1, out, x.size(0), end0.stride(0), end1.stride(0), out.stride(0), x.stride(0), x.stride(1), x.stride(2), _BLOCK_SIZE_0, _BLOCK_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=3)

--- assertExpectedJournal(TestLoops.test_data_dependent_bounds4)
from __future__ import annotations
@@ -371,14 +371,14 @@ import triton
import triton.language as tl

@triton.jit
def _fn_kernel(x, begin, end, out, x_size_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
def _fn_kernel(x, begin, end, out, x_size_0, begin_stride_0, end_stride_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.constexpr):
pid_0 = tl.program_id(0)
offset_1 = pid_0 * _BLOCK_SIZE_1
indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
mask_1 = indices_1 < x_size_0
acc = tl.full([_BLOCK_SIZE_1, _BLOCK_SIZE_0], 0.0, tl.float32)
load = tl.load(begin + tl.zeros([], tl.int32), None)
load_1 = tl.load(end + tl.zeros([], tl.int32), None)
load = tl.load(begin + 0 * begin_stride_0, None)
load_1 = tl.load(end + 0 * end_stride_0, None)
for offset_0 in tl.range(load.to(tl.int32), load_1.to(tl.int32), step=_BLOCK_SIZE_0):
indices_0 = offset_0 + tl.arange(0, _BLOCK_SIZE_0).to(tl.int32)
mask_0 = indices_0 < load_1
@@ -394,7 +394,7 @@ def fn(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor):
bs = 32
_BLOCK_SIZE_1 = 32
_BLOCK_SIZE_0 = 32
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_1),](x, begin, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_1),](x, begin, end, out, x.size(0), begin.stride(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
return out

def _fn_make_precompiler(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor):
@@ -403,7 +403,7 @@ def _fn_make_precompiler(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor
_BLOCK_SIZE_1 = 32
_BLOCK_SIZE_0 = 32
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_fn_kernel)(x, begin, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
return make_precompiler(_fn_kernel)(x, begin, end, out, x.size(0), begin.stride(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_1, _BLOCK_SIZE_0, num_warps=4, num_stages=3)

--- assertExpectedJournal(TestLoops.test_data_dependent_bounds5)
from __future__ import annotations
@@ -413,14 +413,14 @@ import triton
import triton.language as tl

@triton.jit
def _fn_kernel(x, begin, end, out, x_size_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
def _fn_kernel(x, begin, end, out, x_size_0, begin_stride_0, end_stride_0, out_stride_0, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
pid_0 = tl.program_id(0)
offset_0 = pid_0 * _BLOCK_SIZE_0
indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
mask_0 = indices_0 < x_size_0
acc = tl.full([_BLOCK_SIZE_0], 0.0, tl.float32)
load = tl.load(begin + tl.zeros([], tl.int32), None)
load_1 = tl.load(end + tl.zeros([], tl.int32), None)
load = tl.load(begin + 0 * begin_stride_0, None)
load_1 = tl.load(end + 0 * end_stride_0, None)
for offset_1 in tl.range(load.to(tl.int32), load_1.to(tl.int32), step=_BLOCK_SIZE_1):
indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
mask_1 = indices_1 < load_1
@@ -435,15 +435,15 @@ def fn(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor):
out = x.new_empty([x.size(0)])
_BLOCK_SIZE_0 = 32
_BLOCK_SIZE_1 = 32
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, begin, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, begin, end, out, x.size(0), begin.stride(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return out

def _fn_make_precompiler(x: torch.Tensor, begin: torch.Tensor, end: torch.Tensor):
out = x.new_empty([x.size(0)])
_BLOCK_SIZE_0 = 32
_BLOCK_SIZE_1 = 32
from helion.runtime.precompile_shim import make_precompiler
return make_precompiler(_fn_kernel)(x, begin, end, out, x.size(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
return make_precompiler(_fn_kernel)(x, begin, end, out, x.size(0), begin.stride(0), end.stride(0), out.stride(0), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)

--- assertExpectedJournal(TestLoops.test_l2_grouping_with_register_block_size)
from __future__ import annotations