Support hl.tile_{begin,end,block_size} (#150)

jansel · web-flow · commit 8efc252b3916 · 2025-06-11T15:24:32.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -85,3 +85,4 @@ venv
 .watchman
 .watchmanconfig
 *.zip
+CLAUDE.md
diff --git a/helion/_compiler/compile_environment.py b/helion/_compiler/compile_environment.py
@@ -169,6 +169,14 @@ def create_block_var(self, debug_name: str, hint: int = 64) -> torch.SymInt:
         self.debug_shape_renames[sym._sympy_()] = sympy.Symbol(debug_name, integer=True)
         return sym
 
+    def create_unbacked_symint(self, hint: int = 8192) -> torch.SymInt:
+        with self.shape_env.ignore_fresh_unbacked_symbols():
+            sym = self.shape_env.create_unbacked_symint()
+            # TODO(jansel): this is a hack to get us past some == 1 checks
+            #               we should probably have a better way to handle this
+            self.shape_env.var_to_val[sym._sympy_()] = sympy.sympify(hint)
+            return sym
+
     def to_fake(self, obj: object, origin: Origin) -> object:
         if isinstance(obj, torch.Tensor):
             return self._to_fake_tensor(obj, origin.to_source())
@@ -177,12 +185,7 @@ def to_fake(self, obj: object, origin: Origin) -> object:
                 with self.shape_env.ignore_fresh_unbacked_symbols():
                     return self.shape_env.create_unbacked_symbool()
             if isinstance(obj, int):
-                with self.shape_env.ignore_fresh_unbacked_symbols():
-                    sym = self.shape_env.create_unbacked_symint()
-                    # TODO(jansel): this is a hack to get us past some == 1 checks
-                    #               we should probably have a better way to handle this
-                    self.shape_env.var_to_val[sym._sympy_()] = sympy.sympify(8192)
-                    return sym
+                return self.create_unbacked_symint()
             if isinstance(obj, float):
                 with self.shape_env.ignore_fresh_unbacked_symbols():
                     return self.shape_env.create_unbacked_symfloat()
diff --git a/helion/_compiler/device_ir.py b/helion/_compiler/device_ir.py
@@ -49,6 +49,8 @@
 from .type_propagation import CallableType
 from .type_propagation import GridIndexType
 from .type_propagation import IterType
+from .type_propagation import LiteralType
+from .type_propagation import NumericType
 from .type_propagation import SequenceType
 from .type_propagation import TensorType
 from .type_propagation import TileIndexType
@@ -739,7 +741,9 @@ def visit_Assign(self, node: ast.Assign) -> None:
         rhs_type = node.value._type_info
         assert isinstance(target, ExtendedAST)
         lhs_type = target._type_info
-        if not isinstance(lhs_type, TensorType) or not isinstance(rhs_type, TensorType):
+        if not isinstance(lhs_type, TensorType) or not isinstance(
+            rhs_type, (TensorType, NumericType, LiteralType)
+        ):
             raise exc.NonTensorSubscriptAssign(lhs_type, rhs_type)
         assert isinstance(target.value, ExtendedAST)
         target_origin = target.value._type_info.origin
diff --git a/helion/_compiler/generate_ast.py b/helion/_compiler/generate_ast.py
@@ -63,15 +63,15 @@ def add_statement(self, stmt: ast.AST | str | None) -> None:
             stmt = statement_from_string(stmt)
         self.statements_stack[-1].append(stmt)
 
-    def tmpvar(self, dce: bool = False) -> str:
-        return self.device_function.unique_name("v", dce=dce)
+    def tmpvar(self, *, dce: bool = False, prefix: str = "v") -> str:
+        return self.device_function.unique_name(prefix, dce=dce)
 
-    def lift(self, expr: ast.AST, dce: bool = False) -> ast.Name:
+    def lift(self, expr: ast.AST, *, dce: bool = False, prefix: str = "v") -> ast.Name:
         if isinstance(expr, ast.Name):
             return expr
         assert isinstance(expr, ExtendedAST), expr
         with expr:
-            varname = self.tmpvar(dce=dce)
+            varname = self.tmpvar(dce=dce, prefix=prefix)
             self.add_statement(statement_from_string(f"{varname} = expr", expr=expr))
             return create(ast.Name, id=varname, ctx=ast.Load())
 
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
@@ -306,7 +306,7 @@ def create(
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
                 assert len(ast_index) == len(index)
-                index_var = state.codegen.lift(ast_index[n]).id
+                index_var = state.codegen.lift(ast_index[n], prefix="index").id
                 index_values.append(f"({index_var}){expand}")
                 if (
                     block_idx := TileStrategy.get_block_index(output_size[output_idx])
@@ -321,7 +321,7 @@ def create(
                 ast_index = state.ast_args[1]
                 assert isinstance(ast_index, (list, tuple))
                 assert len(ast_index) == 1
-                index_var = state.codegen.lift(ast_index[0]).id
+                index_var = state.codegen.lift(ast_index[0], prefix="index").id
                 index_values.append(index_var)
                 output_idx += k.ndim
                 for n, s in enumerate(output_size):
diff --git a/helion/_compiler/reduction_strategy.py b/helion/_compiler/reduction_strategy.py
@@ -176,7 +176,13 @@ def codegen_preamble(self, state: CodegenState) -> None:
             state.add_statement(
                 f"{mask_var} = {index_var} < {self.fn.sympy_expr(numel)}"
             )
-        state.codegen.set_active_loops(PersistentReductionState(self))
+        # Build end_var_name: map this block index to the string form of its numel
+        env = CompileEnvironment.current()
+        numel = env.block_sizes[self.block_index].numel
+        end_var_name = {self.block_index: self.fn.sympy_expr(numel)}
+        state.codegen.set_active_loops(
+            PersistentReductionState(self, end_var_name=end_var_name)
+        )
 
     def codegen_reduction(
         self,
@@ -254,11 +260,14 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
             orelse=[],
             type_comment=None,
         )
+        # Build end_var_name: map block_index to the string form of the numel used in range()
+        end_var_name = {block_index: device_function.sympy_expr(numel)}
         return DeviceLoopState(
             self,
             for_node=for_node,
             inner_statements=body,
             end_bounds={block_index: numel},
+            end_var_name=end_var_name,
         )
 
     def codegen_reduction(
diff --git a/helion/_compiler/tile_dispatch.py b/helion/_compiler/tile_dispatch.py
@@ -13,7 +13,6 @@
 from helion._compiler.reduction_strategy import PersistentReductionStrategy
 from helion._compiler.reduction_strategy import ReductionStrategy
 from helion._compiler.tile_strategy import CompactedShape
-from helion._compiler.tile_strategy import DeviceGridState
 from helion._compiler.tile_strategy import DeviceLoopState
 from helion._compiler.tile_strategy import FlattenedTileStrategy
 from helion._compiler.tile_strategy import NDGridTileStrategy
@@ -111,11 +110,11 @@ def _add_reduction_strategies(self, fn: DeviceFunction, config: Config) -> None:
 
     def codegen_grid(self, state: CodegenState, block_ids: list[int]) -> None:
         strategy = self.block_id_to_strategy[tuple(block_ids)]
-        strategy.codegen_grid(state)
+        grid_state = strategy.codegen_grid(state)
         for other_strategy in self.strategies:
             if other_strategy is not strategy:
                 other_strategy.codegen_preamble(state)
-        state.codegen.set_active_loops(DeviceGridState(strategy))
+        state.codegen.set_active_loops(grid_state)
 
     def codegen_device_loop(
         self, state: CodegenState, block_ids: list[int]
diff --git a/helion/_compiler/tile_index_proxy.py b/helion/_compiler/tile_index_proxy.py
@@ -91,6 +91,33 @@ def index(self) -> torch.Tensor:
 
         return tile_index(self)
 
+    @property
+    def begin(self) -> int:
+        """
+        Alias for hl.tile_begin, which retrieves the start offset of a tile.
+        """
+        from ..language.tiles import tile_begin
+
+        return tile_begin(self)
+
+    @property
+    def end(self) -> int:
+        """
+        Alias for hl.tile_end, which retrieves the end offset of a tile.
+        """
+        from ..language.tiles import tile_end
+
+        return tile_end(self)
+
+    @property
+    def block_size(self) -> int:
+        """
+        Alias for hl.tile_block_size, which retrieves the block_size of a tile.
+        """
+        from ..language.tiles import tile_block_size
+
+        return tile_block_size(self)
+
 
 class CheckForIndexCalls:
     """
diff --git a/helion/_compiler/tile_strategy.py b/helion/_compiler/tile_strategy.py
@@ -42,6 +42,7 @@
 @dataclasses.dataclass
 class DeviceLoopOrGridState:
     strategy: TileStrategy
+    end_var_name: dict[int, str]
 
     @property
     def block_ids(self) -> list[int]:
@@ -106,7 +107,7 @@ def block_size_var(self, block_idx: int) -> str | None:
     def user_size(self, block_index: int) -> sympy.Expr:
         raise NotImplementedError
 
-    def codegen_grid(self, state: CodegenState) -> None:
+    def codegen_grid(self, state: CodegenState) -> DeviceGridState:
         raise NotImplementedError
 
     def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
@@ -255,7 +256,7 @@ def _codegen_common(
             )
         return block_size_var, offsets_var, total_numel, statements
 
-    def codegen_grid(self, state: CodegenState) -> None:
+    def codegen_grid(self, state: CodegenState) -> DeviceGridState:
         block_size_var, offsets_var, total_numel, statements = self._codegen_common(
             state
         )
@@ -273,6 +274,12 @@ def codegen_grid(self) -> ast.AST:
 
         state.device_function.set_pid(TmpPid())
 
+        end_var_name = {}
+        for block_id in self.block_ids:
+            end_bound = CompileEnvironment.current().block_sizes[block_id].numel
+            end_var_name[block_id] = state.device_function.sympy_expr(end_bound)
+        return DeviceGridState(self, end_var_name=end_var_name)
+
     def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
         block_size_var, offsets_var, total_numel, statements = self._codegen_common(
             state
@@ -301,6 +308,7 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
             for_node=for_node,
             inner_statements=body,
             end_bounds=self.get_end_bounds(state),
+            end_var_name={},
         )
 
     @classmethod
@@ -361,7 +369,7 @@ def __init__(
                     f"_BLOCK_SIZE_{block_idx}"
                 )
 
-    def codegen_grid(self, state: CodegenState) -> None:
+    def codegen_grid(self, state: CodegenState) -> DeviceGridState:
         block_ids = self.block_ids
         env = CompileEnvironment.current()
         device_function = state.device_function
@@ -417,6 +425,13 @@ def codegen_grid(self, state: CodegenState) -> None:
         else:
             state.device_function.set_pid(pids)
 
+        # Build end_var_name: map each block id to the string form of its end bound (numel)
+        end_var_name = {}
+        for block_id in self.block_ids:
+            end_bound = CompileEnvironment.current().block_sizes[block_id].numel
+            end_var_name[block_id] = state.device_function.sympy_expr(end_bound)
+        return DeviceGridState(self, end_var_name=end_var_name)
+
     def select_pid_strategy(self) -> ProgramIDs:
         if 1 < len(self.block_ids) <= 3 and self.fn.config.use_yz_grid:
             return GridProgramIDs()
@@ -447,6 +462,7 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
         _, begins, ends, _ = state.ast_args
         assert isinstance(begins, list)
         assert isinstance(ends, list)
+        end_var_name = {}
         for block_idx, block_size, begin, end in self._reorder(
             [*zip(block_ids, block_sizes, begins, ends, strict=True)]
         ):
@@ -463,6 +479,9 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
                     )
             else:
                 block_size_var = "1"
+            end_var_name[block_idx] = state.codegen.lift(
+                self._to_ast(end, to_dtype=dtype), dce=True, prefix="end"
+            ).id
             for_node = create(
                 ast.For,
                 target=create(ast.Name, id=offset_var, ctx=ast.Store()),
@@ -494,6 +513,7 @@ def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:
             for_node=for_node,
             inner_statements=innermost_body,
             end_bounds=self.get_end_bounds(state),
+            end_var_name=end_var_name,
         )
 
     def compact_shape(self, shapes: list[CompactedShape]) -> list[CompactedShape]:
diff --git a/helion/_compiler/type_propagation.py b/helion/_compiler/type_propagation.py
@@ -503,6 +503,9 @@ def propagate_setitem(
                         rhs_rank,
                         f"LHS shape: {tuple(lhs_shape)}, RHS shape: {tuple(value.fake_value.shape)}",
                     )
+            elif isinstance(value, (NumericType, LiteralType)):
+                # Allow scalar assignment to tensor (broadcasts to tensor shape)
+                pass
             elif isinstance(value, UnknownType):
                 raise exc.TypePropagationError(value)
             else:
diff --git a/helion/language/__init__.py b/helion/language/__init__.py
@@ -13,5 +13,8 @@
 from .memory_ops import atomic_add as atomic_add
 from .memory_ops import load as load
 from .memory_ops import store as store
+from .tiles import tile_begin as tile_begin
+from .tiles import tile_block_size as tile_block_size
+from .tiles import tile_end as tile_end
 from .tiles import tile_index as tile_index
 from .view_ops import subscript as subscript
diff --git a/helion/language/_tracing_ops.py b/helion/language/_tracing_ops.py
@@ -44,7 +44,9 @@ def _(state: CodegenState) -> ast.AST:
             # this should be unused
             return expr_from_string("block_size_var_optimized_away")
     return state.codegen.lift(
-        expr_from_string(state.device_function.sympy_expr(val._sympy_())), dce=True
+        expr_from_string(state.device_function.sympy_expr(val._sympy_())),
+        dce=True,
+        prefix="symnode",
     )
 
 
diff --git a/helion/language/loops.py b/helion/language/loops.py
@@ -222,20 +222,26 @@ def _(
                     size
                 )
 
-    _add_config_choices([x.block_id for x in results], is_tile=True)
+    _add_config_choices(
+        [x.block_id for x in results],
+        is_tile=True,
+        has_begin=not all((isinstance(x, int) and x == 0) for x in proxy_begin),
+    )
     if unpack:
         (result,) = results
     else:
         result = SequenceType(origin, results)
     return IterType(origin, result)
 
 
-def _add_config_choices(block_ids: list[int], *, is_tile: bool = False) -> None:
+def _add_config_choices(
+    block_ids: list[int], *, is_tile: bool = False, has_begin: bool = False
+) -> None:
     config_spec = CompileEnvironment.current().config_spec
     if len(block_ids) > 1:
         # Add loop reordering choice
         config_spec.loop_orders.append(LoopOrderSpec(block_ids))
-        if is_tile:
+        if is_tile and not has_begin:
             config_spec.flatten_loops.append(FlattenLoopSpec(block_ids))
 
     if all(x._loop_type != LoopType.GRID for x in ExtendedAST.current()):  # is_grid
diff --git a/helion/language/memory_ops.py b/helion/language/memory_ops.py
@@ -21,7 +21,7 @@
 def store(
     tensor: torch.Tensor,
     index: list[object],
-    value: torch.Tensor,
+    value: torch.Tensor | torch.SymInt | float,
     extra_mask: torch.Tensor | None = None,
 ) -> None:
     """Store a value from to tensor using a list of indices.
@@ -41,12 +41,12 @@ def store(
 def _(
     tensor: torch.Tensor,
     index: list[object],
-    value: torch.Tensor,
+    value: torch.Tensor | torch.SymInt | float,
     extra_mask: torch.Tensor | None = None,
-) -> tuple[torch.Tensor, list[object], torch.Tensor]:
+) -> tuple[torch.Tensor, list[object], torch.Tensor | torch.SymInt | int | float]:
     from helion._compiler.tile_index_proxy import TileIndexProxy
 
-    if value.dtype != tensor.dtype:
+    if hasattr(value, "dtype") and value.dtype != tensor.dtype:
         value = value.to(tensor.dtype)
     index = TileIndexProxy.tiles_to_sizes(index)
     return (tensor, index, value, extra_mask)
@@ -56,7 +56,7 @@ def _(
 def _(
     tensor: torch.Tensor,
     index: list[object],
-    value: torch.Tensor,
+    value: torch.Tensor | torch.SymInt | float,
     extra_mask: torch.Tensor | None = None,
 ) -> None:
     return None
diff --git a/helion/language/tiles.py b/helion/language/tiles.py
diff --git a/test/test_indexing.py b/test/test_indexing.py
diff --git a/test/test_matmul.py b/test/test_matmul.py