Commit f7ed720

Add hl.register_reduction_dim(); add support for matmul+layernorm example (#80)

1 parent de9ab5c commit f7ed720
File tree

12 files changed: +798 −28 lines changed

examples/matmul_layernorm.py

Lines changed: 79 additions & 0 deletions

@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import torch
+
+import helion
+import helion.language as hl
+
+
+# static_shapes=True gives a performance boost for matmuls
+@helion.kernel(static_shapes=True)
+def matmul_layernorm(
+    x: torch.Tensor, y: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    m, k = x.size()
+    k2 = y.size(0)
+    n = hl.register_reduction_dim(y.size(1))
+    assert k == k2, f"size mismatch {k} != {k2}"
+    assert weight.size(0) == n, f"weight size mismatch {weight.size(0)} != {n}"
+    assert bias.size(0) == n, f"bias size mismatch {bias.size(0)} != {n}"
+    out = torch.empty(
+        [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
+    )
+    for tile_m in hl.tile(m):
+        acc = hl.zeros([tile_m, n], dtype=torch.float32)
+        for tile_k in hl.tile(k):
+            mm = torch.matmul(x[tile_m, tile_k], y[tile_k, :])
+            acc = acc + mm
+        eps = 1e-5
+        var, mean = torch.var_mean(acc, dim=-1, keepdim=True, correction=0)
+        normalized = (acc - mean) * torch.rsqrt(var + eps)
+        acc = normalized * (weight[:].to(torch.float32)) + (bias[:].to(torch.float32))
+        out[tile_m, :] = acc
+    return out
+
+
+def matmul_layernorm_pytorch(
+    x: torch.Tensor, y: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    import torch.nn.functional as F
+
+    matmul_out = torch.matmul(x, y)
+
+    ln_out = F.layer_norm(
+        matmul_out.to(torch.float32),
+        normalized_shape=(matmul_out.shape[-1],),
+        weight=weight.to(torch.float32),
+        bias=bias.to(torch.float32),
+    )
+
+    return ln_out.to(torch.promote_types(x.dtype, y.dtype))
+
+
+def check(m: int, k: int, n: int) -> None:
+    from triton.testing import do_bench
+
+    x = torch.randn([m, k], device="cuda", dtype=torch.float16)
+    y = torch.randn([k, n], device="cuda", dtype=torch.float16)
+    weight = torch.randn([n], device="cuda", dtype=torch.float16)
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    result = matmul_layernorm(x, y, weight, bias)
+    expected = matmul_layernorm_pytorch(x, y, weight, bias)
+    torch.testing.assert_close(result, expected, rtol=1e-2, atol=1e-1)
+    sec = do_bench(lambda: matmul_layernorm(x, y, weight, bias))
+    baseline_sec = do_bench(lambda: matmul_layernorm_pytorch(x, y, weight, bias))
+    print(
+        f"Helion time: {sec:.4f}ms, torch time: {baseline_sec:.4f}ms, speedup: {baseline_sec / sec:.2f}x"
+    )
+
+
+def main() -> None:
+    # TODO(yf225): n=64 or 128 throws error, need to investigate
+    # check(32, 64, 64)
+    # check(32, 64, 128)
+    check(32, 64, 200)
+    check(128, 256, 400)
+
+
+if __name__ == "__main__":
+    main()
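
A note on the normalization math in the kernel above: with correction=0, torch.var_mean returns the population variance, which is what F.layer_norm uses, so the manual (acc - mean) * rsqrt(var + eps) sequence matches the PyTorch reference exactly. A minimal CPU-only sketch (independent of Helion; shapes made up) confirming the identity:

import torch
import torch.nn.functional as F

acc = torch.randn(32, 200)
weight = torch.randn(200)
bias = torch.randn(200)
eps = 1e-5

# Same sequence as the kernel body: population variance, rsqrt, affine.
var, mean = torch.var_mean(acc, dim=-1, keepdim=True, correction=0)
manual = (acc - mean) * torch.rsqrt(var + eps) * weight + bias
reference = F.layer_norm(acc, normalized_shape=(200,), weight=weight, bias=bias, eps=eps)
torch.testing.assert_close(manual, reference, rtol=1e-4, atol=1e-5)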

helion/_compiler/compile_environment.py

Lines changed: 15 additions & 0 deletions

@@ -121,9 +121,24 @@ def allocate_block_size(
         return idx
 
     def allocate_reduction_dimension(self, size: torch.SymInt | int) -> BlockSizeInfo:
+        # Check if this size is already a registered block size
+        if isinstance(size, torch.SymInt):
+            from .host_function import HostFunction
+
+            expr = size._sympy_()
+            origin_info = HostFunction.current().expr_to_origin.get(expr)
+            if origin_info and isinstance(origin_info.origin, BlockSizeOrigin):
+                block_idx = origin_info.origin.block_id
+                # Return the existing block size if it's a reduction dimension
+                if self.block_sizes[block_idx].reduction:
+                    return self.block_sizes[block_idx]
+
+        # Check for existing reduction dimensions with the same size
         for rdim in self.block_sizes:
             if rdim.reduction and rdim.size == size:
                 return rdim
+
+        # Allocate a new reduction dimension
         rdim_idx = self.allocate_block_size(
             size,
             reduction=True,
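
The logic above boils down to: reuse the block if the SymInt already originates from a reduction block, else reuse any existing reduction dimension of the same size, else allocate a new one. A toy model of that reuse behavior (plain Python; BlockInfo and Env are hypothetical stand-ins, not Helion's real classes):

from dataclasses import dataclass

@dataclass
class BlockInfo:  # hypothetical stand-in for BlockSizeInfo
    size: int
    reduction: bool

class Env:
    def __init__(self) -> None:
        self.block_sizes: list[BlockInfo] = []

    def allocate_reduction_dimension(self, size: int) -> BlockInfo:
        # Reuse an existing reduction dimension of the same size.
        for rdim in self.block_sizes:
            if rdim.reduction and rdim.size == size:
                return rdim
        # Otherwise allocate a fresh one.
        info = BlockInfo(size, reduction=True)
        self.block_sizes.append(info)
        return info

env = Env()
assert env.allocate_reduction_dimension(200) is env.allocate_reduction_dimension(200)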

helion/_compiler/device_ir.py

Lines changed: 28 additions & 0 deletions

@@ -313,6 +313,21 @@ def build_rolled_reductions(self) -> None:
         for rdim in rdims:
             graph_to_info = {}
             allow_loop = False
+
+            # First, check if any graph contains matmul with rdim
+            # If so, we can't roll any graphs in this reduction dimension
+            can_roll_graphs = True
+            for graph_info in self.graphs:
+                roller = ReductionRoller(self, rdim, {})
+                if roller.has_matmul_with_rdim(graph_info.graph):
+                    can_roll_graphs = False
+                    break
+
+            if not can_roll_graphs:
+                first = False
+                continue
+
+            # Process graphs normally
             for graph_id, graph_info in enumerate([*self.graphs]):
                 assert graph_id == graph_info.graph_id
                 roller = ReductionRoller(self, rdim, graph_to_info)

@@ -705,6 +720,19 @@ def visit_Assign(self, node: ast.Assign) -> None:
             # TODO(jansel): should assert that name is only used on device
             self._assign(target, self.visit(node.value))
             return None
+        if isinstance(target, ast.Tuple):
+            # Handle tuple unpacking
+            value = self.visit(node.value)
+            if not isinstance(value, tuple):
+                raise exc.InvalidAssignment
+            if len(target.elts) != len(value):
+                raise exc.InvalidAssignment
+            for t, v in zip(target.elts, value, strict=True):
+                if isinstance(t, ast.Name):
+                    self._assign(t, v)
+                else:
+                    raise exc.InvalidAssignment
+            return None
         if not isinstance(target, ast.Subscript):
             raise exc.InvalidAssignment
         assert isinstance(node.value, ExtendedAST)
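
The new branch handles statements like `var, mean = torch.var_mean(...)` inside a traced kernel. A standalone stdlib-ast sketch (outside Helion's tracer) of what such an assignment target looks like:

import ast

tree = ast.parse("var, mean = torch.var_mean(acc, dim=-1)")
assign = tree.body[0]
assert isinstance(assign, ast.Assign)
(target,) = assign.targets
assert isinstance(target, ast.Tuple)  # tuple-unpacking target
print([elt.id for elt in target.elts])  # ['var', 'mean']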

helion/_compiler/inductor_lowering.py

Lines changed: 118 additions & 22 deletions
@@ -6,6 +6,7 @@
 import functools
 from operator import getitem
 from typing import TYPE_CHECKING
+from typing import Callable
 from typing import ContextManager
 from typing import NamedTuple

@@ -148,21 +149,33 @@ def convert_arg(arg: Node) -> TensorBox:
         # pyre-ignore[6]
         *map_arg((node.args, node.kwargs), convert_arg),
     )
-    result.realize()
-    if not isinstance(result, TensorBox) or not isinstance(result.data, StorageBox):
-        raise InductorLoweringError(
-            f"Lowering {node.target} returned type(result), expected TensorBox(StorageBox(...)): {result}"
-        )
-    if not isinstance(buffer := result.data.data, ComputedBuffer):
-        raise InductorLoweringError(
-            f"Lowering {node.target} returned buffer type {type(buffer)}, expected ComputedBuffer: {buffer}"
-        )
+    if not isinstance(result, tuple):
+        result = (result,)
+    buffer_name_to_output_index = {}
+    for i, r in enumerate(result):
+        r.realize()
+        if not isinstance(r, TensorBox) or not isinstance(r.data, StorageBox):
+            raise InductorLoweringError(
+                f"Lowering {node.target} returned {type(r)}, expected TensorBox(StorageBox(...)): {r}"
+            )
+        if not isinstance(buffer := r.data.data, ComputedBuffer):
+            raise InductorLoweringError(
+                f"Lowering {node.target} returned buffer type {type(buffer)}, expected ComputedBuffer: {buffer}"
+            )
+        buffer_name_to_output_index[buffer.get_name()] = i
 
     new_buffers = graph_lowering.buffers[prior_buffers:]
-    assert new_buffers[-1] is buffer
+    assert buffer in new_buffers  # pyre-ignore[61]
     nodes = []
     extra_input_names = []
     new_node: torch.fx.Node
+
+    # Explicitly track the mapping from node to Inductor buffer name.
+    # First, map the original input nodes to their names.
+    node_to_buf_name_mapping: dict[torch.fx.Node, str] = dict(
+        zip(node._input_nodes, input_names, strict=True)
+    )
+
     for i, buffer in enumerate(new_buffers):
         if not isinstance(buffer, ComputedBuffer) or not isinstance(
             buffer.data, (Pointwise, Reduction)
@@ -176,29 +189,49 @@ def convert_arg(arg: Node) -> TensorBox:
             new_node.kwargs = {**new_node.kwargs, "_extra_args": [*nodes]}
         else:
             new_node = create_extra_node(node, buffer, [*node._input_nodes, *nodes])
+
+        # Store output index if this buffer corresponds to an output
+        if buffer.get_name() in buffer_name_to_output_index:
+            new_node.meta["output_index"] = buffer_name_to_output_index[
+                buffer.get_name()
+            ]
+
         lowering_cls = (
             PointwiseLowering
             if isinstance(buffer.data, Pointwise)
             else ReductionLowering
         )
         buffer.freeze_layout()
+
+        current_input_nodes = new_node._input_nodes
+        current_input_names = []
+        for inp_node in current_input_nodes:
+            current_input_names.append(node_to_buf_name_mapping[inp_node])
+
         used_input_names = strip_unused_inputs(
             new_node,
             buffer.get_read_names(),
-            dict(
-                zip(
-                    node.all_input_nodes,
-                    [*input_names, *extra_input_names],
-                    strict=True,
-                )
-            ),
+            dict(zip(current_input_nodes, current_input_names, strict=True)),
         )
         new_node.meta["lowering"] = lowering = lowering_cls(buffer, used_input_names)
+        new_node.meta["orig_node"] = node
         if isinstance(lowering, ReductionLowering):
             lowering.add_input_mask(new_node)
         nodes.append(new_node)
         extra_input_names.append(buffer.get_name())
+
+        # Add this node to our mapping for future nodes to reference
+        node_to_buf_name_mapping[new_node] = buffer.get_name()
+
+    # After all nodes are created, build the output_nodes mapping for multi-output operations
+    if len(result) > 1 and nodes:
+        last_node = nodes[-1]  # The last node is the main node
+        output_nodes = {}
+        for n in nodes:
+            if "output_index" in n.meta:
+                output_nodes[n.meta["output_index"]] = n.name
+        last_node.meta["output_nodes"] = output_nodes
+
 
 def strip_unused_inputs(
     node: torch.fx.Node,
447480
strategy = BlockReductionStrategy(state, self.block_index)
448481

449482
inputs = self.input_fake_tensors(node)
450-
if len(inputs) != 1:
451-
# TODO(jansel): combine multiple inputs into a single fake value
452-
raise NotImplementedError("reductions with >1 input")
483+
484+
repr_input = None
485+
if len(inputs) == 1:
486+
repr_input = inputs[0]
487+
else:
488+
if node.meta["orig_node"].target == torch.ops.aten.var_mean.correction:
489+
assert len(inputs) == 2
490+
# `inputs[0]` is the original input tensor to var_mean
491+
repr_input = inputs[0]
492+
else:
493+
# TODO(jansel): combine multiple inputs into a single fake value
494+
raise NotImplementedError("reductions with >1 input")
453495

454496
# TODO(jansel): find a better way to get dim
455497
(dim,) = [
456498
i
457-
for i, v in enumerate(inputs[0].shape)
499+
for i, v in enumerate(repr_input.shape)
458500
if TileStrategy.get_block_index(v) == self.block_index
459501
]
460502

@@ -463,7 +505,7 @@ def codegen(self, ctx: GraphInterpreter, node: torch.fx.Node) -> object:
             output_name,
             reduction.reduction_type,
             dim,
-            inputs[0],
+            repr_input,
             node.meta["val"],
         )
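
var_mean is the motivating case for this repr_input handling and for the multi-output plumbing later in this file: a single FX node yields two results that downstream code reads through getitem. A standalone torch.fx sketch (plain PyTorch, no Helion) showing that shape:

import operator

import torch
import torch.fx

def f(x: torch.Tensor) -> torch.Tensor:
    out = torch.var_mean(x, dim=-1, keepdim=True, correction=0)
    var, mean = out[0], out[1]  # explicit indexing keeps FX tracing happy
    return (x - mean) * torch.rsqrt(var + 1e-5)

gm = torch.fx.symbolic_trace(f)
for node in gm.graph.nodes:
    if node.target is operator.getitem:
        print(node.name, "<- output", node.args[1], "of", node.args[0].name)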

@@ -806,6 +848,14 @@ def index_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> str:
         name = self.cg.lift(
             expr_from_string(self.cg.device_function.user_sympy_expr(expr))
         ).id
+
+        # If the lifted symbol refers to a `tl.constexpr` kernel
+        # argument (for example a tile/block size constant such as
+        # `_BLOCK_SIZE_1`) the resulting Triton value is not a tensor
+        # and therefore does not expose a `.to` method.
+        if name in self.cg.device_function._constexpr_args:
+            return name
+
         return f"{name}.to({triton_type(dtype)})"
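
Background for the early return above: in Triton, a tl.constexpr kernel argument is a compile-time Python value rather than a tensor, so it can be used in arithmetic but has no .to() method. A minimal hypothetical kernel (names invented) illustrating the distinction:

import triton
import triton.language as tl

@triton.jit
def scale_kernel(out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # offs is a tensor and supports .to(); BLOCK is a plain constant and does not.
    vals = offs.to(tl.float32) / BLOCK
    tl.store(out_ptr + offs, vals, mask=offs < n)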

@@ -821,11 +871,57 @@ def __init__(self, graph: torch.fx.Graph, cg: GenerateAST) -> None:
         super().__init__(_LazyGraphModule({}, graph), garbage_collect_values=False)
         self.cg = cg
 
+    def _collect_multi_outputs(
+        self, node: Node, last_node_result: object
+    ) -> tuple[object, ...]:
+        """
+        Collect outputs for multi-output operations using metadata.
+        """
+        # Check if this operation has multiple outputs using the new metadata
+        assert "output_nodes" in node.meta
+        output_nodes = node.meta["output_nodes"]
+        outputs = [None] * len(output_nodes)
+        all_nodes = {n.name: n for n in self.module.graph.nodes}  # pyre-ignore[16]
+
+        for idx, node_name in output_nodes.items():
+            if node_name == node.name:
+                # This is the last node
+                outputs[idx] = last_node_result  # pyre-ignore[6]
+            else:
+                # This is an extra node - get its result from env
+                if node_name in all_nodes:
+                    extra_node = all_nodes[node_name]
+                    if extra_node in self.env:
+                        outputs[idx] = self.env[extra_node]
+
+        # Ensure all outputs are found and are ast.Name nodes
+        final_outputs = []
+        for i, result in enumerate(outputs):
+            assert result is not None
+            if not isinstance(result, ast.Name):
+                var_name = self.cg.device_function.new_var(f"{node.name}_output{i}")
+                self.cg.add_statement(
+                    statement_from_string(f"{var_name} = result", result=result)
+                )
+                result = create(ast.Name, id=var_name, ctx=ast.Load())
+            final_outputs.append(result)
+
+        return tuple(final_outputs)
+
     def run_node(self, n: Node) -> object:
         if n.op == "call_function":
             with self._set_current_node(n), n.meta["location"]:
                 lowering: Lowering = n.meta["lowering"]
                 result = lowering.codegen(self, n)
+                n.meta["codegen"] = result
+
+                # Generic handling for operations with multiple outputs
+                if n.kwargs.get("_extra_args"):
+                    # Check if this node has getitem users, indicating multiple outputs
+                    getitem_users = [user for user in n.users if user.target == getitem]
+                    if len(getitem_users) > 0:
+                        return self._collect_multi_outputs(n, result)
+
                 if result is None:
                     return None
                 if not isinstance(result, ast.AST):
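
Putting the pieces together: codegen_call_with_graph stores an {output_index: node_name} map on the last generated node, and _collect_multi_outputs resolves each index back to a codegen result. A toy dict-level walkthrough (all names hypothetical, not real Helion output):

env = {"var_extra": "var_ast", "var_mean": "mean_ast"}  # node name -> codegen result
output_nodes = {0: "var_extra", 1: "var_mean"}          # output index -> node name

outputs = tuple(env[output_nodes[i]] for i in sorted(output_nodes))
assert outputs == ("var_ast", "mean_ast")  # same order the getitem users expect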
