Skip to content

Commit e8cf178

Browse files
authored
Add host side dead code elimination (#289)
1 parent b092b6c commit e8cf178

11 files changed

+151
-67
lines changed

helion/_compiler/ast_read_writes.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,3 +132,130 @@ def ast_delete_assignments(body: list[ast.AST], to_remove: set[str]) -> list[ast
132132
if new_node is not None:
133133
new_body.append(new_node)
134134
return new_body
135+
136+
137+
class _NotPureException(Exception):
138+
pass
139+
140+
141+
class _PureExpressionVisitor(ast.NodeVisitor):
142+
"""
143+
AST visitor that determines if an expression is guaranteed to be pure.
144+
"""
145+
146+
def generic_visit(self, node: ast.AST) -> None:
147+
# Anything without a specific visitor is not pure
148+
raise _NotPureException
149+
150+
def visit_Constant(self, node: ast.Constant) -> None:
151+
pass
152+
153+
def visit_Num(self, node: ast.Num) -> None:
154+
pass
155+
156+
def visit_Str(self, node: ast.Str) -> None:
157+
pass
158+
159+
def visit_Bytes(self, node: ast.Bytes) -> None:
160+
pass
161+
162+
def visit_NameConstant(self, node: ast.NameConstant) -> None:
163+
pass
164+
165+
def visit_Ellipsis(self, node: ast.Ellipsis) -> None:
166+
pass
167+
168+
def visit_Name(self, node: ast.Name) -> None:
169+
pass
170+
171+
def visit_Tuple(self, node: ast.Tuple) -> None:
172+
for elt in node.elts:
173+
self.visit(elt)
174+
175+
def visit_List(self, node: ast.List) -> None:
176+
for elt in node.elts:
177+
self.visit(elt)
178+
179+
def visit_Set(self, node: ast.Set) -> None:
180+
for elt in node.elts:
181+
self.visit(elt)
182+
183+
def visit_Dict(self, node: ast.Dict) -> None:
184+
for key in node.keys:
185+
if key is not None: # Handle dict unpacking
186+
self.visit(key)
187+
for value in node.values:
188+
self.visit(value)
189+
190+
def visit_BinOp(self, node: ast.BinOp) -> None:
191+
self.visit(node.left)
192+
self.visit(node.right)
193+
194+
def visit_UnaryOp(self, node: ast.UnaryOp) -> None:
195+
self.visit(node.operand)
196+
197+
def visit_Starred(self, node: ast.Starred) -> None:
198+
self.visit(node.value)
199+
200+
201+
def definitely_does_not_have_side_effects(expr: ast.expr) -> bool:
    """Return True if *expr* is provably free of side effects.

    A False result does not mean the expression has side effects — only
    that purity could not be proven.
    """
    visitor = _PureExpressionVisitor()
    try:
        visitor.visit(expr)
    except _NotPureException:
        return False
    return True
207+
208+
209+
class _DeletePureExpressions(ast.NodeTransformer):
    """Transformer that drops statement-expressions whose value is
    provably pure (removing them cannot change program behavior)."""

    def visit_Expr(self, node: ast.Expr) -> ast.Expr | None:
        # Returning None from a NodeTransformer deletes the statement.
        return None if definitely_does_not_have_side_effects(node.value) else node
214+
215+
216+
def dead_assignment_elimination(
    body: list[ast.AST],
    dce_vars: list[str],
    num_iterations: int = 8,
    input_rw: ReadWrites | None = None,
) -> None:
    """
    Eliminates dead assignments from *body* in place.

    Args:
        body: statements to rewrite; mutated via slice assignment.
        dce_vars: names that are candidates for elimination.
        num_iterations: maximum fixed-point passes (removing one dead
            assignment can make another variable dead).
        input_rw: optional precomputed read/write summary. Only valid with
            a single pass, since it would go stale after any removal.
    """
    # A caller-supplied summary is only sound for exactly one pass.
    assert num_iterations == 1 or input_rw is None, (
        "input_rw is incompatible with num_iterations > 1"
    )
    for _ in range(num_iterations):
        rw = input_rw if input_rw is not None else ReadWrites.from_list(body)
        # A candidate is dead when it is written but never read.
        to_remove = {
            name for name in dce_vars if name in rw.writes and name not in rw.reads
        }
        if not to_remove:
            break  # fixed point reached
        body[:] = ast_delete_assignments(body, to_remove)
237+
238+
239+
def is_string_expr(node: ast.AST) -> bool:
    """Return True if *node* is a bare string-literal statement
    (e.g. a docstring or triple-quoted "comment")."""
    if not isinstance(node, ast.Expr):
        return False
    value = node.value
    return isinstance(value, ast.Constant) and isinstance(value.value, str)
245+
246+
247+
def dead_expression_elimination(body: list[ast.AST]) -> None:
    """
    Eliminates dead (provably pure) expression statements from *body* in place.
    """
    # The transformer is stateless, so one instance serves every statement.
    transformer = _DeletePureExpressions()
    kept: list[ast.AST] = []
    for stmt in body:
        if is_string_expr(stmt):
            # Triple-quoted comments and docstrings are indistinguishable
            # from ordinary string expressions — never eliminate them.
            kept.append(stmt)
        else:
            replacement = transformer.visit(stmt)
            if replacement is not None:
                kept.append(replacement)
    body[:] = kept

helion/_compiler/device_function.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
from .ast_extension import expr_from_string
2525
from .ast_extension import statement_from_string
2626
from .ast_read_writes import ReadWrites
27-
from .ast_read_writes import ast_delete_assignments
2827
from .ast_read_writes import ast_rename
28+
from .ast_read_writes import dead_assignment_elimination
2929
from .compile_environment import CompileEnvironment
3030
from .host_function import HostFunction
3131
from .host_function import NoCurrentFunction
@@ -463,14 +463,8 @@ def dead_code_elimination(self) -> None:
463463

464464
for _ in range(8):
465465
rw = ReadWrites.from_list([*self.preamble, *self.body])
466-
to_remove = set()
467-
for name in self.dce_vars:
468-
if name in rw.writes and name not in rw.reads:
469-
to_remove.add(name)
470-
if not to_remove:
471-
break
472-
self.body[:] = ast_delete_assignments(self.body, to_remove)
473-
self.preamble[:] = ast_delete_assignments(self.preamble, to_remove)
466+
dead_assignment_elimination(self.body, self.dce_vars, 1, rw)
467+
dead_assignment_elimination(self.preamble, self.dce_vars, 1, rw)
474468

475469
# drop any unused args
476470
args_to_remove = {

helion/_compiler/generate_ast.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from typing import TYPE_CHECKING
77
from typing import NamedTuple
88

9+
from torch.utils._ordered_set import OrderedSet
10+
911
from .. import exc
1012
from ..language._decorators import is_api_func
1113
from ..runtime.precompile_shim import make_precompiler
@@ -15,6 +17,9 @@
1517
from .ast_extension import create
1618
from .ast_extension import expr_from_string
1719
from .ast_extension import statement_from_string
20+
from .ast_read_writes import dead_assignment_elimination
21+
from .ast_read_writes import dead_expression_elimination
22+
from .ast_read_writes import definitely_does_not_have_side_effects
1823
from .compile_environment import CompileEnvironment
1924
from .device_function import DeviceFunction
2025
from .helper_function import CodegenInterface
@@ -322,6 +327,21 @@ def visit_Call(self, node: ast.Call) -> ast.AST:
322327
)
323328
return self.generic_visit(node)
324329

330+
def host_dead_code_elimination(self) -> None:
    """Run dead-code elimination over the generated host statements.

    Collects every variable that is the sole (Name) target of an assignment
    whose right-hand side is provably pure, then removes the ones that are
    never read, followed by a pure-expression sweep.
    """
    candidates: OrderedSet[str] = OrderedSet()
    for stmt in self.host_statements:
        if not isinstance(stmt, ast.Assign):
            continue
        # Only assignments whose RHS is side-effect free may be deleted.
        if not definitely_does_not_have_side_effects(stmt.value):
            continue
        # Skip tuple/attribute/subscript targets — only plain names are safe.
        if not all(isinstance(target, ast.Name) for target in stmt.targets):
            continue
        for target in stmt.targets:
            assert isinstance(target, ast.Name)
            candidates.add(target.id)

    dead_assignment_elimination(self.host_statements, list(candidates))
    dead_expression_elimination(self.host_statements)
344+
325345

326346
class TensorReference(NamedTuple):
327347
node: ast.AST
@@ -413,6 +433,7 @@ def generate_ast(func: HostFunction, config: Config) -> ast.AST:
413433
for stmt in func.body:
414434
codegen.add_statement(codegen.visit(stmt))
415435
kernel_def = codegen.device_function.codegen_function_def()
436+
codegen.host_dead_code_elimination()
416437
host_def = func.codegen_function_def(codegen.host_statements)
417438
precompile_def = codegen_precompile_def(
418439
host_def, codegen.device_function.name

test/test_broadcasting.expected

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ def fn(a, idx1):
242242
out0 = torch.empty_like(a)
243243
out1 = torch.empty_like(a)
244244
out2 = torch.empty_like(a)
245-
idx0 = 11
246245
_BLOCK_SIZE_0 = 16
247246
_BLOCK_SIZE_1 = 16
248247
_fn_kernel[triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),](a, out0, out1, out2, a.size(0), a.size(1), a.stride(0), a.stride(1), out0.stride(0), out0.stride(1), out1.stride(0), out1.stride(1), out2.stride(0), out2.stride(1), idx1, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
@@ -252,7 +251,6 @@ def _fn_make_precompiler(a, idx1):
252251
out0 = torch.empty_like(a)
253252
out1 = torch.empty_like(a)
254253
out2 = torch.empty_like(a)
255-
idx0 = 11
256254
_BLOCK_SIZE_0 = 16
257255
_BLOCK_SIZE_1 = 16
258256
from helion.runtime.precompile_shim import make_precompiler

test/test_control_flow.expected

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,13 @@ def _fn_kernel(x, out, out_size_0, out_size_1, x_size_0, x_size_1, out_stride_0,
2222

2323
def fn(x):
2424
out = torch.empty_like(x)
25-
v = 15
2625
_BLOCK_SIZE_0 = 32
2726
_BLOCK_SIZE_1 = 32
2827
_fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0) * triton.cdiv(x.size(1), _BLOCK_SIZE_1),](x, out, out.size(0), out.size(1), x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
2928
return out
3029

3130
def _fn_make_precompiler(x):
3231
out = torch.empty_like(x)
33-
v = 15
3432
_BLOCK_SIZE_0 = 32
3533
_BLOCK_SIZE_1 = 32
3634
from helion.runtime.precompile_shim import make_precompiler
@@ -55,14 +53,12 @@ def _fn_kernel(x, out, x_size_0, x_size_1, out_stride_0, out_stride_1, x_stride_
5553

5654
def fn(x):
5755
out = torch.empty_like(x)
58-
v = 4
5956
_BLOCK_SIZE_0_1 = 128
6057
_fn_kernel[triton.cdiv(x.size(0) * x.size(1), _BLOCK_SIZE_0_1), 1, 1](x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0_1, num_warps=4, num_stages=3)
6158
return out
6259

6360
def _fn_make_precompiler(x):
6461
out = torch.empty_like(x)
65-
v = 4
6662
_BLOCK_SIZE_0_1 = 128
6763
from helion.runtime.precompile_shim import make_precompiler
6864
return make_precompiler(_fn_kernel)(x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0_1, num_warps=4, num_stages=3)

test/test_examples.expected

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,7 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor):
104104
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
105105
out = torch.empty_like(q_view)
106106
sm_scale = 1.0 / math.sqrt(head_dim)
107-
qk_scale = sm_scale * 1.44269504
108107
_BLOCK_SIZE_1 = 128
109-
_RDIM_SIZE_2 = 64
110108
_BLOCK_SIZE_3 = 64
111109
_attention_kernel[64 * triton.cdiv(1024, _BLOCK_SIZE_1),](q_view, k_view, v_view, out, _BLOCK_SIZE_1, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
112110
return out.view(q_in.size())
@@ -122,9 +120,7 @@ def _attention_make_precompiler(q_in: torch.Tensor, k_in: torch.Tensor, v_in: to
122120
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
123121
out = torch.empty_like(q_view)
124122
sm_scale = 1.0 / math.sqrt(head_dim)
125-
qk_scale = sm_scale * 1.44269504
126123
_BLOCK_SIZE_1 = 128
127-
_RDIM_SIZE_2 = 64
128124
_BLOCK_SIZE_3 = 64
129125
from helion.runtime.precompile_shim import make_precompiler
130126
return make_precompiler(_attention_kernel)(q_view, k_view, v_view, out, _BLOCK_SIZE_1, _BLOCK_SIZE_3, num_warps=4, num_stages=3)
@@ -208,7 +204,6 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor):
208204
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
209205
out = torch.empty_like(q_view)
210206
sm_scale = 1.0 / math.sqrt(head_dim)
211-
qk_scale = sm_scale * 1.44269504
212207
_BLOCK_SIZE_1 = 128
213208
_RDIM_SIZE_2 = 64
214209
_BLOCK_SIZE_3 = 16
@@ -226,7 +221,6 @@ def _attention_make_precompiler(q_in: torch.Tensor, k_in: torch.Tensor, v_in: to
226221
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
227222
out = torch.empty_like(q_view)
228223
sm_scale = 1.0 / math.sqrt(head_dim)
229-
qk_scale = sm_scale * 1.44269504
230224
_BLOCK_SIZE_1 = 128
231225
_RDIM_SIZE_2 = 64
232226
_BLOCK_SIZE_3 = 16
@@ -303,7 +297,6 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor):
303297
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
304298
out = torch.empty_like(q_view)
305299
sm_scale = 1.0 / math.sqrt(head_dim)
306-
qk_scale = sm_scale * 1.44269504
307300
_BLOCK_SIZE_1 = 64
308301
_RDIM_SIZE_2 = 64
309302
_BLOCK_SIZE_3 = 64
@@ -321,7 +314,6 @@ def _attention_make_precompiler(q_in: torch.Tensor, k_in: torch.Tensor, v_in: to
321314
k_view = k_in.reshape([-1, n_dim, head_dim]).transpose(1, 2)
322315
out = torch.empty_like(q_view)
323316
sm_scale = 1.0 / math.sqrt(head_dim)
324-
qk_scale = sm_scale * 1.44269504
325317
_BLOCK_SIZE_1 = 64
326318
_RDIM_SIZE_2 = 64
327319
_BLOCK_SIZE_3 = 64
@@ -1570,17 +1562,13 @@ def _softmax_two_pass_kernel(x, out, out_stride_0, out_stride_1, x_stride_0, x_s
15701562
def softmax_two_pass(x: torch.Tensor):
15711563
m, n = x.size()
15721564
out = torch.empty_like(x)
1573-
block_size_m = 1
1574-
block_size_n = 128
15751565
_BLOCK_SIZE_1 = 128
15761566
_softmax_two_pass_kernel[m,](x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), n, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
15771567
return out
15781568

15791569
def _softmax_two_pass_make_precompiler(x: torch.Tensor):
15801570
m, n = x.size()
15811571
out = torch.empty_like(x)
1582-
block_size_m = 1
1583-
block_size_n = 128
15841572
_BLOCK_SIZE_1 = 128
15851573
from helion.runtime.precompile_shim import make_precompiler
15861574
return make_precompiler(_softmax_two_pass_kernel)(x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), n, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
@@ -1640,8 +1628,6 @@ def _softmax_two_pass_kernel(x, out, out_size_0, out_size_1, x_size_0, x_size_1,
16401628
def softmax_two_pass(x: torch.Tensor):
16411629
m, n = x.size()
16421630
out = torch.empty_like(x)
1643-
block_size_m = 8
1644-
block_size_n = 64
16451631
_BLOCK_SIZE_0 = 8
16461632
_BLOCK_SIZE_1 = 64
16471633
_softmax_two_pass_kernel[triton.cdiv(m, _BLOCK_SIZE_0),](x, out, out.size(0), out.size(1), x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), m, n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
@@ -1650,8 +1636,6 @@ def softmax_two_pass(x: torch.Tensor):
16501636
def _softmax_two_pass_make_precompiler(x: torch.Tensor):
16511637
m, n = x.size()
16521638
out = torch.empty_like(x)
1653-
block_size_m = 8
1654-
block_size_n = 64
16551639
_BLOCK_SIZE_0 = 8
16561640
_BLOCK_SIZE_1 = 64
16571641
from helion.runtime.precompile_shim import make_precompiler

0 commit comments

Comments
 (0)