pytorch-labs
diff --git a/helion/_compiler/device_function.py
Lines changed: 10 additions & 1 deletion b/helion/_compiler/device_function.py
Lines changed: 10 additions & 1 deletion
diff --git a/helion/_compiler/generate_ast.py
Lines changed: 1 addition & 67 deletions b/helion/_compiler/generate_ast.py
Lines changed: 1 addition & 67 deletions
diff --git a/helion/_compiler/host_function.py
Lines changed: 24 additions & 1 deletion b/helion/_compiler/host_function.py
Lines changed: 24 additions & 1 deletion
diff --git a/helion/_compiler/output_header.py
Lines changed: 3 additions & 1 deletion b/helion/_compiler/output_header.py
Lines changed: 3 additions & 1 deletion
diff --git a/helion/autotuner/base_search.py
Lines changed: 32 additions & 3 deletions b/helion/autotuner/base_search.py
Lines changed: 32 additions & 3 deletions
diff --git a/helion/runtime/__init__.py
Lines changed: 13 additions & 0 deletions b/helion/runtime/__init__.py
Lines changed: 13 additions & 0 deletions
diff --git a/helion/runtime/kernel.py
Lines changed: 0 additions & 1 deletion b/helion/runtime/kernel.py
Lines changed: 0 additions & 1 deletion
@@ -181,6 +181,15 @@ def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
         self.pid: ProgramIDs | None = None
         self.namespace: _Namespace = _Namespace()
         self.namespace._used_names.update(reserved_names())
+        self.namespace._used_names.update(
+            # used by triton run() method
+            [
+                "grid",
+                "warmup",
+                "num_warps",
+                "num_stages",
+            ]
+        )
         self._variable_renames: dict[str, list[str]] = {}
         self.dce_vars: list[str] = []
         self.block_size_var_cache: dict[tuple[int, ...], str] = {}
@@ -448,7 +457,7 @@ def codegen_function_call(self) -> ast.AST:
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
         call_statement = statement_from_string(
-            f"{self.name}[__call_grid_expr]({', '.join(args)})",
+            f"_launcher({self.name}, __call_grid_expr, {', '.join(args)})",
             __call_grid_expr=pid.codegen_grid(),
         )
         assert isinstance(call_statement, ExtendedAST)
 
@@ -10,7 +10,6 @@
 
 from .. import exc
 from ..language._decorators import is_api_func
-from ..runtime.precompile_shim import make_precompiler
 from .ast_extension import ExtendedAST
 from .ast_extension import LoopType
 from .ast_extension import NodeVisitor
@@ -367,68 +366,6 @@ def has_mask(self) -> bool:
         )
 
 
-def codegen_precompile_def(
-    host_def: ast.FunctionDef, device_function_name: str
-) -> ast.FunctionDef:
-    """
-    Generate a precompile function definition for the given host function.
-    The precompile function is the same as the normal function, but the call to the
-    kernel is replaced with a call to make_precompiler.
-
-    Args:
-        host_def: The host function definition to that is used to call the kernel.
-        device_function_name: The name of the device function to be called.
-
-    Returns:
-        A transformed function definition with the kernel call replaced.
-    """
-
-    def transform(node: ExtendedAST) -> ExtendedAST:
-        nonlocal found_calls
-        assert not node._is_kernel_call
-        fields = node.fields()
-        for key, value in [*fields.items()]:
-            if isinstance(value, list):
-                new_list = []
-                for item in value:
-                    assert isinstance(item, ExtendedAST)
-                    if item._is_kernel_call:
-                        with item:
-                            found_calls += 1
-                            new_list.append(
-                                statement_from_string(
-                                    f"from {make_precompiler.__module__} import make_precompiler"
-                                )
-                            )
-                            assert isinstance(item, ast.Expr)
-                            value = item.value
-                            assert isinstance(value, ExtendedAST)
-                            new_list.append(
-                                create(
-                                    ast.Return,
-                                    value=value.copy(
-                                        func=expr_from_string(
-                                            f"make_precompiler({device_function_name})"
-                                        )
-                                    ),
-                                )
-                            )
-                            break
-                    new_list.append(transform(item))
-                fields[key] = new_list
-            elif isinstance(value, ExtendedAST):
-                fields[key] = transform(value)
-        return node.new(fields)
-
-    found_calls = 0
-    assert isinstance(host_def, ExtendedAST)
-    new_fn = transform(host_def)
-    assert isinstance(new_fn, ast.FunctionDef)
-    new_fn.name = f"_{host_def.name}_make_precompiler"
-    assert found_calls == 1
-    return new_fn
-
-
 def generate_ast(func: HostFunction, config: Config) -> ast.AST:
     with func:
         codegen = GenerateAST(func, config)
@@ -438,16 +375,13 @@ def generate_ast(func: HostFunction, config: Config) -> ast.AST:
             kernel_def = codegen.device_function.codegen_function_def()
             codegen.host_dead_code_elimination()
             host_def = func.codegen_function_def(codegen.host_statements)
-            precompile_def = codegen_precompile_def(
-                host_def, codegen.device_function.name
-            )
+
             result = ast.Module(
                 [
                     *func.codegen_imports(),
                     *codegen.device_function.codegen_helper_functions(),
                     *kernel_def,
                     host_def,
-                    precompile_def,
                 ],
                 [],
             )
 
@@ -16,6 +16,7 @@
 
 from .. import exc
 from . import ast_extension
+from .ast_extension import expr_from_string
 from .ast_extension import statement_from_string
 from .compile_environment import CompileEnvironment
 from .output_header import SOURCE_MODULE
@@ -212,10 +213,32 @@ def debug_str(self) -> str:
         return "\n\n".join(result)
 
     def codegen_function_def(self, statements: list[ast.AST]) -> ast.FunctionDef:
+        # Create a new arguments structure with _launcher kwarg-only parameter
+        new_args = ast_extension.create(
+            ast.arguments,
+            posonlyargs=self.args.posonlyargs,
+            args=self.args.args,
+            vararg=self.args.vararg,
+            kwonlyargs=[
+                *self.args.kwonlyargs,
+                ast_extension.create(
+                    ast.arg,
+                    arg="_launcher",
+                    annotation=None,
+                ),
+            ],
+            kw_defaults=[
+                *self.args.kw_defaults,
+                expr_from_string("_default_launcher"),
+            ],
+            kwarg=self.args.kwarg,
+            defaults=self.args.defaults,
+        )
+
         return ast_extension.create(
             ast.FunctionDef,
             name=self.name,
-            args=self.args,
+            args=new_args,
             body=statements,
             decorator_list=[],
             type_comment=None,
 
@@ -21,12 +21,14 @@
     "triton_helpers": "from torch._inductor.runtime import triton_helpers",
     "tl_math": "from torch._inductor.runtime.triton_helpers import math as tl_math",
     "libdevice": "from torch._inductor.runtime.triton_compat import libdevice",
+    "_default_launcher": "from helion.runtime import default_launcher as _default_launcher",
 }
 
 disallowed_names: dict[str, None] = dict.fromkeys(
     [
         SOURCE_MODULE,
-        "make_precompiler",
+        "_launcher",
+        "_default_launcher",
         "_NUM_SM",
     ]
 )
 
@@ -22,13 +22,16 @@
 
 from .. import exc
 from ..runtime.precompile_shim import already_compiled
+from ..runtime.precompile_shim import make_precompiler
 from .config_generation import ConfigGeneration
 from .config_generation import FlatConfig
 from .logger import LambdaLogger
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
+    import triton
+
     from ..runtime.config import Config
     from ..runtime.kernel import BoundKernel
     from ..runtime.kernel import CompiledConfig
@@ -144,9 +147,24 @@ def start_precompile_and_check_for_hangs(
             return PrecompileFuture.skip(self, config, True)
         ctx = mp.get_context("fork")
 
-        precompiler = fn.make_precompiler(*self.args)  # pyright: ignore[reportFunctionMemberAccess]
-        if precompiler is already_compiled:
-            return PrecompileFuture.skip(self, config, True)
+        def extract_launcher(
+            triton_kernel: triton.JITFunction,
+            grid: tuple[int, ...],
+            *args: object,  # positional launch arguments, captured as a tuple
+            **kwargs: object,  # any launch keywords, captured as a dict
+        ):  # never returns normally: the body unconditionally raises
+            """Custom launcher that extracts arguments instead of executing."""
+            raise _ExtractedLaunchArgs(triton_kernel, grid, args, kwargs)  # unpacked by the `except _ExtractedLaunchArgs` handler below
+
+        try:
+            # Call main function with extraction launcher to extract arguments
+            fn(*self.args, _launcher=extract_launcher)
+            # Should not reach here
+            raise RuntimeError("Expected _ExtractedLaunchArgs exception")
+        except _ExtractedLaunchArgs as e:
+            precompiler = make_precompiler(e.kernel)(*e.args, **e.kwargs)
+            if precompiler is already_compiled:
+                return PrecompileFuture.skip(self, config, True)
         process: mp.Process = ctx.Process(target=precompiler)  # pyright: ignore[reportAssignmentType]
         process.start()
         return PrecompileFuture(
@@ -501,3 +519,14 @@ def _mark_complete(self) -> bool:
 
         self.ok = False
         return False
+
+
+class _ExtractedLaunchArgs(Exception):
+    """Exception that carries kernel launch arguments for precompiler extraction."""
+
+    def __init__(self, triton_kernel, grid, args, kwargs):
+        super().__init__()  # no message; the payload lives in the attributes set below
+        self.kernel = triton_kernel  # first argument the launcher was called with
+        self.grid = grid  # NOTE(review): stored, but not read by the handler visible in this diff
+        self.args = args  # NOTE(review): overwrites the built-in BaseException.args with the launch positionals - confirm intended
+        self.kwargs = kwargs  # launch keyword arguments, as collected by the raising launcher
@@ -42,3 +42,16 @@ def get_num_sm(device: torch.device) -> int:
     """
     assert device.type == "cuda", "TODO: implement for other devices"
     return torch.cuda.get_device_properties(device.index).multi_processor_count
+
+
+def default_launcher(
+    triton_kernel: triton.JITFunction,
+    grid: tuple[int, ...],
+    *args: object,  # kernel arguments, forwarded positionally to run()
+    num_warps: int,  # keyword-only and required (declared after *args, no default)
+    num_stages: int,  # keyword-only and required (declared after *args, no default)
+):
+    """Default launcher function that executes the kernel immediately."""
+    return triton_kernel.run(  # whatever run() returns is handed straight back to the caller
+        *args, grid=grid, warmup=False, num_warps=num_warps, num_stages=num_stages
+    )  # NOTE(review): warmup=False presumably selects "launch now", per the docstring - confirm against Triton
@@ -387,7 +387,6 @@ def compile_config(
                 print(triton_code, file=sys.stderr)
         module = PyCodeCache.load(triton_code)
         rv = getattr(module, self.kernel.name)
-        rv.make_precompiler = getattr(module, f"_{self.kernel.name}_make_precompiler")
         self._compile_cache[config] = rv
         return rv
Original file line number	Diff line number	Diff line change
`@@ -21,12 +21,14 @@`
`21`	`21`	`"triton_helpers": "from torch._inductor.runtime import triton_helpers",`
`22`	`22`	`"tl_math": "from torch._inductor.runtime.triton_helpers import math as tl_math",`
`23`	`23`	`"libdevice": "from torch._inductor.runtime.triton_compat import libdevice",`
	`24`	`+ "_default_launcher": "from helion.runtime import default_launcher as _default_launcher",`
`24`	`25`	`}`
`25`	`26`
`26`	`27`	`disallowed_names: dict[str, None] = dict.fromkeys(`
`27`	`28`	`[`
`28`	`29`	`SOURCE_MODULE,`
`29`		`- "make_precompiler",`
	`30`	`+ "_launcher",`
	`31`	`+ "_default_launcher",`
`30`	`32`	`"_NUM_SM",`
`31`	`33`	`]`
`32`	`34`	`)`