Commit b9e93c0

Add hl.register_tunable (#154)
1 parent 59bf929 commit b9e93c0

File tree

11 files changed: +459 -9 lines changed

examples/matmul_split_k.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
from __future__ import annotations

import torch

import helion
from helion.autotuner import PowerOfTwoFragment
import helion.language as hl


# static_shapes=True gives a performance boost for matmuls
@helion.kernel(static_shapes=True)
def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    m, k = x.size()
    k2, n = y.size()
    assert k == k2, f"size mismatch {k} != {k2}"
    out = torch.zeros(
        [m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device
    )
    split_k = hl.register_tunable("split_k", PowerOfTwoFragment(1, 256))
    k_block = helion.next_power_of_2(helion.cdiv(k, split_k))
    for tile_m, tile_n, outer_k in hl.tile([m, n, k], block_size=[None, None, k_block]):
        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
        for inner_k in hl.tile(outer_k.begin, outer_k.end):
            acc = torch.addmm(acc, x[tile_m, inner_k], y[inner_k, tile_n])
        hl.atomic_add(out, [tile_m, tile_n], acc)
    return out


def check(m: int, k: int, n: int) -> None:
    from triton.testing import do_bench

    x = torch.randn([m, k], device="cuda", dtype=torch.float16)
    y = torch.randn([k, n], device="cuda", dtype=torch.float16)
    result = matmul_split_k(x, y)
    torch.testing.assert_close(result, x @ y, rtol=1e-2, atol=1)
    sec = do_bench(lambda: matmul_split_k(x, y))
    baseline_sec = do_bench(lambda: torch.matmul(x, y))
    print(
        f"Helion time: {sec:.4f}ms, torch time: {baseline_sec:.4f}ms, speedup: {baseline_sec / sec:.2f}x"
    )


def main() -> None:
    check(64, 32768, 64)


if __name__ == "__main__":
    main()
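
Because split_k is registered as an ordinary config key, a known-good value can be pinned instead of re-autotuned. A minimal sketch, assuming helion.Config forwards user-defined tunable keys as keyword arguments; the block_sizes and split_k values are illustrative, not tuned:

# Hypothetical pinned-config variant of the kernel above.
@helion.kernel(config=helion.Config(block_sizes=[64, 64, 4096], split_k=16))
def matmul_split_k_pinned(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    ...  # same body as matmul_split_k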

helion/_compiler/compile_environment.py

Lines changed: 7 additions & 1 deletion
@@ -189,7 +189,13 @@ def to_fake(self, obj: object, origin: Origin) -> object:
             return self.shape_env.create_unbacked_symfloat()
         if isinstance(
             obj,
-            (torch.dtype, torch.device, types.BuiltinFunctionType, types.ModuleType),
+            (
+                torch.dtype,
+                torch.device,
+                types.BuiltinFunctionType,
+                types.ModuleType,
+                type,
+            ),
         ):
             return obj
         if isinstance(obj, types.FunctionType):

helion/_compiler/type_propagation.py

Lines changed: 47 additions & 7 deletions
@@ -12,6 +12,7 @@
 from typing import NoReturn
 from typing import Protocol
 from typing import TypeVar
+from typing import cast
 from unittest.mock import patch

 import sympy
@@ -21,6 +22,7 @@
 from torch.utils._pytree import tree_map_only

 from .. import exc
+from ..autotuner.config_fragment import ConfigSpecFragment
 from ..autotuner.config_spec import BlockSizeSpec
 from ..language._decorators import get_device_func_replacement
 from ..language._decorators import is_api_func
@@ -249,6 +251,8 @@ def from_example(cls, value: object, origin: Origin) -> TypeInfo:
                 )
             ),
         )
+        if isinstance(value, ConfigSpecFragment):
+            return ConfigFragmentType(origin, value)
         if dataclasses.is_dataclass(value):
            keys = value.__dataclass_fields__.keys()  # pyre-ignore[16]
            return ClassType(
@@ -695,6 +699,16 @@ def as_literal(self) -> object:
         return self.value


+class ConfigFragmentType(LiteralType):
+    """TypeInfo for config fragments, which are treated as constant literals during compilation."""
+
+    value: ConfigSpecFragment
+
+    def __init__(self, origin: Origin, fragment: ConfigSpecFragment) -> None:
+        assert isinstance(fragment, ConfigSpecFragment)
+        super().__init__(origin, fragment)
+
+
 class CallableType(LiteralType):
     value: Callable[..., object]

@@ -745,6 +759,19 @@ def to_proxy(arg: TypeInfo) -> object:
         env: CompileEnvironment = CompileEnvironment.current()
         proxy_args = [x.tree_map(to_proxy) for x in args]
         proxy_kwargs = {k: v.tree_map(to_proxy) for k, v in kwargs.items()}
+
+        # special handling for symint arguments
+        if any(
+            (isinstance(x, torch.SymInt) and not isinstance(x._sympy_(), sympy.Integer))
+            for x in proxy_args
+        ):
+            if self.value in self._new_symint_on_host_fns() and origin.is_host():
+                return SymIntType.new_unbacked(origin)
+            if isinstance(self.value, type) and issubclass(
+                self.value, ConfigFragmentType
+            ):
+                raise exc.ConfigSpecFragmentWithSymInt(args)
+
         try:
             with patch.object(torch.SymInt, "__index__", _raise_shape_specializing):
                 output_type = TypeInfo.from_example(
@@ -782,6 +809,15 @@ def to_proxy(arg: TypeInfo) -> object:
             # TODO(jansel): point to other tracing modes
             raise exc.TorchOpTracingError(e) from e

+    @staticmethod
+    @functools.cache
+    def _new_symint_on_host_fns() -> dict[object, None]:
+        """Functions that should return a new unbacked symint when called on host with a symint argument."""
+        from triton import cdiv
+        from triton import next_power_of_2
+
+        return cast("dict[object, None]", dict.fromkeys([cdiv, next_power_of_2]))
+

@@ -890,12 +926,10 @@ class SymIntType(NumericType):

     @classmethod
     def new_unbacked(cls, origin: Origin) -> Self:
-        shape_env = CompileEnvironment.current().shape_env
-        with shape_env.ignore_fresh_unbacked_symbols():
-            return cls(
-                origin,
-                shape_env.create_unbacked_symint(),
-            )
+        return cls(
+            origin,
+            CompileEnvironment.current().create_unbacked_symint(),
+        )

     @property
     def python_type(self) -> type[int]:
@@ -953,7 +987,13 @@ def _get_hint(numel: int | torch.SymInt | AutoSize | None) -> int:
     if numel is None or isinstance(numel, AutoSize):
         # For data-dependent sizes, use arbitrary hint of 8192
         return 8192
-    return CompileEnvironment.current().size_hint(numel)
+
+    hint = CompileEnvironment.current().size_hint(numel)
+    # If the hint is invalid (like 0), use a reasonable default
+    # This can happen when other hints cancel out in expressions
+    if hint <= 1:
+        return 8192
+    return hint


 class TileIndexType(TypeInfo):
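
The _new_symint_on_host_fns() table above is what makes the split-k example work: when triton.cdiv or triton.next_power_of_2 is called on the host with a non-constant SymInt, type propagation hands back a fresh unbacked symint instead of specializing. A sketch of the pattern this enables, mirroring the example file (helion re-exporting these triton helpers is assumed from its usage there):

# split_k is an unbacked symint supplied by the autotuner; on the host,
# cdiv/next_power_of_2 over it stay symbolic rather than specializing.
split_k = hl.register_tunable("split_k", PowerOfTwoFragment(1, 256))
k_block = helion.next_power_of_2(helion.cdiv(k, split_k))  # still a symint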

helion/autotuner/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,9 @@
 from __future__ import annotations

+from .config_fragment import BooleanFragment as BooleanFragment
+from .config_fragment import EnumFragment as EnumFragment
+from .config_fragment import IntegerFragment as IntegerFragment
+from .config_fragment import PowerOfTwoFragment as PowerOfTwoFragment
 from .config_spec import ConfigSpec as ConfigSpec
 from .differential_evolution import (
     DifferentialEvolutionSearch as DifferentialEvolutionSearch,

helion/autotuner/config_fragment.py

Lines changed: 7 additions & 0 deletions
@@ -63,6 +63,13 @@ class BaseIntegerFragment(ConfigSpecFragment):
     high: int  # maximum value (inclusive)
     default_val: int

+    def __init__(self, low: int, high: int, default_val: int | None = None) -> None:
+        self.low = low
+        self.high = high
+        if default_val is None:
+            default_val = low
+        self.default_val = default_val
+
     def default(self) -> int:
         return self.clamp(self.default_val)
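
The new constructor makes default_val optional, falling back to low; default() still clamps into [low, high]. A quick behavioral sketch, assuming IntegerFragment and PowerOfTwoFragment inherit this constructor from BaseIntegerFragment:

from helion.autotuner import IntegerFragment, PowerOfTwoFragment

frag = PowerOfTwoFragment(1, 256)  # default_val omitted -> falls back to low
assert frag.default() == 1
frag2 = IntegerFragment(1, 10, 4)  # explicit default, clamped into [1, 10]
assert frag2.default() == 4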

helion/autotuner/config_spec.py

Lines changed: 14 additions & 1 deletion
@@ -61,6 +61,9 @@ class ConfigSpec:
     reduction_loops: BlockIdSequence[ReductionLoopSpec] = dataclasses.field(
         default_factory=BlockIdSequence
     )
+    user_defined_tunables: dict[str, ConfigSpecFragment] = dataclasses.field(
+        default_factory=dict
+    )
     allow_use_yz_grid: bool | None = None

     def _remove_duplicates(self) -> None:
@@ -110,7 +113,10 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             config.setdefault("use_yz_grid", False)

         config.setdefault("indexing", "pointer")
-        if invalid_keys := ({*config} - VALID_KEYS):
+
+        # Allow tunable parameter keys in addition to VALID_KEYS
+        allowed_keys = VALID_KEYS | {*self.user_defined_tunables.keys()}
+        if invalid_keys := ({*config} - allowed_keys):
             raise InvalidConfig(f"Invalid config keys {sorted(invalid_keys)!r}")

     def default_config(self) -> helion.Config:
@@ -134,6 +140,10 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Config:
                 )
             ),
         }
+        # Add tunable parameters
+        for key, fragment in self.user_defined_tunables.items():
+            config[key] = fn(fragment)
+
         if self.allow_use_yz_grid:
             use_yz_grid = fn(BooleanFragment())
             # pyre-ignore[16]
@@ -191,6 +201,7 @@ def __init__(
         self.max_size: int = (
             next_power_of_2(size_hint) if max_size is None else max_size
         )
+        assert self.min_size <= self.max_size

     def __repr__(self) -> str:
         fields = []
@@ -207,6 +218,8 @@ def __repr__(self) -> str:

     def update_min(self, value: int) -> None:
         self.min_size = assert_integer_power_of_two(max(value, self.min_size))
+        if self.max_size < self.min_size:
+            self.max_size = self.min_size

     def update_max(self, value: int) -> None:
         self.max_size = assert_integer_power_of_two(min(value, self.max_size))
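
With user_defined_tunables in the spec, flat_config() emits each registered fragment's value under its own key and normalize() accepts that key alongside VALID_KEYS. A plausible tuned config for the split-k example might therefore look like this (values illustrative):

# "split_k" passes normalize() only because register_tunable() put it in
# user_defined_tunables; any other unknown key still raises InvalidConfig.
helion.Config(block_sizes=[64, 64, 4096], indexing="pointer", split_k=16)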

helion/exc.py

Lines changed: 18 additions & 0 deletions
@@ -140,6 +140,24 @@ class FailedToUnpackTupleAssign(BaseError):
     message = "Failed to unpack values in tuple assignment. Expected a sequence of size {0}, got type: {1!s}."


+class RegisterTunableArgTypes(BaseError):
+    message = "expected string literal and ConfigSpecFragment literal, got {0} and {1}"
+
+
+class TunableTypeNotSupported(BaseError):
+    message = "hl.register_tunable() only supports integer, float, and boolean types, got {0!s}."
+
+
+class TunableNameConflict(BaseError):
+    message = (
+        "Tunable parameter with name {0!s} already exists. Please use a different name."
+    )
+
+
+class ConfigSpecFragmentWithSymInt(BaseError):
+    message = "ConfigSpecFragment with SymInt arg is not supported. hl.constexpr or hl.specialize may be used to specialize the SymInt value."
+
+
 class FailedToUnpackTile(BaseError):
     message = (
         "Failed to unpack a tile into a tuple assignment. "

helion/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -17,4 +17,5 @@
 from .tiles import tile_block_size as tile_block_size
 from .tiles import tile_end as tile_end
 from .tiles import tile_index as tile_index
+from .tunable_ops import register_tunable as register_tunable
 from .view_ops import subscript as subscript

helion/language/tunable_ops.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from torch._inductor.codegen.simd import constant_repr

from .. import exc
from .._compiler.ast_extension import expr_from_string
from ..autotuner.config_fragment import ConfigSpecFragment
from ..autotuner.config_spec import VALID_KEYS
from ..exc import NotInsideKernel
from . import _decorators

if TYPE_CHECKING:
    import ast

    from .._compiler.inductor_lowering import CodegenState
    from .._compiler.type_propagation import TypeInfo
    from .._compiler.variable_origin import Origin

__all__ = ["register_tunable"]


@_decorators.api(is_device_only=False)
def register_tunable(name: str, fragment: ConfigSpecFragment) -> int:
    """
    Register a tunable parameter for autotuning.

    This function allows you to define parameters that can be automatically tuned
    during the autotuning process. The fragment defines the search space and default value.

    :param name: The key for the tunable parameter in the Config().
    :param fragment: A ConfigSpecFragment that defines the search space (e.g., PowerOfTwoFragment)
    :return: The value assigned to this tunable parameter in the current configuration.
    """
    raise NotInsideKernel


@_decorators.type_propagation(register_tunable)
def _register_tunable_type(
    name: TypeInfo, fragment: TypeInfo, *, origin: Origin
) -> TypeInfo:
    # During type propagation, register the tunable parameter and return unbacked symint
    from .._compiler.compile_environment import CompileEnvironment
    from .._compiler.type_propagation import NumericType

    env = CompileEnvironment.current()

    try:
        fragment_val = fragment.as_literal()
        name_val = name.as_literal()
    except NotImplementedError:
        fragment_val = None
        name_val = None
    if not (isinstance(name_val, str) and isinstance(fragment_val, ConfigSpecFragment)):
        raise exc.RegisterTunableArgTypes(name, fragment)
    del name, fragment

    if name_val in VALID_KEYS or f"{name_val}s" in VALID_KEYS:
        raise exc.TunableNameConflict(name_val)
    if (
        name_val in env.config_spec.user_defined_tunables
        and env.config_spec.user_defined_tunables[name_val] != fragment_val
    ):
        raise exc.TunableNameConflict(name_val)

    # register the value for tuning
    env.config_spec.user_defined_tunables[name_val] = fragment_val

    python_type = type(fragment_val.default())
    if not issubclass(python_type, (int, float, bool)):
        raise exc.TunableTypeNotSupported(python_type)
    return NumericType.subtype(python_type).new_unbacked(origin)


@_decorators.codegen(register_tunable)
def _register_tunable_codegen(state: CodegenState) -> ast.AST:
    name = state.proxy_arg(0)
    assert isinstance(name, str)
    config_value = state.config[name]
    assert isinstance(config_value, (int, float, bool))
    return expr_from_string(constant_repr(config_value))
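
Putting the pieces together, a minimal end-to-end sketch beyond the split-k example. It assumes the tunable symint can be fed directly to hl.tile as a block size (the example file instead routes it through cdiv/next_power_of_2 first), and the row-sum body follows common Helion reduction patterns:

import torch

import helion
import helion.language as hl
from helion.autotuner import PowerOfTwoFragment


@helion.kernel()
def row_sums(x: torch.Tensor) -> torch.Tensor:
    m, n = x.size()
    out = torch.empty([m], dtype=x.dtype, device=x.device)
    # "chunk" becomes a config key whose value is searched over powers of
    # two in [16, 256] during autotuning.
    chunk = hl.register_tunable("chunk", PowerOfTwoFragment(16, 256))
    for tile_m in hl.tile(m, block_size=chunk):
        out[tile_m] = x[tile_m, :].sum(-1)
    return out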
