[Benchmark] Add sum to TritonBench integration (#257)

yf225 · web-flow · commit 9f7158a47459 · 2025-07-09T17:10:29.000-07:00
- Add sum_tritonbench wrapper function that handles 1D input
- Add sum to KERNEL_MAPPINGS in benchmark/run.py
- Reset every Helion kernel in the module before creating the benchmark function, so each input size goes through its own autotuning
diff --git a/benchmark/run.py b/benchmark/run.py
@@ -27,6 +27,7 @@
     "vector_exp": ("examples.exp", "exp_tritonbench"),
     # TODO(yf225): reduction dim size = 8192 currently throws error. After it's fixed we can remove "num_inputs" extra arg.
     "rms_norm": ("examples.rms_norm", "rms_norm_tritonbench", {"num_inputs": 3}),
+    "sum": ("examples.sum", "sum_tritonbench"),
 }
 
 
@@ -236,6 +237,15 @@ def helion_method(  # pyre-ignore[3]
         ) -> Callable[..., Any]:
             """Helion implementation."""
 
+            # Reset all Helion kernels before creating the benchmark function
+            # so that each input size can go through its own autotuning.
+            from helion.runtime.kernel import Kernel
+
+            for attr_name in dir(module):
+                attr = getattr(module, attr_name)
+                if isinstance(attr, Kernel):
+                    attr.reset()
+
             def _inner() -> Callable[..., Any]:  # pyre-ignore[3]
                 return kernel_func(*args)
 
diff --git a/examples/sum.py b/examples/sum.py
@@ -19,6 +19,16 @@ def sum_kernel(x: torch.Tensor) -> torch.Tensor:
     return out
 
 
+def sum_tritonbench(x: torch.Tensor) -> torch.Tensor:
+    """Tritonbench entry point for sum_kernel that also accepts 1D input."""
+    if x.ndim != 1:
+        return sum_kernel(x)
+    # 1D case: hand sum_kernel a 2D tensor with a leading axis of size 1,
+    # then squeeze the size-1 axes back out of what it returns.
+    as_row = x.unsqueeze(0)
+    return sum_kernel(as_row).squeeze()
+
+
 def check(m: int, n: int) -> None:
     x = torch.randn([m, n], device="cuda", dtype=torch.float32)
     kernels = {"helion": sum_kernel}