[WIP] Improve autotune infra to catch more error cases

yf225 · yf225 · commit a11c0a68fc0e · 2025-07-22T10:26:34.000-07:00
diff --git a/benchmarks/run.py b/benchmarks/run.py
@@ -359,13 +359,37 @@ def main() -> None:
         type=str,
         help="Name(s) of the Helion kernel module(s) to run. Can be a single kernel or comma-separated list (e.g., vector_add or vector_add,rms_norm). If not specified, runs all kernels.",
     )
+    parser.add_argument(
+        "--split",
+        type=str,
+        help="Run only a subset of kernels. Format: M/N where M is the part number (1-indexed) and N is the total number of parts. For example, --split 1/3 runs the first third of kernels.",
+    )
 
     # Parse known args to get the kernel name, pass rest to tritonbench
     args, tritonbench_args = parser.parse_known_args()
 
     # Check and setup tritonbench if needed
     check_and_setup_tritonbench()
 
+    # Parse split argument if provided
+    part_num = None
+    total_parts = None
+    if args.split:
+        try:
+            part_num, total_parts = map(int, args.split.split("/"))
+            if part_num < 1 or part_num > total_parts:
+                print(
+                    f"Error: Part number {part_num} must be between 1 and {total_parts}",
+                    file=sys.stderr,
+                )
+                sys.exit(1)
+        except ValueError:
+            print(
+                f"Error: Invalid split format '{args.split}'. Expected format: M/N (e.g., 1/3)",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
     if args.kernel:
         # Parse comma-separated kernel names
         kernel_names = [k.strip() for k in args.kernel.split(",")]
@@ -383,6 +407,31 @@ def main() -> None:
             )
             sys.exit(1)
 
+        # Apply split filtering if specified
+        if args.split:
+            # Calculate which kernels belong to this part
+            kernels_per_part = len(kernel_names) // total_parts
+            remainder = len(kernel_names) % total_parts
+
+            # Calculate start and end indices for this part
+            if part_num <= remainder:
+                # Parts 1 to remainder get one extra kernel
+                start_idx = (part_num - 1) * (kernels_per_part + 1)
+                end_idx = start_idx + kernels_per_part + 1
+            else:
+                # Remaining parts get the base number of kernels
+                start_idx = (
+                    remainder * (kernels_per_part + 1)
+                    + (part_num - remainder - 1) * kernels_per_part
+                )
+                end_idx = start_idx + kernels_per_part
+
+            kernel_names = kernel_names[start_idx:end_idx]
+            print(
+                f"Running part {part_num}/{total_parts}: kernels {start_idx + 1} to {end_idx} of total",
+                file=sys.stderr,
+            )
+
         # Run specified kernels
         if len(kernel_names) == 1:
             run_kernel(kernel_names[0], tritonbench_args)
@@ -398,8 +447,35 @@ def main() -> None:
                 run_kernel(kernel_name, tritonbench_args.copy())
     else:
         # Run all kernels
-        print(f"Running all {len(KERNEL_MAPPINGS)} kernels...\n", file=sys.stderr)
-        for kernel_name in KERNEL_MAPPINGS:
+        all_kernels = list(KERNEL_MAPPINGS.keys())
+
+        # Apply split filtering if specified
+        if args.split:
+            # Calculate which kernels belong to this part
+            kernels_per_part = len(all_kernels) // total_parts
+            remainder = len(all_kernels) % total_parts
+
+            # Calculate start and end indices for this part
+            if part_num <= remainder:
+                # Parts 1 to remainder get one extra kernel
+                start_idx = (part_num - 1) * (kernels_per_part + 1)
+                end_idx = start_idx + kernels_per_part + 1
+            else:
+                # Remaining parts get the base number of kernels
+                start_idx = (
+                    remainder * (kernels_per_part + 1)
+                    + (part_num - remainder - 1) * kernels_per_part
+                )
+                end_idx = start_idx + kernels_per_part
+
+            all_kernels = all_kernels[start_idx:end_idx]
+            print(
+                f"Running part {part_num}/{total_parts}: kernels {start_idx + 1} to {end_idx} of {len(KERNEL_MAPPINGS)} total",
+                file=sys.stderr,
+            )
+
+        print(f"Running {len(all_kernels)} kernels...\n", file=sys.stderr)
+        for kernel_name in all_kernels:
             print(f"\n{'=' * 60}", file=sys.stderr)
             print(f"Kernel: {kernel_name}", file=sys.stderr)
             print(f"{'=' * 60}\n", file=sys.stderr)
diff --git a/helion/_compiler/tile_dispatch.py b/helion/_compiler/tile_dispatch.py
@@ -94,7 +94,7 @@ def _add_reduction_strategies(self, fn: DeviceFunction, config: Config) -> None:
             reduction_loop = env.config_spec.reduction_loops.config_get(
                 config.reduction_loops, block_id, None
             )
-            if reduction_loop is None:
+            if reduction_loop is None or reduction_loop <= 1:
                 strategy: TileStrategy = PersistentReductionStrategy(fn, block_id)
             else:
                 strategy = LoopedReductionStrategy(fn, block_id, reduction_loop)
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -21,6 +21,7 @@
 
 from torch._inductor.runtime.triton_compat import OutOfResources
 from torch._inductor.runtime.triton_compat import PTXASError
+from triton.compiler.errors import CompilationError
 import torch.multiprocessing as mp
 from triton.testing import do_bench
 
@@ -43,7 +44,12 @@
     from . import ConfigSpec
 
 _expected_errors_regexp: re.Pattern[str] = re.compile(
-    r"|".join(map(re.escape, ["[CUDA]: invalid argument"]))
+    r"|".join(
+        map(
+            re.escape,
+            ["[CUDA]: invalid argument", "exceeds triton maximum tensor numel"],
+        )
+    )
 )
 
 
@@ -88,10 +94,13 @@ def benchmark(self, config: Config) -> float:
         Returns:
             The performance of the configuration in seconds.
         """
-        fn = self.kernel.compile_config(config, allow_print=False)
-        if self.start_precompile_and_check_for_hangs(config, fn)():
-            return self.benchmark_function(config, fn)
-        return inf
+        try:
+            fn = self.kernel.compile_config(config, allow_print=False)
+            if self.start_precompile_and_check_for_hangs(config, fn)():
+                return self.benchmark_function(config, fn)
+            return inf
+        except Exception as e:
+            return inf
 
     def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
         """
@@ -125,8 +134,10 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             self.log.debug("Benchmarking failed: OutOfResources")
         except PTXASError:
             self.log.warning(f"PTXASError compiling config: {config}")
+        except CompilationError:
+            self.log.debug("Benchmarking failed: Triton CompilationError")
         except Exception as e:
-            if not _expected_errors_regexp.search(str(e)):
+            if not _expected_errors_regexp.search(str(e)) and not "exceeds triton maximum tensor numel" in str(e):
                 raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
             self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
         return inf
@@ -149,6 +160,8 @@ def start_precompile_and_check_for_hangs(
         """
         if not self.settings.autotune_precompile:
             return PrecompileFuture.skip(self, config, True)
+        if fn is None:
+            return PrecompileFuture.skip(self, config, False)
         ctx = mp.get_context("fork")
 
         def extract_launcher(
@@ -188,7 +201,13 @@ def parallel_benchmark(self, configs: list[Config]) -> list[tuple[Config, float]
         Returns:
             A list of tuples containing configurations and their performance.
         """
-        fns = [self.kernel.compile_config(c, allow_print=False) for c in configs]
+        fns = []
+        for c in configs:
+            try:
+                compile_result = self.kernel.compile_config(c, allow_print=False)
+                fns.append(compile_result)
+            except Exception as e:
+                fns.append(None)
         if self.settings.autotune_precompile:
             is_workings = PrecompileFuture.wait_for_all(
                 [
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
@@ -411,8 +411,8 @@ def _flat_config(
         default = min(high, 4096)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if value >= self.size_hint:
-            return None  # max size becomes persistent reduction
+        if value >= self.size_hint or value < low:
+            return None  # max size or invalid value becomes persistent reduction
         return value
 
     def _normalize(self, name: str, value: object) -> int | None:

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ def _add_reduction_strategies(self, fn: DeviceFunction, config: Config) -> None:`
`94`	`94`	`reduction_loop = env.config_spec.reduction_loops.config_get(`
`95`	`95`	`config.reduction_loops, block_id, None`
`96`	`96`	`)`
`97`		`- if reduction_loop is None:`
	`97`	`+ if reduction_loop is None or reduction_loop <= 1:`
`98`	`98`	`strategy: TileStrategy = PersistentReductionStrategy(fn, block_id)`
`99`	`99`	`else:`
`100`	`100`	`strategy = LoopedReductionStrategy(fn, block_id, reduction_loop)`