@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }
 
-    def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
-        # all runtimes in the RUNTIMES enum
-        runtimes = supported_runtimes or list(RUNTIMES)
-
-        # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
-        runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
-
-        if extra_runtimes is not None:
-            runtimes.extend(extra_runtimes)
-
-        # Filter out UR if not available
-        if options.ur is None:
-            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
-
-        # Filter out L0 if cuda backend
-        if options.ur_adapter == "cuda":
-            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
-
-        return runtimes
-
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
-        if options.ur_adapter == "hip":
-            return []
-
         benches = []
 
-        # Add SubmitKernel benchmarks using loops
-        for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
+        for runtime in list(RUNTIMES):
+            # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
                 for measure_completion in [0, 1]:
                     for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
                             )
                         )
 
-        # Add SinKernelGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
                 for num_kernels in [5, 100]:
                     benches.append(
                         GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
                     )
 
-        # Add ULLS benchmarks
-        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            # Add ULLS benchmarks
             benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
-        # Add GraphApiSubmitGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
                 for num_kernels in [4, 10, 32]:
                     for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
         ]
 
         # Add UR-specific benchmarks
-        if options.ur is not None:
-            benches += [
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
-                UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
-                ),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
-                ),
-            ]
+        benches += [
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
+            UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
+            ),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
+            ),
+        ]
+
         benches += [
             MemcpyExecute(
                 self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -246,11 +217,44 @@ def parse_unit_type(compute_unit):
 
 
 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test):
+    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
+        self.runtime = runtime
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        """Base runtimes supported by this benchmark, can be overridden."""
+        # By default, support all runtimes except SYCL_PREVIEW
+        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]
+
+    def enabled_runtimes(self) -> list[RUNTIMES]:
+        """Runtimes available given the current configuration."""
+        # Start with all supported runtimes and apply configuration filters
+        runtimes = self.supported_runtimes()
+
+        # Remove UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        # Remove Level Zero if using CUDA backend
+        if options.ur_adapter == "cuda":
+            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
+
+        return runtimes
+
+    def enabled(self) -> bool:
+        # SYCL is required for all benchmarks
+        if options.sycl is None:
+            return False
+
+        # HIP adapter is not supported
+        if options.ur_adapter == "hip":
+            return False
+
+        # Check if the specific runtime is enabled (or no specific runtime required)
+        return self.runtime is None or self.runtime in self.enabled_runtimes()
 
     def bin_args(self) -> list[str]:
         return []
@@ -338,15 +342,17 @@ def __init__(
         KernelExecTime=1,
     ):
         self.ioq = ioq
-        self.runtime = runtime
         self.MeasureCompletion = MeasureCompletion
         self.UseEvents = UseEvents
         self.KernelExecTime = KernelExecTime
         self.NumKernels = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
         )
 
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
+
     def get_tags(self):
         return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]
 
@@ -619,7 +625,6 @@ def __init__(
         useCopyOffload,
         useBarrier,
     ):
-        self.runtime = runtime
         self.numOpsPerThread = numOpsPerThread
         self.numThreads = numThreads
         self.allocSize = allocSize
@@ -630,7 +635,7 @@ def __init__(
         self.useCopyOffload = useCopyOffload
         self.useBarrier = useBarrier
         super().__init__(
-            bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
+            bench, f"multithread_benchmark_{runtime.value}", "MemcpyExecute", runtime
         )
 
     def extra_env_vars(self) -> dict:
@@ -706,9 +711,8 @@ class GraphApiSinKernelGraph(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         self.withGraphs = withGraphs
         self.numKernels = numKernels
-        self.runtime = runtime
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
+            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph", runtime
         )
 
     def explicit_group(self):
@@ -759,9 +763,10 @@ def __init__(
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
-        self.runtime = runtime
         self.measureCompletionTime = measureCompletionTime
-        super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
+        super().__init__(
+            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+        )
 
     def explicit_group(self):
         return f"SubmitGraph, numKernels: {self.numKernels}"
@@ -804,8 +809,12 @@ class UllsEmptyKernel(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
        self.wgc = wgc
        self.wgs = wgs
-        self.runtime = runtime
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+        )
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
     def explicit_group(self):
         return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
@@ -849,9 +858,13 @@ def __init__(
         self.barrier = barrier
         self.hostVisible = hostVisible
         self.ctrBasedEvents = ctrBasedEvents
-        self.runtime = runtime
         self.ioq = ioq
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch", runtime
+        )
+
+    def supported_runtimes(self):
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
     def explicit_group(self):
         return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
@@ -884,12 +897,14 @@ class UsmMemoryAllocation(ComputeBenchmark):
     def __init__(
         self, bench, runtime: RUNTIMES, usm_memory_placement, size, measure_mode
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmMemoryAllocation",
+            runtime,
         )
 
     def get_tags(self):
@@ -941,13 +956,15 @@ def __init__(
         size,
         measure_mode,
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.allocation_count = allocation_count
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmBatchMemoryAllocation",
+            runtime,
         )
 
     def get_tags(self):
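Taken together, these hunks move runtime filtering out of the suite's benchmarks() method and into ComputeBenchmark itself: each benchmark now stores its runtime, declares which runtimes it supports via supported_runtimes(), and reports availability via enabled(). Below is a minimal, self-contained sketch of that pattern. The RUNTIMES values and the options stub are simplified stand-ins for illustration only, not the suite's real definitions.

from enum import Enum
from types import SimpleNamespace


class RUNTIMES(Enum):
    # Values are illustrative; the suite defines its own RUNTIMES enum.
    SYCL_PREVIEW = "syclpreview"
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


# Stand-in for the suite's global options object (assumption for illustration).
options = SimpleNamespace(sycl="/opt/sycl", ur=None, ur_adapter=None)


class ComputeBenchmark:
    def __init__(self, name, test, runtime=None):
        self.name = name
        self.test = test
        self.runtime = runtime

    def supported_runtimes(self):
        # Default: everything except SYCL_PREVIEW; subclasses override as needed.
        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]

    def enabled_runtimes(self):
        # Apply configuration filters on top of the supported set.
        runtimes = self.supported_runtimes()
        if options.ur is None:
            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
        if options.ur_adapter == "cuda":
            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
        return runtimes

    def enabled(self):
        if options.sycl is None:
            return False
        if options.ur_adapter == "hip":
            return False
        return self.runtime is None or self.runtime in self.enabled_runtimes()


class UllsEmptyKernel(ComputeBenchmark):
    # Mirrors the override added in the diff: only SYCL and Level Zero apply here.
    def supported_runtimes(self):
        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]


if __name__ == "__main__":
    # benchmarks() can now construct every (benchmark, runtime) pair unconditionally
    # and let a later filter drop the ones whose enabled() is False.
    benches = [
        UllsEmptyKernel(f"ulls_benchmark_{r.value}", "EmptyKernel", r)
        for r in RUNTIMES
    ]
    print([b.name for b in benches if b.enabled()])

With the stub configuration above this prints only the SYCL and Level Zero variants, matching the intent of the refactor: construction is unconditional, filtering happens per benchmark.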