From 4d32fe545dc66ac03697d74da717cddcb109dd1e Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Wed, 25 Jun 2025 15:48:37 +0000 Subject: [PATCH 1/6] Add Unitrace support Signed-off-by: Mateusz P. Nowak --- devops/scripts/benchmarks/benches/base.py | 34 +- devops/scripts/benchmarks/benches/benchdnn.py | 6 +- devops/scripts/benchmarks/benches/compute.py | 4 +- devops/scripts/benchmarks/benches/gromacs.py | 3 +- devops/scripts/benchmarks/benches/llamacpp.py | 7 +- .../scripts/benchmarks/benches/syclbench.py | 4 +- devops/scripts/benchmarks/benches/umf.py | 8 +- devops/scripts/benchmarks/benches/velocity.py | 9 +- devops/scripts/benchmarks/html/data.js | 5106 ++++++++++++++++- devops/scripts/benchmarks/main.py | 99 +- devops/scripts/benchmarks/options.py | 4 + devops/scripts/benchmarks/utils/utils.py | 4 +- 12 files changed, 5255 insertions(+), 33 deletions(-) diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py index efbf7d77e003d..37422ae8a8320 100644 --- a/devops/scripts/benchmarks/benches/base.py +++ b/devops/scripts/benchmarks/benches/base.py @@ -70,7 +70,7 @@ def teardown(self): pass @abstractmethod - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: pass @staticmethod @@ -86,7 +86,14 @@ def get_adapter_full_path(): ), f"could not find adapter file {adapter_path} (and in similar lib paths)" def run_bench( - self, command, env_vars, ld_library=[], add_sycl=True, use_stdout=True + self, + command, + env_vars, + ld_library=[], + add_sycl=True, + use_stdout=True, + with_unitrace: bool = False, + extra_unitrace_opt=[], ): env_vars = env_vars.copy() if options.ur is not None: @@ -99,6 +106,29 @@ def run_bench( ld_libraries = options.extra_ld_libraries.copy() ld_libraries.extend(ld_library) + if with_unitrace: + unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace") + if not os.path.exists(unitrace_bin): + raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}. 
") + if not os.path.exists(options.unitrace_res_dir): + os.makedirs(options.unitrace_res_dir) + os.makedirs(f"{options.unitrace_res_dir}/{self.name()}", exist_ok=True) + command = ( + [ + str(unitrace_bin), + "--call-logging", + "--host-timing", + "--chrome-sycl-logging", + "--chrome-call-logging", + "--chrome-kernel-logging", + "--output", + f"{options.unitrace_res_dir}/{self.name()}/{self.name()}.log", + ] + + extra_unitrace_opt + + command + ) + if options.verbose: + print(f"Unitrace cmd: {' '.join(command)}") result = run( command=command, env_vars=env_vars, diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py index 4698d7ed965de..a25a5614e6f46 100644 --- a/devops/scripts/benchmarks/benches/benchdnn.py +++ b/devops/scripts/benchmarks/benches/benchdnn.py @@ -73,6 +73,8 @@ def setup(self): f"-B {self.build_dir}", f"-DCMAKE_PREFIX_PATH={options.sycl}", "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_CXX_COMPILER=clang++", + "-DCMAKE_C_COMPILER=clang", "-DDNNL_BUILD_TESTS=ON", "-DDNNL_BUILD_EXAMPLES=OFF", "-DDNNL_CPU_RUNTIME=NONE", # Disable SYCL support @@ -126,7 +128,7 @@ def setup(self): if not self.bench_bin.exists(): raise FileNotFoundError(f"Benchmark binary not found: {self.bench_bin}") - def run(self, env_vars): + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: command = [ str(self.bench_bin), *self.bench_args.split(), @@ -145,6 +147,8 @@ def run(self, env_vars): add_sycl=True, ld_library=ld_library, use_stdout=True, + with_unitrace=with_unitrace, + extra_unitrace_opt=["--chrome-dnn-logging"], ) result_value = self._extract_time(output) diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 25631c288f951..caeee3b5c694f 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -280,7 +280,7 @@ def explicit_group(self): def description(self) -> str: return "" - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: command = [ f"{self.benchmark_bin}", f"--test={self.test}", @@ -291,7 +291,7 @@ def run(self, env_vars) -> list[Result]: command += self.bin_args() env_vars.update(self.extra_env_vars()) - result = self.run_bench(command, env_vars) + result = self.run_bench(command, env_vars, with_unitrace=with_unitrace) parsed_results = self.parse_output(result) ret = [] for label, median, stddev, unit in parsed_results: diff --git a/devops/scripts/benchmarks/benches/gromacs.py b/devops/scripts/benchmarks/benches/gromacs.py index 4d95c538df09c..828acae3ecb20 100644 --- a/devops/scripts/benchmarks/benches/gromacs.py +++ b/devops/scripts/benchmarks/benches/gromacs.py @@ -161,7 +161,7 @@ def setup(self): ld_library=self.suite.oneapi.ld_libraries(), ) - def run(self, env_vars): + def run(self, env_vars, with_unitrace: bool = False): model_dir = self.grappa_dir / self.model env_vars.update({"SYCL_CACHE_PERSISTENT": "1"}) @@ -200,6 +200,7 @@ def run(self, env_vars): add_sycl=True, use_stdout=False, ld_library=self.suite.oneapi.ld_libraries(), + with_unitrace=with_unitrace, ) if not self._validate_correctness(options.benchmark_cwd + "/md.log"): diff --git a/devops/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py index d8eba11916a31..5641108104298 100644 --- a/devops/scripts/benchmarks/benches/llamacpp.py +++ b/devops/scripts/benchmarks/benches/llamacpp.py @@ -115,7 +115,7 @@ def get_tags(self): def lower_is_better(self): 
return False - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: command = [ f"{self.benchmark_bin}", "--output", @@ -141,7 +141,10 @@ def run(self, env_vars) -> list[Result]: ] result = self.run_bench( - command, env_vars, ld_library=self.bench.oneapi.ld_libraries() + command, + env_vars, + ld_library=self.bench.oneapi.ld_libraries(), + with_unitrace=with_unitrace, ) parsed = self.parse_output(result) results = [] diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py index be36c4cd7ba9a..4dd8c3a72c90e 100644 --- a/devops/scripts/benchmarks/benches/syclbench.py +++ b/devops/scripts/benchmarks/benches/syclbench.py @@ -137,7 +137,7 @@ def setup(self): self.directory, "sycl-bench-build", self.bench_name ) - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: self.outputfile = os.path.join(self.bench.directory, self.test + ".csv") command = [ @@ -151,7 +151,7 @@ def run(self, env_vars) -> list[Result]: env_vars.update(self.extra_env_vars()) # no output to stdout, all in outputfile - self.run_bench(command, env_vars) + self.run_bench(command, env_vars, with_unitrace=with_unitrace) with open(self.outputfile, "r") as f: reader = csv.reader(f) diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py index 6af745e7de3cb..8a2d335665e50 100644 --- a/devops/scripts/benchmarks/benches/umf.py +++ b/devops/scripts/benchmarks/benches/umf.py @@ -138,7 +138,7 @@ def get_names_of_benchmarks_to_be_run(self, command, env_vars): return all_names - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: command = [f"{self.benchmark_bin}"] all_names = self.get_names_of_benchmarks_to_be_run(command, env_vars) @@ -152,7 +152,11 @@ def run(self, env_vars) -> list[Result]: specific_benchmark = command + ["--benchmark_filter=^" + name + "$"] result = self.run_bench( - specific_benchmark, env_vars, add_sycl=False, ld_library=[self.umf_lib] + specific_benchmark, + env_vars, + add_sycl=False, + ld_library=[self.umf_lib], + with_unitrace=with_unitrace, ) parsed = self.parse_output(result) diff --git a/devops/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py index 17e0d22331df2..74951f31df2ac 100644 --- a/devops/scripts/benchmarks/benches/velocity.py +++ b/devops/scripts/benchmarks/benches/velocity.py @@ -130,7 +130,7 @@ def description(self) -> str: def get_tags(self): return ["SYCL", "application"] - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, with_unitrace: bool = False) -> list[Result]: env_vars.update(self.extra_env_vars()) command = [ @@ -138,7 +138,12 @@ def run(self, env_vars) -> list[Result]: ] command += self.bin_args() - result = self.run_bench(command, env_vars, ld_library=self.ld_libraries()) + result = self.run_bench( + command, + env_vars, + ld_library=self.ld_libraries(), + with_unitrace=with_unitrace, + ) return [ Result( diff --git a/devops/scripts/benchmarks/html/data.js b/devops/scripts/benchmarks/html/data.js index 2f1862fe621b7..f4ca859a904fd 100644 --- a/devops/scripts/benchmarks/html/data.js +++ b/devops/scripts/benchmarks/html/data.js @@ -1,11 +1,5101 @@ -// This file serves as a placeholder for loading data locally: If -// `remoteDataUrl` (etc.) is not defined in config.js, the dashboard will -// attempt to load data from variables defined here instead. 
-// -// These variables are empty by default, and are populated by main.py if -// `--output-html local` is specified. +benchmarkRuns = [ + { + "results": [ + { + "label": "onednn-sum-f16-1-eager", + "value": 0.00944, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=f16:f16:f16", + "--stag=abx:abx:abx", + "--scales=1.25:3:0.5", + "16x2x6x4x3" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16 --scales=1.25:3:0.5 16x2x6x4x3,0,2.52173,0.00944,0,0.0128609,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00944 avg(ms):0.0128609\ntotal: 0.27s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f16-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f16-2-eager", + "value": 0.60928, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--ddt=f16", + "--sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16", + "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", + "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", + "--scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2", + "16x32x48x5" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,48.9631,0.06448,0,0.0676806,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,30.5063,0.05808,0,0.0612839,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.4368,0.05888,0,0.0620269,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.4478,0.06368,0,0.0671496,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu 
--sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.657,0.05984,0,0.0630586,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,45.9631,0.06448,0,0.0679256,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,41.9988,0.06208,0,0.0652478,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.5825,0.05808,0,0.061508,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,51.822,0.06288,0,0.0659863,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,50.4551,0.0568,0,0.0609149,0\ntests:10 passed:10 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.60928 avg(ms):0.642782\ntotal: 2.47s; create_pd: 0.01s (0%); create_prim: 0.45s (18%); fill: 0.08s (3%); execute: 0.02s (1%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0021996363335788104, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f16-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f32-1-eager", + "value": 0.0088, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=bf16:bf16:bf16", + "--stag=abx:abx:abx", + "--scales=0.5:2:0.5", + "16x2x6x4x3" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16 --scales=0.5:2:0.5 16x2x6x4x3,0,2.42236,0.0088,0,0.0129955,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.0088 avg(ms):0.0129955\ntotal: 0.28s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.00017486502731471965, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": 
"onednn-sum-f32-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f32-2-eager", + "value": 0.6441600000000001, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--inplace=true,false", + "--ddt=bf16", + "--sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16", + "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", + "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", + "--scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15", + "16x32x48x5" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,47.5034,0.0648,0,0.0682694,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,0.0119629,0.06512,0,0.068156,0\n2:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,29.7659,0.0568,0,0.060283,0\n4:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu 
--sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.333,0.0568,0,0.0603173,0\n6:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.2969,0.05808,0,0.0609991,0\n8:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.946,0.05248,0,0.055674,0\n10:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 
16x32x48x5,0,44.5508,0.05904,0,0.0620182,0\n12:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,41.147,0.06112,0,0.0643661,0\n14:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.8096,0.05728,0,0.0602824,0\n16:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,50.3113,0.05664,0,0.0600053,0\n18:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 
--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,49.8357,0.056,0,0.0593853,0\ntests:20 passed:11 skipped:9 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.64416 avg(ms):0.679756\ntotal: 2.68s; create_pd: 0.01s (0%); create_prim: 0.44s (16%); fill: 0.08s (3%); execute: 0.02s (1%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.004735567547823622, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f32-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-1-eager", + "value": 0.3904, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--ddt=f32", + "--sdt=f32:f32", + "--stag=aBcd16b", + "--dtag=aBcd16b", + "1x8x64x64", + "1x8x640x1024", + "1x24x640x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x64x64,0,1.58545,0.00192,0,0.00269551,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.890869,0.08528,0,0.0932233,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.29517,0.3032,0,0.32437,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.3904 avg(ms):0.420289\ntotal: 0.91s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.15s (16%); execute: 0.02s (2%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.001602664448140469, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-1-graph", + "value": 0.39216, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--ddt=f32", + "--sdt=f32:f32", + "--stag=aBcd16b", + "--dtag=aBcd16b", + "1x8x64x64", + "1x8x640x1024", + "1x24x640x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: 
perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x64x64,0,1.43994,0.00192,0,0.00268973,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.874268,0.08656,0,0.094599,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.27124,0.30368,0,0.325998,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.39216 avg(ms):0.423287\ntotal: 0.89s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.13s (15%); execute: 0.02s (2%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0009097985124923661, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-1-graph", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-2-eager", + "value": 0.00336, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=bf16:bf16", + "--ddt=bf16", + "--stag=AB48a16b:AB48a16b", + "--dtag=AB48a16b", + "512x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.21216,0.00336,0,0.00399908,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00336 avg(ms):0.00399908\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (6%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 8.262364471909155e-05, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-2-graph", + "value": 0.00352, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--sdt=bf16:bf16", + "--ddt=bf16", + "--stag=AB48a16b:AB48a16b", + "--dtag=AB48a16b", + "512x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.24072,0.00352,0,0.00398547,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00352 avg(ms):0.00398547\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (7%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": 
"onednn-sum-padding-2-graph", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f16-eager", + "value": 0.33968, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--dt=f16", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f16 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.33968,0.342391\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.33968 avg(ms):0.342391\ntotal: 0.54s; create_pd: 0.00s (0%); create_prim: 0.07s (13%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.00855442631178792, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f16-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f32-eager", + "value": 0.38512, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--dt=f32", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.38512,0.388208\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.38512 avg(ms):0.388208\ntotal: 0.60s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0066990148529466635, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f32-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f32-graph", + "value": 0.37952, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--reset", + "--dt=f32", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --execution-mode=graph --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.37952,0.382662\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.37952 avg(ms):0.382662\ntotal: 0.58s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.011297102873450952, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f32-graph", + "lower_is_better": true, + "suite": "BenchDNN" + } + ], + "name": 
"This PR", + "hostname": "gkdse-pre-dnp-02", + "git_hash": "1eb1026ad0ef", + "github_repo": "mateuszpn/llvm", + "date": "2025-06-27T09:56:15.698275+00:00", + "compute_runtime": "unknown" + } +]; -benchmarkRuns = []; +benchmarkMetadata = { + "SubmitKernel": { + "type": "group", + "description": "Measures CPU time overhead of submitting kernels through different APIs.", + "notes": "Each layer builds on top of the previous layer, adding functionality and overhead.\nThe first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\nThe UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\nWork is ongoing to reduce the overhead of the SYCL API\n", + "unstable": null, + "tags": [ + "submit", + "micro", + "SYCL", + "UR", + "L0" + ], + "range_min": 0.0, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "SinKernelGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "submit", + "memory", + "proxy", + "SYCL", + "UR", + "L0", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "SubmitGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": null, + "tags": [ + "submit", + "micro", + "SYCL", + "UR", + "L0", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "FinalizeGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": null, + "tags": [ + "finalize", + "micro", + "SYCL", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 100 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_syclpreview EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_syclpreview KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_sycl SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 5 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_sycl EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_sycl KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. 
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order, NumKernels 10",
+    "explicit_group": "SubmitKernel in order"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order KernelExecTime=20"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order using events, NumKernels 10",
+    "explicit_group": "SubmitKernel in order using events"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order using events, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion KernelExecTime=20"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion using events"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion using events, CPU count"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "L0",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count"
+  },
+  "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:5": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "L0",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SinKernelGraph, graphs 0, numKernels 5",
+    "explicit_group": "SinKernelGraph, numKernels: 5"
+  },
+  "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:100": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "L0",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SinKernelGraph, graphs 0, numKernels 100",
+    "explicit_group": "SinKernelGraph, numKernels: 100"
+  },
+  "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:5": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "L0",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SinKernelGraph, graphs 1, numKernels 5",
+    "explicit_group": "SinKernelGraph, numKernels: 5"
+  },
+  "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:100": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "L0",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SinKernelGraph, graphs 1, numKernels 100",
+    "explicit_group": "SinKernelGraph, numKernels: 100"
+  },
+  "ulls_benchmark_l0 EmptyKernel wgc:1000, wgs:256": {
+    "type": "benchmark",
+    "description": "",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "L0",
+      "micro",
+      "latency",
+      "submit"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 EmptyKernel, wgc 1000, wgs 256",
+    "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256"
+  },
+  "ulls_benchmark_l0 KernelSwitch count 8 kernelTime 200": {
+    "type": "benchmark",
+    "description": "",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "L0",
+      "micro",
+      "latency",
+      "submit"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 KernelSwitch, count 8, kernelTime 200",
+    "explicit_group": "KernelSwitch, count: 8, kernelTime: 200"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "L0",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order using events, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order using events"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order using events, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order with completion"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order with completion, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order with completion using events"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order with completion using events, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order, NumKernels 10",
+    "explicit_group": "SubmitKernel in order"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order using events, NumKernels 10",
+    "explicit_group": "SubmitKernel in order using events"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order using events, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion using events"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion using events, CPU count"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10",
+    "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20"
+  },
+  "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20 CPU count": {
+    "type": "benchmark",
+    "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "submit",
+      "latency",
+      "UR",
+      "micro"
+    ],
+    "range_min": 0.0,
+    "range_max": null,
+    "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count",
+    "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count"
+  },
+  "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:5": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "UR",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SinKernelGraph, graphs 0, numKernels 5",
+    "explicit_group": "SinKernelGraph, numKernels: 5"
+  },
+  "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:100": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "UR",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SinKernelGraph, graphs 0, numKernels 100",
+    "explicit_group": "SinKernelGraph, numKernels: 100"
+  },
+  "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:5": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "UR",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SinKernelGraph, graphs 1, numKernels 5",
+    "explicit_group": "SinKernelGraph, numKernels: 5"
+  },
+  "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:100": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.",
+    "tags": [
+      "graph",
+      "UR",
+      "proxy",
+      "submit",
+      "memory",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SinKernelGraph, graphs 1, numKernels 100",
+    "explicit_group": "SinKernelGraph, numKernels: 100"
+  },
+  "ulls_benchmark_ur EmptyKernel wgc:1000, wgs:256": {
+    "type": "benchmark",
+    "description": "",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "UR",
+      "micro",
+      "latency",
+      "submit"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR EmptyKernel, wgc 1000, wgs 256",
+    "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256"
+  },
+  "ulls_benchmark_ur KernelSwitch count 8 kernelTime 200": {
+    "type": "benchmark",
+    "description": "",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "UR",
+      "micro",
+      "latency",
+      "submit"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR KernelSwitch, count 8, kernelTime 200",
+    "explicit_group": "KernelSwitch, count: 8, kernelTime: 200"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 4"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 10"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 0": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 0",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 1": {
+    "type": "benchmark",
+    "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "graph",
+      "UR",
+      "micro",
+      "submit",
+      "latency"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 1",
+    "explicit_group": "SubmitGraph, numKernels: 32"
+  },
+  "memory_benchmark_sycl QueueInOrderMemcpy from Device to Device, size 1024": {
+    "type": "benchmark",
+    "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Device to Device with 1024 bytes, executed 100 times per iteration.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "memory",
+      "latency",
+      "SYCL",
+      "micro"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "SYCL QueueInOrderMemcpy from Device to Device, size 1024",
+    "explicit_group": ""
+  },
+  "memory_benchmark_sycl QueueInOrderMemcpy from Host to Device, size 1024": {
+    "type": "benchmark",
+    "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Host to Device with 1024 bytes, executed 100 times per iteration.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "memory",
+      "latency",
+      "SYCL",
+      "micro"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "SYCL QueueInOrderMemcpy from Host to Device, size 1024",
+    "explicit_group": ""
+  },
+  "memory_benchmark_sycl QueueMemcpy from Device to Device, size 1024": {
+    "type": "benchmark",
+    "description": "Measures general SYCL queue memory copy performance from Device to Device with 1024 bytes per operation.",
+    "notes": null,
+    "unstable": null,
+    "tags": [
+      "memory",
+      "latency",
+      "SYCL",
+      "micro"
+    ],
+    "range_min": null,
+    "range_max": null,
+    "display_name": "SYCL QueueMemcpy from Device to Device, size 1024",
+    "explicit_group": ""
+  },
+  "memory_benchmark_sycl StreamMemory, placement Device, type Triad, size 10240": {
+    "type": "benchmark",
"benchmark", + "description": "Measures Device memory bandwidth using Triad pattern with 10240 bytes. Higher values (GB/s) indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "throughput", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL StreamMemory, placement Device, type Triad, size 10240", + "explicit_group": "" + }, + "api_overhead_benchmark_sycl ExecImmediateCopyQueue out of order from Device to Device, size 1024": { + "type": "benchmark", + "description": "Measures SYCL out-of-order queue overhead for copy-only from Device to Device memory with 1024 bytes. Tests immediate execution overheads.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL ExecImmediateCopyQueue out of order from Device to Device, size 1024", + "explicit_group": "" + }, + "api_overhead_benchmark_sycl ExecImmediateCopyQueue in order from Device to Host, size 1024": { + "type": "benchmark", + "description": "Measures SYCL in-order queue overhead for copy-only from Device to Host memory with 1024 bytes. Tests immediate execution overheads.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL ExecImmediateCopyQueue in order from Device to Host, size 1024", + "explicit_group": "" + }, + "miscellaneous_benchmark_sycl VectorSum": { + "type": "benchmark", + "description": "Measures performance of vector addition across 3D grid (512x256x256 elements) using SYCL.", + "notes": null, + "unstable": null, + "tags": [ + "math", + "throughput", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL VectorSum", + "explicit_group": "" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Gromacs": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing the same modifiable graph repeatedly over multiple iterations.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Gromacs", + "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Gromacs": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing a unique modifiable graph per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Gromacs", + "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Llama": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. 
It measures finalizing the same modifiable graph repeatedly over multiple iterations.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Llama", + "explicit_group": "FinalizeGraph, GraphStructure: Llama" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Llama": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. It measures finalizing a unique modifiable graph per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Llama", + "explicit_group": "FinalizeGraph, GraphStructure: Llama" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:1 dstUSM:1": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 threads each performing 400 operations on 102400 bytes from device to device memory with events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 1, dstUSM 1", + "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:0 dstUSM:1": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 threads each performing 400 operations on 102400 bytes from host to device memory with events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 0, dstUSM 1", + "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events without copy offload": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events without driver copy offload without barrier. 
", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events without copy offload", + "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:256 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory and free'ing it immediately. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 256, measureMode Both", + "explicit_group": "UsmMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:262144 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 262144 bytes of usm Device memory and free'ing it immediately. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 262144, measureMode Both", + "explicit_group": "UsmMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:256 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. 
", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 256, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:16384 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 16384 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 16384, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:131072 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 131072 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 131072, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier. 
", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "Velocity-Bench Hashtable": { + "type": "benchmark", + "description": "Measures hash table search performance using an efficient lock-free algorithm with linear probing. Reports throughput in millions of keys processed per second. Higher values indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Hashtable", + "explicit_group": "" + }, + "Velocity-Bench Bitcracker": { + "type": "benchmark", + "description": "Password-cracking application for BitLocker-encrypted memory units. Uses dictionary attack to find user or recovery passwords. Measures total time required to process 60000 passwords.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Bitcracker", + "explicit_group": "" + }, + "Velocity-Bench CudaSift": { + "type": "benchmark", + "description": "Implementation of the SIFT (Scale Invariant Feature Transform) algorithm for detecting, describing, and matching local features in images. Measures average processing time in milliseconds.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench CudaSift", + "explicit_group": "" + }, + "Velocity-Bench Easywave": { + "type": "benchmark", + "description": "A tsunami wave simulator used for researching tsunami generation and wave propagation. Measures the elapsed time in milliseconds to simulate a specified tsunami event based on real-world data.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "simulation" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Easywave", + "explicit_group": "" + }, + "Velocity-Bench QuickSilver": { + "type": "benchmark", + "description": "Solves a simplified dynamic Monte Carlo particle-transport problem used in HPC. Replicates memory access patterns, communication patterns, and branching of Mercury workloads. 
Reports a figure of merit in MMS/CTT where higher values indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "simulation", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench QuickSilver", + "explicit_group": "" + }, + "Velocity-Bench Sobel Filter": { + "type": "benchmark", + "description": "Popular RGB-to-grayscale image conversion technique that applies a gaussian filter to reduce edge artifacts. Processes a large 32K x 32K image and measures the time required to apply the filter.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "image", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Sobel Filter", + "explicit_group": "" + }, + "Velocity-Bench dl-cifar": { + "type": "benchmark", + "description": "Deep learning image classification workload based on the CIFAR-10 dataset of 60,000 32x32 color images in 10 classes. Uses neural networks to classify input images and measures total calculation time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench dl-cifar", + "explicit_group": "" + }, + "Velocity-Bench dl-mnist": { + "type": "benchmark", + "description": "Digit recognition based on the MNIST database, one of the oldest and most popular databases of handwritten digits. Uses neural networks to identify digits and measures total calculation time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench dl-mnist", + "explicit_group": "" + }, + "Velocity-Bench svm": { + "type": "benchmark", + "description": "Implementation of Support Vector Machine, a popular classical machine learning technique. Uses supervised learning models with associated algorithms to analyze data for classification and regression analysis. 
Measures total elapsed time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench svm", + "explicit_group": "" + }, + "SYCL-Bench IndependentDAGTaskThroughput_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench IndependentDAGTaskThroughput_multi", + "explicit_group": "" + }, + "SYCL-Bench DAGTaskThroughput_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench DAGTaskThroughput_multi", + "explicit_group": "" + }, + "SYCL-Bench HostDeviceBandwidth_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench HostDeviceBandwidth_multi", + "explicit_group": "" + }, + "SYCL-Bench LocalMem_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench LocalMem_multi", + "explicit_group": "" + }, + "SYCL-Bench ScalarProduct_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench ScalarProduct_multi", + "explicit_group": "" + }, + "SYCL-Bench Pattern_SegmentedReduction_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Pattern_SegmentedReduction_multi", + "explicit_group": "" + }, + "SYCL-Bench USM_Allocation_latency_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench USM_Allocation_latency_multi", + "explicit_group": "" + }, + "SYCL-Bench VectorAddition_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench VectorAddition_multi", + "explicit_group": "" + }, + "SYCL-Bench 2mm": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench 2mm", + "explicit_group": "" + }, + "SYCL-Bench 3mm": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench 3mm", + "explicit_group": "" + }, + "SYCL-Bench Atax": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Atax", + "explicit_group": "" + }, + "SYCL-Bench Bicg": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Bicg", + "explicit_group": "" + }, + 
"SYCL-Bench Kmeans": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Kmeans", + "explicit_group": "" + }, + "SYCL-Bench LinearRegressionCoeff": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench LinearRegressionCoeff", + "explicit_group": "" + }, + "SYCL-Bench MolecularDynamics": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench MolecularDynamics", + "explicit_group": "" + }, + "SYCL-Bench sf_16": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench sf_16", + "explicit_group": "" + }, + "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf": { + "type": "benchmark", + "description": "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. Runs both prompt processing (initial context processing) and text generation benchmarks with different batch sizes. Higher values indicate better performance. Uses the DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf quantized model and leverages SYCL with oneDNN for acceleration.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf", + "explicit_group": "" + }, + "umf-benchmark": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "umf-benchmark", + "explicit_group": "" + }, + "gromacs-0006-pme-graphs": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-pme-graphs", + "explicit_group": "" + }, + "gromacs-0006-pme-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-pme-eager", + "explicit_group": "" + }, + "gromacs-0006-rf-graphs": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-rf-graphs", + "explicit_group": "" + }, + "gromacs-0006-rf-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-rf-eager", + "explicit_group": "" + }, + "onednn-sum-f16-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f16-1-eager", + "explicit_group": "sum-f16-1" + }, + "onednn-sum-f16-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f16-2-eager", + "explicit_group": "sum-f16-2" + }, + "onednn-sum-f32-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": 
null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f32-1-eager", + "explicit_group": "sum-f32-1" + }, + "onednn-sum-f32-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f32-2-eager", + "explicit_group": "sum-f32-2" + }, + "onednn-sum-padding-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-1-eager", + "explicit_group": "sum-padding-1" + }, + "onednn-sum-padding-1-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-1-graph", + "explicit_group": "sum-padding-1" + }, + "onednn-sum-padding-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-2-eager", + "explicit_group": "sum-padding-2" + }, + "onednn-sum-padding-2-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-2-graph", + "explicit_group": "sum-padding-2" + }, + "onednn-graph-sdpa-plain-f16-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f16-eager", + "explicit_group": "graph-sdpa-plain-f16" + }, + "onednn-graph-sdpa-plain-f16-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f16-graph", + "explicit_group": "graph-sdpa-plain-f16" + }, + "onednn-graph-sdpa-plain-f32-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f32-eager", + "explicit_group": "graph-sdpa-plain-f32" + }, + "onednn-graph-sdpa-plain-f32-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f32-graph", + "explicit_group": "graph-sdpa-plain-f32" + }, + "Foo Group": { + "type": "group", + "description": "This is a test benchmark for Foo Group.", + "notes": "This is a test note for Foo Group.\nLook, multiple lines!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "Bar Group": { + "type": "group", + "description": "This is a test benchmark for Bar Group.", + "notes": null, + "unstable": "This is an unstable note for Bar Group.", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "Memory Bandwidth 1": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 1.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 1", + "explicit_group": "" + }, + "Memory Bandwidth 2": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 2.", + "notes": null, + 
"unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 2", + "explicit_group": "" + }, + "Memory Bandwidth 3": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 3.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 3", + "explicit_group": "" + }, + "Memory Bandwidth 4": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 4.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 4", + "explicit_group": "" + }, + "Memory Bandwidth 5": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 5.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 5", + "explicit_group": "" + }, + "Memory Bandwidth 6": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 6.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 6", + "explicit_group": "" + }, + "Latency 1": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 1.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 1", + "explicit_group": "" + }, + "Latency 2": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 2.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 2", + "explicit_group": "" + }, + "Latency 3": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 3.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 3", + "explicit_group": "" + }, + "Latency 4": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 4.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 4", + "explicit_group": "" + }, + "Latency 5": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 5.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 5", + "explicit_group": "" + }, + "Latency 6": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 6.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 6", + "explicit_group": "" + }, + "Throughput 1": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 1.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 1", + "explicit_group": "" + }, + "Throughput 2": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 2.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 2", + "explicit_group": "" + }, + "Throughput 3": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 3.", + "notes": null, + "unstable": null, + "tags": 
[], + "range_min": null, + "range_max": null, + "display_name": "Throughput 3", + "explicit_group": "" + }, + "Throughput 4": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 4.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 4", + "explicit_group": "" + }, + "Throughput 5": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 5.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 5", + "explicit_group": "" + }, + "Throughput 6": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 6.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 6", + "explicit_group": "" + }, + "FLOPS 1": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 1.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 1", + "explicit_group": "" + }, + "FLOPS 2": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 2.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 2", + "explicit_group": "" + }, + "FLOPS 3": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 3.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 3", + "explicit_group": "" + }, + "FLOPS 4": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 4.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 4", + "explicit_group": "" + }, + "FLOPS 5": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 5.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 5", + "explicit_group": "" + }, + "FLOPS 6": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 6.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 6", + "explicit_group": "" + }, + "Cache Miss Rate 1": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 1.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 1", + "explicit_group": "" + }, + "Cache Miss Rate 2": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 2.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 2", + "explicit_group": "" + }, + "Cache Miss Rate 3": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 3.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 3", + "explicit_group": "" + }, + "Cache Miss Rate 4": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 4.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": 
[], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 4", + "explicit_group": "" + }, + "Cache Miss Rate 5": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 5.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 5", + "explicit_group": "" + }, + "Cache Miss Rate 6": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 6.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 6", + "explicit_group": "" + } +}; -defaultCompareNames = []; +benchmarkTags = { + "SYCL": { + "name": "SYCL", + "description": "Benchmark uses SYCL runtime" + }, + "UR": { + "name": "UR", + "description": "Benchmark uses Unified Runtime API" + }, + "L0": { + "name": "L0", + "description": "Benchmark uses Level Zero API directly" + }, + "UMF": { + "name": "UMF", + "description": "Benchmark uses Unified Memory Framework directly" + }, + "micro": { + "name": "micro", + "description": "Microbenchmark focusing on a specific functionality" + }, + "application": { + "name": "application", + "description": "Real application-based performance test" + }, + "proxy": { + "name": "proxy", + "description": "Benchmark that simulates real application use-cases" + }, + "submit": { + "name": "submit", + "description": "Tests kernel submission performance" + }, + "math": { + "name": "math", + "description": "Tests math computation performance" + }, + "memory": { + "name": "memory", + "description": "Tests memory transfer or bandwidth performance" + }, + "allocation": { + "name": "allocation", + "description": "Tests memory allocation performance" + }, + "graph": { + "name": "graph", + "description": "Tests graph-based execution performance" + }, + "latency": { + "name": "latency", + "description": "Measures operation latency" + }, + "throughput": { + "name": "throughput", + "description": "Measures operation throughput" + }, + "inference": { + "name": "inference", + "description": "Tests ML/AI inference performance" + }, + "image": { + "name": "image", + "description": "Image processing benchmark" + }, + "simulation": { + "name": "simulation", + "description": "Physics or scientific simulation benchmark" + } +}; +defaultCompareNames = [ + "This PR" +]; diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index 95fc7a6b28736..87dbea74c0ca8 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -17,11 +17,12 @@ from output_markdown import generate_markdown from output_html import generate_html from history import BenchmarkHistory -from utils.utils import prepare_workdir +from utils.utils import prepare_workdir, git_clone, run from utils.compute_runtime import * from utils.validate import Validate from utils.detect_versions import DetectVersion from presets import enabled_suites, presets +from utils.oneapi import get_oneapi import argparse import re @@ -32,16 +33,58 @@ INTERNAL_WORKDIR_VERSION = "2.0" +def download_and_build_unitrace(workdir): + repo_dir = git_clone( + workdir, + "pti-gpu-repo", + "https://github.com/intel/pti-gpu.git", + "master", + ) + build_dir = os.path.join(workdir, "unitrace-build") + unitrace_src = os.path.join(repo_dir, "tools", "unitrace") + os.makedirs(build_dir, exist_ok=True) + + unitrace_exe = os.path.join(build_dir, "unitrace") + if not 
os.path.isfile(unitrace_exe):
+        run(
+            [
+                "cmake",
+                f"-S {unitrace_src}",
+                f"-B {build_dir}",
+                "-DCMAKE_BUILD_TYPE=Release",
+                "-DCMAKE_CXX_COMPILER=clang++",
+                "-DCMAKE_C_COMPILER=clang",
+                "-DBUILD_WITH_L0=1",
+                "-DBUILD_WITH_OPENCL=0",
+                "-DBUILD_WITH_ITT=1",
+                "-DBUILD_WITH_XPTI=1",
+                "-DBUILD_WITH_MPI=0",
+            ],
+            ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"],
+            add_sycl=True,
+        )
+        run(
+            ["cmake", "--build", build_dir, "-j"],
+            ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"],
+            add_sycl=True,
+        )
+        print("Unitrace built successfully.")
+
+
 def run_iterations(
     benchmark: Benchmark,
     env_vars,
     iters: int,
     results: dict[str, list[Result]],
     failures: dict[str, str],
+    with_unitrace: bool = False,
 ):
     for iter in range(iters):
-        print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
-        bench_results = benchmark.run(env_vars)
+        if with_unitrace:
+            print(f"running {benchmark.name()} with Unitrace", flush=True)
+        else:
+            print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
+        bench_results = benchmark.run(env_vars, with_unitrace=with_unitrace)
         if bench_results is None:
             failures[benchmark.name()] = "benchmark produced no results!"
             break
@@ -158,6 +201,14 @@ def collect_metadata(suites):
 def main(directory, additional_env_vars, save_name, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)
 
+    if options.unitrace or options.unitrace_inclusive:
+        print("Downloading and building Unitrace...")
+        download_and_build_unitrace(options.workdir)
+        if options.results_directory_override is None:
+            options.unitrace_res_dir = os.path.join(directory, "results")
+        else:
+            options.unitrace_res_dir = options.results_directory_override
+
     if options.build_compute_runtime:
         print(f"Setting up Compute Runtime {options.compute_runtime_tag}")
         cr = get_compute_runtime()
@@ -234,19 +285,30 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
                 merged_env_vars = {**additional_env_vars}
                 intermediate_results: dict[str, list[Result]] = {}
                 processed: list[Result] = []
-                for _ in range(options.iterations_stddev):
+                if not options.unitrace:
+                    for _ in range(options.iterations_stddev):
+                        run_iterations(
+                            benchmark,
+                            merged_env_vars,
+                            options.iterations,
+                            intermediate_results,
+                            failures,
+                        )
+                        valid, processed = process_results(
+                            intermediate_results, benchmark.stddev_threshold()
+                        )
+                        if valid:
+                            break
+                if options.unitrace_inclusive or options.unitrace:
+                    # Run the benchmark once under Unitrace
                     run_iterations(
                         benchmark,
                         merged_env_vars,
-                        options.iterations,
+                        1,
                         intermediate_results,
                         failures,
+                        with_unitrace=True,
                     )
-                    valid, processed = process_results(
-                        intermediate_results, benchmark.stddev_threshold()
-                    )
-                    if valid:
-                        break
                 results += processed
             except Exception as e:
                 if options.exit_on_failure:
@@ -500,6 +562,17 @@ def validate_and_parse_env_args(env_args):
         help="HIP device architecture",
         default=None,
     )
+    parser.add_argument(
+        "--unitrace",
+        action="store_true",
+        help="Trace a single iteration of each benchmark with Unitrace",
+    )
+
+    parser.add_argument(
+        "--unitrace-inclusive",
+        action="store_true",
+        help="Run the regular benchmark iterations, plus one additional run traced with Unitrace",
+    )
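For context, a hedged sketch of how the two new flags are meant to be driven from the command line; the positional workdir argument and the invocation path are assumptions about the surrounding CLI, not taken from this patch:

    # Trace one iteration of each benchmark, skipping the regular timed runs:
    $ ./devops/scripts/benchmarks/main.py <workdir> --unitrace

    # Keep the regular timed iterations, then add one traced run per benchmark:
    $ ./devops/scripts/benchmarks/main.py <workdir> --unitrace-inclusive

     # Options intended for CI:
     parser.add_argument(
@@ -589,6 +662,8 @@ def validate_and_parse_env_args(env_args):
     options.results_directory_override = args.results_dir
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
+    options.unitrace = args.unitrace
+    options.unitrace_inclusive = 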
args.unitrace_inclusive if args.build_igc and args.compute_runtime is None: parser.error("--build-igc requires --compute-runtime to be set") @@ -599,6 +674,10 @@ def validate_and_parse_env_args(env_args): if not os.path.isdir(args.output_dir): parser.error("Specified --output-dir is not a valid path") options.output_directory = os.path.abspath(args.output_dir) + if args.unitrace_inclusive and args.unitrace: + parser.error( + "--unitrace-inclusive and --unitrace are mutually exclusive, please specify only one of them" + ) # Options intended for CI: options.timestamp_override = args.timestamp_override diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index a7b65e752d450..73170e527a6d2 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -41,6 +41,7 @@ class DetectVersionsOptions: # Max amount of api calls permitted on each run of the benchmark scripts max_api_calls = 4 + @dataclass class Options: workdir: str = None @@ -70,6 +71,9 @@ class Options: current_run_name: str = "This PR" preset: str = "Full" build_jobs: int = multiprocessing.cpu_count() + unitrace: bool = False + unitrace_inclusive: bool = False + unitrace_res_dir: str = None # Options intended for CI: regression_threshold: float = 0.05 diff --git a/devops/scripts/benchmarks/utils/utils.py b/devops/scripts/benchmarks/utils/utils.py index ef2a1222a7183..3780494e00584 100644 --- a/devops/scripts/benchmarks/utils/utils.py +++ b/devops/scripts/benchmarks/utils/utils.py @@ -56,7 +56,9 @@ def run( command_str = " ".join(command) env_str = " ".join(f"{key}={value}" for key, value in env_vars.items()) full_command_str = f"{env_str} {command_str}".strip() - print(f"Running: {full_command_str}") + print( + f"Running: {full_command_str}\nLD_LIBRARY_PATH: {env.get('LD_LIBRARY_PATH', '')}" + ) result = subprocess.run( command, From d653883081704633112979933c123dd116a7a3b8 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Wed, 25 Jun 2025 15:48:37 +0000 Subject: [PATCH 2/6] Add Unitrace support Signed-off-by: Mateusz P. 
Nowak --- devops/scripts/benchmarks/benches/base.py | 105 +- devops/scripts/benchmarks/benches/benchdnn.py | 7 +- devops/scripts/benchmarks/benches/compute.py | 6 +- devops/scripts/benchmarks/benches/gromacs.py | 3 +- devops/scripts/benchmarks/benches/llamacpp.py | 7 +- .../scripts/benchmarks/benches/syclbench.py | 4 +- devops/scripts/benchmarks/benches/test.py | 2 +- devops/scripts/benchmarks/benches/umf.py | 8 +- devops/scripts/benchmarks/benches/velocity.py | 9 +- devops/scripts/benchmarks/history.py | 14 +- devops/scripts/benchmarks/html/data.js | 5106 ++++++++++++++++- devops/scripts/benchmarks/main.py | 111 +- devops/scripts/benchmarks/options.py | 4 + 13 files changed, 5345 insertions(+), 41 deletions(-) diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py index efbf7d77e003d..215dc57f67f28 100644 --- a/devops/scripts/benchmarks/benches/base.py +++ b/devops/scripts/benchmarks/benches/base.py @@ -12,6 +12,7 @@ from options import options from utils.utils import download, run from abc import ABC, abstractmethod +import glob benchmark_tags = [ BenchmarkTag("SYCL", "Benchmark uses SYCL runtime"), @@ -70,7 +71,9 @@ def teardown(self): pass @abstractmethod - def run(self, env_vars) -> list[Result]: + def run( + self, env_vars, unitrace_timestamp: str = None + ) -> list[Result]: pass @staticmethod @@ -86,7 +89,14 @@ def get_adapter_full_path(): ), f"could not find adapter file {adapter_path} (and in similar lib paths)" def run_bench( - self, command, env_vars, ld_library=[], add_sycl=True, use_stdout=True + self, + command, + env_vars, + ld_library=[], + add_sycl=True, + use_stdout=True, + unitrace_timestamp: str = None, + extra_unitrace_opt=[], ): env_vars = env_vars.copy() if options.ur is not None: @@ -99,6 +109,33 @@ def run_bench( ld_libraries = options.extra_ld_libraries.copy() ld_libraries.extend(ld_library) + if unitrace_timestamp is not None: + unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace") + if not os.path.exists(unitrace_bin): + raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}. ") + if not os.path.exists(options.unitrace_res_dir): + os.makedirs(options.unitrace_res_dir) + bench_dir = f"{options.unitrace_res_dir}/{self.name()}" + os.makedirs(bench_dir, exist_ok=True) + + unitrace_output = f"{bench_dir}/{self.name()}_{unitrace_timestamp}" + command = ( + [ + str(unitrace_bin), + "--call-logging", + "--host-timing", + "--chrome-sycl-logging", + "--chrome-call-logging", + "--chrome-kernel-logging", + "--output", + unitrace_output, + ] + + extra_unitrace_opt + + command + ) + if options.verbose: + print(f"Unitrace cmd: {' '.join(command)}") + result = run( command=command, env_vars=env_vars, @@ -107,6 +144,9 @@ def run_bench( ld_library=ld_libraries, ) + if unitrace_timestamp is not None: + handle_unitrace_output(bench_dir, unitrace_output, unitrace_timestamp) + if use_stdout: return result.stdout.decode() else: @@ -193,3 +233,64 @@ def setup(self): def additional_metadata(self) -> dict[str, BenchmarkMetadata]: return {} + + +def handle_unitrace_output(bench_dir, unitrace_output, timestamp): + FILECNT = 20 # Set your desired max file count + + # 1. 
Handle unitrace_output.{pid} logs: rename to unitrace_output (remove pid)
+    for f in os.listdir(bench_dir):
+        if f.startswith(os.path.basename(unitrace_output) + "."):
+            parts = f.rsplit(".", 1)
+            if (
+                len(parts) == 2
+                and parts[1].isdigit()
+                and os.path.isfile(os.path.join(bench_dir, f))
+            ):
+                src = os.path.join(bench_dir, f)
+                dst = os.path.join(bench_dir, os.path.basename(unitrace_output))
+                shutil.move(src, dst)
+                break
+
+    # 2. Handle {name}.{pid}.json files: move and rename to {bench_name}_{timestamp}.json
+    pid_json_files = []
+    for f in os.listdir(options.benchmark_cwd):
+        parts = f.split(".")
+        if len(parts) >= 3 and parts[-1] == "json" and parts[-2].isdigit():
+            pid_json_files.append(f)
+
+    if len(pid_json_files) == 1:
+        # Extract benchmark name from bench_dir path
+        bench_name = os.path.basename(bench_dir)
+        dst = f"{bench_dir}/{bench_name}_{timestamp}.json"
+        shutil.move(os.path.join(options.benchmark_cwd, pid_json_files[0]), dst)
+    elif len(pid_json_files) > 1:
+        print(
+            f"Warning: Found {len(pid_json_files)} files matching the pattern. Expected 1."
+        )
+
+    # Count files in the dir and remove oldest if more than FILECNT
+    def extract_timestamp_from_name(filename):
+        # Example: onednn-sum-padding-2-graph_20250701_114551
+        base = os.path.basename(filename)
+        parts = base.rsplit("_", 1)
+        if len(parts) == 2:
+            ts = parts[1]
+            # Remove extension if present (for .json files)
+            ts = ts.split(".", 1)[0]
+            return ts
+        return ""
+
+    files = glob.glob(f"{bench_dir}/*")
+    files_with_ts = []
+    for f in files:
+        ts = extract_timestamp_from_name(f)
+        files_with_ts.append((f, ts))
+    # Sort by timestamp string (lexicographically, which works for YYYYMMDD_HHMMSS)
+    files_with_ts.sort(key=lambda x: x[1])
+    sorted_files = [f for f, ts in files_with_ts if ts]
+
+    if len(sorted_files) > FILECNT:
+        for f in sorted_files[: len(sorted_files) - FILECNT]:
+            os.remove(f)
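To make the wrapping concrete: with tracing enabled, run_bench prepends the Unitrace binary and its logging flags to the benchmark command, so the launched process looks roughly like the sketch below. The paths, placeholder names, and timestamp are illustrative; --chrome-dnn-logging appears only for suites that pass it through extra_unitrace_opt, as benchdnn does further down:

    $ <workdir>/unitrace-build/unitrace --call-logging --host-timing \
        --chrome-sycl-logging --chrome-call-logging --chrome-kernel-logging \
        --output <unitrace_res_dir>/<bench>/<bench>_<timestamp> \
        [--chrome-dnn-logging] <original benchmark command ...>

diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py
index 4698d7ed965de..cd6ccd790e4e4 100644
--- a/devops/scripts/benchmarks/benches/benchdnn.py
+++ b/devops/scripts/benchmarks/benches/benchdnn.py
@@ -73,6 +73,8 @@ def setup(self):
             f"-B {self.build_dir}",
             f"-DCMAKE_PREFIX_PATH={options.sycl}",
             "-DCMAKE_BUILD_TYPE=Release",
+            "-DCMAKE_CXX_COMPILER=clang++",
+            "-DCMAKE_C_COMPILER=clang",
             "-DDNNL_BUILD_TESTS=ON",
             "-DDNNL_BUILD_EXAMPLES=OFF",
             "-DDNNL_CPU_RUNTIME=NONE",  # Disable SYCL support
@@ -87,6 +89,7 @@ def setup(self):
             f"cmake --build {self.build_dir} --target benchdnn -j {options.build_jobs}",
             add_sycl=True,
             ld_library=[str(self.build_dir) + "/src"] + self.oneapi.ld_libraries(),
+            timeout=60 * 30,
         )

     def teardown(self):
@@ -126,7 +129,7 @@ def setup(self):
         if not self.bench_bin.exists():
             raise FileNotFoundError(f"Benchmark binary not found: {self.bench_bin}")

-    def run(self, env_vars):
+    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
         command = [
             str(self.bench_bin),
             *self.bench_args.split(),
@@ -145,6 +148,8 @@ def run(self, env_vars):
             add_sycl=True,
             ld_library=ld_library,
             use_stdout=True,
+            unitrace_timestamp=unitrace_timestamp,
+            extra_unitrace_opt=["--chrome-dnn-logging"],
         )

         result_value = self._extract_time(output)
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 25631c288f951..26931e553a2d7 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -280,7 +280,7 @@ def explicit_group(self):
    def 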
description(self) -> str: return "" - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: command = [ f"{self.benchmark_bin}", f"--test={self.test}", @@ -291,7 +291,9 @@ def run(self, env_vars) -> list[Result]: command += self.bin_args() env_vars.update(self.extra_env_vars()) - result = self.run_bench(command, env_vars) + result = self.run_bench( + command, env_vars, unitrace_timestamp=unitrace_timestamp + ) parsed_results = self.parse_output(result) ret = [] for label, median, stddev, unit in parsed_results: diff --git a/devops/scripts/benchmarks/benches/gromacs.py b/devops/scripts/benchmarks/benches/gromacs.py index 4d95c538df09c..eb75710fa52f0 100644 --- a/devops/scripts/benchmarks/benches/gromacs.py +++ b/devops/scripts/benchmarks/benches/gromacs.py @@ -161,7 +161,7 @@ def setup(self): ld_library=self.suite.oneapi.ld_libraries(), ) - def run(self, env_vars): + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: model_dir = self.grappa_dir / self.model env_vars.update({"SYCL_CACHE_PERSISTENT": "1"}) @@ -200,6 +200,7 @@ def run(self, env_vars): add_sycl=True, use_stdout=False, ld_library=self.suite.oneapi.ld_libraries(), + unitrace_timestamp=unitrace_timestamp, ) if not self._validate_correctness(options.benchmark_cwd + "/md.log"): diff --git a/devops/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py index d8eba11916a31..06aea3d755364 100644 --- a/devops/scripts/benchmarks/benches/llamacpp.py +++ b/devops/scripts/benchmarks/benches/llamacpp.py @@ -115,7 +115,7 @@ def get_tags(self): def lower_is_better(self): return False - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: command = [ f"{self.benchmark_bin}", "--output", @@ -141,7 +141,10 @@ def run(self, env_vars) -> list[Result]: ] result = self.run_bench( - command, env_vars, ld_library=self.bench.oneapi.ld_libraries() + command, + env_vars, + ld_library=self.bench.oneapi.ld_libraries(), + unitrace_timestamp=unitrace_timestamp, ) parsed = self.parse_output(result) results = [] diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py index be36c4cd7ba9a..98a7ee3054934 100644 --- a/devops/scripts/benchmarks/benches/syclbench.py +++ b/devops/scripts/benchmarks/benches/syclbench.py @@ -137,7 +137,7 @@ def setup(self): self.directory, "sycl-bench-build", self.bench_name ) - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: self.outputfile = os.path.join(self.bench.directory, self.test + ".csv") command = [ @@ -151,7 +151,7 @@ def run(self, env_vars) -> list[Result]: env_vars.update(self.extra_env_vars()) # no output to stdout, all in outputfile - self.run_bench(command, env_vars) + self.run_bench(command, env_vars, unitrace_timestamp=unitrace_timestamp) with open(self.outputfile, "r") as f: reader = csv.reader(f) diff --git a/devops/scripts/benchmarks/benches/test.py b/devops/scripts/benchmarks/benches/test.py index 94fd76041a26f..61ed6ebc29294 100644 --- a/devops/scripts/benchmarks/benches/test.py +++ b/devops/scripts/benchmarks/benches/test.py @@ -88,7 +88,7 @@ def notes(self) -> str: def unstable(self) -> str: return self.unstable_text - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: random_value = self.value + random.uniform(-1 * (self.diff), self.diff) return [ 
            Result(
diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py
index 6af745e7de3cb..0491f36944ba1 100644
--- a/devops/scripts/benchmarks/benches/umf.py
+++ b/devops/scripts/benchmarks/benches/umf.py
@@ -138,7 +138,7 @@ def get_names_of_benchmarks_to_be_run(self, command, env_vars):

         return all_names

-    def run(self, env_vars) -> list[Result]:
+    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
         command = [f"{self.benchmark_bin}"]

         all_names = self.get_names_of_benchmarks_to_be_run(command, env_vars)
@@ -152,7 +152,11 @@ def run(self, env_vars) -> list[Result]:
             specific_benchmark = command + ["--benchmark_filter=^" + name + "$"]

             result = self.run_bench(
-                specific_benchmark, env_vars, add_sycl=False, ld_library=[self.umf_lib]
+                specific_benchmark,
+                env_vars,
+                add_sycl=False,
+                ld_library=[self.umf_lib],
+                unitrace_timestamp=unitrace_timestamp,
             )

             parsed = self.parse_output(result)
diff --git a/devops/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py
index 17e0d22331df2..971229a1f0e51 100644
--- a/devops/scripts/benchmarks/benches/velocity.py
+++ b/devops/scripts/benchmarks/benches/velocity.py
@@ -130,7 +130,7 @@ def description(self) -> str:
     def get_tags(self):
         return ["SYCL", "application"]

-    def run(self, env_vars) -> list[Result]:
+    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
         env_vars.update(self.extra_env_vars())

         command = [
@@ -138,7 +138,12 @@ def run(self, env_vars) -> list[Result]:
         ]
         command += self.bin_args()

-        result = self.run_bench(command, env_vars, ld_library=self.ld_libraries())
+        result = self.run_bench(
+            command,
+            env_vars,
+            ld_library=self.ld_libraries(),
+            unitrace_timestamp=unitrace_timestamp,
+        )

         return [
             Result(
diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py
index 0295514d3cc24..8905bd5a35bb5 100644
--- a/devops/scripts/benchmarks/history.py
+++ b/devops/scripts/benchmarks/history.py
@@ -149,7 +149,7 @@ def git_info_from_path(path: Path) -> (str, str):
             compute_runtime=compute_runtime,
         )

-    def save(self, save_name, results: list[Result], to_file=True):
+    def save(self, save_name, timestamp, results: list[Result], to_file=True):
         benchmark_data = self.create_run(save_name, results)
         self.runs.append(benchmark_data)

@@ -160,12 +160,12 @@ def save(self, save_name, results: list[Result], to_file=True):
         results_dir = Path(os.path.join(self.dir, "results"))
         os.makedirs(results_dir, exist_ok=True)

-        # Use formatted timestamp for the filename
-        timestamp = (
-            datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
-            if options.timestamp_override is None
-            else options.timestamp_override
-        )
         file_path = Path(os.path.join(results_dir, f"{save_name}_{timestamp}.json"))
         with file_path.open("w") as file:
             json.dump(serialized, file, indent=4)
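One consequence of threading a single timestamp through both save() and the Unitrace output paths is that the saved results file and the per-benchmark trace files can share one run identifier. An illustrative layout after a traced run (the save name, benchmark name, and timestamp below are invented for the example):

    $ ls results/
    baseline_20250701_114551.json           # results written by BenchmarkHistory.save()
    onednn-sum-f16-1-eager/
    $ ls results/onednn-sum-f16-1-eager/
    onednn-sum-f16-1-eager_20250701_114551        # Unitrace host log
    onednn-sum-f16-1-eager_20250701_114551.json   # Chrome trace, renamed from <name>.<pid>.json

diff --git a/devops/scripts/benchmarks/html/data.js b/devops/scripts/benchmarks/html/data.js
index 2f1862fe621b7..f4ca859a904fd 100644
--- a/devops/scripts/benchmarks/html/data.js
+++ b/devops/scripts/benchmarks/html/data.js
@@ -1,11 +1,5101 @@
-// This file serves as a placeholder for loading data locally: If
-// `remoteDataUrl` (etc.) is not defined in config.js, the dashboard will
-// attempt to load data from variables defined here instead.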
-// -// These variables are empty by default, and are populated by main.py if -// `--output-html local` is specified. +benchmarkRuns = [ + { + "results": [ + { + "label": "onednn-sum-f16-1-eager", + "value": 0.00944, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=f16:f16:f16", + "--stag=abx:abx:abx", + "--scales=1.25:3:0.5", + "16x2x6x4x3" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16 --scales=1.25:3:0.5 16x2x6x4x3,0,2.52173,0.00944,0,0.0128609,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00944 avg(ms):0.0128609\ntotal: 0.27s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f16-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f16-2-eager", + "value": 0.60928, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--ddt=f16", + "--sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16", + "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", + "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", + "--scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2", + "16x32x48x5" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,48.9631,0.06448,0,0.0676806,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,30.5063,0.05808,0,0.0612839,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.4368,0.05888,0,0.0620269,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.4478,0.06368,0,0.0671496,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu 
--sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.657,0.05984,0,0.0630586,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,45.9631,0.06448,0,0.0679256,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,41.9988,0.06208,0,0.0652478,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.5825,0.05808,0,0.061508,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,51.822,0.06288,0,0.0659863,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,50.4551,0.0568,0,0.0609149,0\ntests:10 passed:10 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.60928 avg(ms):0.642782\ntotal: 2.47s; create_pd: 0.01s (0%); create_prim: 0.45s (18%); fill: 0.08s (3%); execute: 0.02s (1%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0021996363335788104, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f16-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f32-1-eager", + "value": 0.0088, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=bf16:bf16:bf16", + "--stag=abx:abx:abx", + "--scales=0.5:2:0.5", + "16x2x6x4x3" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16 --scales=0.5:2:0.5 16x2x6x4x3,0,2.42236,0.0088,0,0.0129955,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.0088 avg(ms):0.0129955\ntotal: 0.28s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.00017486502731471965, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": 
"onednn-sum-f32-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-f32-2-eager", + "value": 0.6441600000000001, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--inplace=true,false", + "--ddt=bf16", + "--sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16", + "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", + "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", + "--scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15", + "16x32x48x5" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,47.5034,0.0648,0,0.0682694,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,0.0119629,0.06512,0,0.068156,0\n2:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,29.7659,0.0568,0,0.060283,0\n4:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu 
--sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.333,0.0568,0,0.0603173,0\n6:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.2969,0.05808,0,0.0609991,0\n8:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.946,0.05248,0,0.055674,0\n10:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 
16x32x48x5,0,44.5508,0.05904,0,0.0620182,0\n12:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,41.147,0.06112,0,0.0643661,0\n14:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.8096,0.05728,0,0.0602824,0\n16:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,50.3113,0.05664,0,0.0600053,0\n18:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 
--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,49.8357,0.056,0,0.0593853,0\ntests:20 passed:11 skipped:9 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.64416 avg(ms):0.679756\ntotal: 2.68s; create_pd: 0.01s (0%); create_prim: 0.44s (16%); fill: 0.08s (3%); execute: 0.02s (1%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.004735567547823622, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-f32-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-1-eager", + "value": 0.3904, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--ddt=f32", + "--sdt=f32:f32", + "--stag=aBcd16b", + "--dtag=aBcd16b", + "1x8x64x64", + "1x8x640x1024", + "1x24x640x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x64x64,0,1.58545,0.00192,0,0.00269551,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.890869,0.08528,0,0.0932233,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.29517,0.3032,0,0.32437,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.3904 avg(ms):0.420289\ntotal: 0.91s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.15s (16%); execute: 0.02s (2%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.001602664448140469, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-1-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-1-graph", + "value": 0.39216, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--ddt=f32", + "--sdt=f32:f32", + "--stag=aBcd16b", + "--dtag=aBcd16b", + "1x8x64x64", + "1x8x640x1024", + "1x24x640x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: 
perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x64x64,0,1.43994,0.00192,0,0.00268973,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.874268,0.08656,0,0.094599,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.27124,0.30368,0,0.325998,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.39216 avg(ms):0.423287\ntotal: 0.89s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.13s (15%); execute: 0.02s (2%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0009097985124923661, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-1-graph", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-2-eager", + "value": 0.00336, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--sdt=bf16:bf16", + "--ddt=bf16", + "--stag=AB48a16b:AB48a16b", + "--dtag=AB48a16b", + "512x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.21216,0.00336,0,0.00399908,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00336 avg(ms):0.00399908\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (6%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 8.262364471909155e-05, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-sum-padding-2-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-sum-padding-2-graph", + "value": 0.00352, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--sum", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--sdt=bf16:bf16", + "--ddt=bf16", + "--stag=AB48a16b:AB48a16b", + "--dtag=AB48a16b", + "512x1024" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.24072,0.00352,0,0.00398547,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00352 avg(ms):0.00398547\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (7%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": 
"onednn-sum-padding-2-graph", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f16-eager", + "value": 0.33968, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--dt=f16", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f16 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.33968,0.342391\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.33968 avg(ms):0.342391\ntotal: 0.54s; create_pd: 0.00s (0%); create_prim: 0.07s (13%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.00855442631178792, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f16-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f32-eager", + "value": 0.38512, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=direct", + "--reset", + "--dt=f32", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.38512,0.388208\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.38512 avg(ms):0.388208\ntotal: 0.60s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.0066990148529466635, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f32-eager", + "lower_is_better": true, + "suite": "BenchDNN" + }, + { + "label": "onednn-graph-sdpa-plain-f32-graph", + "value": 0.37952, + "command": [ + "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", + "--graph", + "--mode=P", + "--engine=gpu", + "--max-ms-per-prb=100", + "--execution-mode=graph", + "--reset", + "--dt=f32", + "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" + ], + "env": { + "ONEAPI_DEVICE_SELECTOR": "level_zero:*" + }, + "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --execution-mode=graph --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.37952,0.382662\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.37952 avg(ms):0.382662\ntotal: 0.58s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", + "passed": true, + "unit": "ms", + "stddev": 0.011297102873450952, + "git_url": "https://github.com/uxlfoundation/oneDNN.git", + "git_hash": "v3.8", + "name": "onednn-graph-sdpa-plain-f32-graph", + "lower_is_better": true, + "suite": "BenchDNN" + } + ], + "name": 
"This PR", + "hostname": "gkdse-pre-dnp-02", + "git_hash": "1eb1026ad0ef", + "github_repo": "mateuszpn/llvm", + "date": "2025-06-27T09:56:15.698275+00:00", + "compute_runtime": "unknown" + } +]; -benchmarkRuns = []; +benchmarkMetadata = { + "SubmitKernel": { + "type": "group", + "description": "Measures CPU time overhead of submitting kernels through different APIs.", + "notes": "Each layer builds on top of the previous layer, adding functionality and overhead.\nThe first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\nThe UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\nWork is ongoing to reduce the overhead of the SYCL API\n", + "unstable": null, + "tags": [ + "submit", + "micro", + "SYCL", + "UR", + "L0" + ], + "range_min": 0.0, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "SinKernelGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "submit", + "memory", + "proxy", + "SYCL", + "UR", + "L0", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "SubmitGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": null, + "tags": [ + "submit", + "micro", + "SYCL", + "UR", + "L0", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "FinalizeGraph": { + "type": "group", + "description": null, + "notes": null, + "unstable": null, + "tags": [ + "finalize", + "micro", + "SYCL", + "graph" + ], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_syclpreview SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 100 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_syclpreview EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_syclpreview KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 10 trivial kernels using graphs.
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_syclpreview SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCLPREVIEW performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCLPREVIEW SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_sycl SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through SYCL API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "SYCL SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 5 sin kernels using graphs.
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_sycl SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "SYCL", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_sycl EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_sycl KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs.
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs.
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_sycl SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures SYCL performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_l0 SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_l0 SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "L0", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures L0 performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "L0", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures L0 performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "L0", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures L0 performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "L0", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures L0 performance when executing 100 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "L0", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_l0 EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "L0", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_l0 KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "L0", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 32 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures L0 performance when executing 32 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "L0", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "api_overhead_benchmark_ur SubmitKernel out of order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order, NumKernels 10", + "explicit_group": "SubmitKernel out of order" + }, + "api_overhead_benchmark_ur SubmitKernel out of order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events" + }, + "api_overhead_benchmark_ur SubmitKernel out of order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order, NumKernels 10", + "explicit_group": "SubmitKernel in order" + }, + "api_overhead_benchmark_ur SubmitKernel in order not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order using events, NumKernels 10", + "explicit_group": "SubmitKernel in order using events" + }, + "api_overhead_benchmark_ur SubmitKernel in order CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time.
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 1 microsecond.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events, CPU count" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" + }, + "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { + "type": "benchmark", + "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.", + "notes": null, + "unstable": null, + "tags": [ + "submit", + "latency", + "UR", + "micro" + ], + "range_min": 0.0, + "range_max": null, + "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", + "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" + }, + "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:5": { + "type": "benchmark", + "description": "Measures UR performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "UR", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SinKernelGraph, graphs 0, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:100": { + "type": "benchmark", + "description": "Measures UR performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "UR", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SinKernelGraph, graphs 0, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:5": { + "type": "benchmark", + "description": "Measures UR performance when executing 5 sin kernels using graphs.
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "UR", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SinKernelGraph, graphs 1, numKernels 5", + "explicit_group": "SinKernelGraph, numKernels: 5" + }, + "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:100": { + "type": "benchmark", + "description": "Measures UR performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", + "tags": [ + "graph", + "UR", + "proxy", + "submit", + "memory", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SinKernelGraph, graphs 1, numKernels 100", + "explicit_group": "SinKernelGraph, numKernels: 100" + }, + "ulls_benchmark_ur EmptyKernel wgc:1000, wgs:256": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "UR EmptyKernel, wgc 1000, wgs 256", + "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" + }, + "ulls_benchmark_ur KernelSwitch count 8 kernelTime 200": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "submit" + ], + "range_min": null, + "range_max": null, + "display_name": "UR KernelSwitch, count 8, kernelTime 200", + "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 4" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 10" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { + "type": "benchmark", + "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { + "type": "benchmark", + "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "UR", + "micro", + "submit", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", + "explicit_group": "SubmitGraph, numKernels: 32" + }, + "memory_benchmark_sycl QueueInOrderMemcpy from Device to Device, size 1024": { + "type": "benchmark", + "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Device to Device with 1024 bytes, executed 100 times per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL QueueInOrderMemcpy from Device to Device, size 1024", + "explicit_group": "" + }, + "memory_benchmark_sycl QueueInOrderMemcpy from Host to Device, size 1024": { + "type": "benchmark", + "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Host to Device with 1024 bytes, executed 100 times per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL QueueInOrderMemcpy from Host to Device, size 1024", + "explicit_group": "" + }, + "memory_benchmark_sycl QueueMemcpy from Device to Device, size 1024": { + "type": "benchmark", + "description": "Measures general SYCL queue memory copy performance from Device to Device with 1024 bytes per operation.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL QueueMemcpy from Device to Device, size 1024", + "explicit_group": "" + }, + "memory_benchmark_sycl StreamMemory, placement Device, type Triad, size 10240": { + "type": 
"benchmark", + "description": "Measures Device memory bandwidth using Triad pattern with 10240 bytes. Higher values (GB/s) indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "throughput", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL StreamMemory, placement Device, type Triad, size 10240", + "explicit_group": "" + }, + "api_overhead_benchmark_sycl ExecImmediateCopyQueue out of order from Device to Device, size 1024": { + "type": "benchmark", + "description": "Measures SYCL out-of-order queue overhead for copy-only from Device to Device memory with 1024 bytes. Tests immediate execution overheads.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL ExecImmediateCopyQueue out of order from Device to Device, size 1024", + "explicit_group": "" + }, + "api_overhead_benchmark_sycl ExecImmediateCopyQueue in order from Device to Host, size 1024": { + "type": "benchmark", + "description": "Measures SYCL in-order queue overhead for copy-only from Device to Host memory with 1024 bytes. Tests immediate execution overheads.", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "submit", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL ExecImmediateCopyQueue in order from Device to Host, size 1024", + "explicit_group": "" + }, + "miscellaneous_benchmark_sycl VectorSum": { + "type": "benchmark", + "description": "Measures performance of vector addition across 3D grid (512x256x256 elements) using SYCL.", + "notes": null, + "unstable": null, + "tags": [ + "math", + "throughput", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL VectorSum", + "explicit_group": "" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Gromacs": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing the same modifiable graph repeatedly over multiple iterations.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Gromacs", + "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Gromacs": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing a unique modifiable graph per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Gromacs", + "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Llama": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. 
It measures finalizing the same modifiable graph repeatedly over multiple iterations.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Llama", + "explicit_group": "FinalizeGraph, GraphStructure: Llama" + }, + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Llama": { + "type": "benchmark", + "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. It measures finalizing a unique modifiable graph per iteration.", + "notes": null, + "unstable": null, + "tags": [ + "graph", + "SYCL", + "micro", + "finalize", + "latency" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Llama", + "explicit_group": "FinalizeGraph, GraphStructure: Llama" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:1 dstUSM:1": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 thread each performing 400 operations on 102400 bytes from device to device memory with events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 1, dstUSM 1", + "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:0 dstUSM:1": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 thread each performing 400 operations on 102400 bytes from host to device memory with events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 0, dstUSM 1", + "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events without copy offload": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events without driver copy offload without barrier.
", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events without copy offload", + "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "UR", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:256 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory and freeing it immediately. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 256, measureMode Both", + "explicit_group": "UsmMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:262144 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 262144 bytes of usm Device memory and freeing it immediately. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 262144, measureMode Both", + "explicit_group": "UsmMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:256 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory 128 times, then freeing it all at once. Both memory allocation and memory free are timed.
", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 256, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:16384 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 16384 bytes of usm Device memory 128 times, then freeing it all at once. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 16384, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:131072 measureMode:Both": { + "type": "benchmark", + "description": "Measures memory allocation overhead by allocating 131072 bytes of usm Device memory 128 times, then freeing it all at once. Both memory allocation and memory free are timed. ", + "notes": null, + "unstable": null, + "tags": [ + "UR", + "micro", + "latency", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 131072, measureMode Both", + "explicit_group": "UsmBatchMemoryAllocation" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 thread each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 1 thread each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier.
", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { + "type": "benchmark", + "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", + "notes": null, + "unstable": null, + "tags": [ + "memory", + "latency", + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", + "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" + }, + "Velocity-Bench Hashtable": { + "type": "benchmark", + "description": "Measures hash table search performance using an efficient lock-free algorithm with linear probing. Reports throughput in millions of keys processed per second. Higher values indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Hashtable", + "explicit_group": "" + }, + "Velocity-Bench Bitcracker": { + "type": "benchmark", + "description": "Password-cracking application for BitLocker-encrypted memory units. Uses dictionary attack to find user or recovery passwords. Measures total time required to process 60000 passwords.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Bitcracker", + "explicit_group": "" + }, + "Velocity-Bench CudaSift": { + "type": "benchmark", + "description": "Implementation of the SIFT (Scale Invariant Feature Transform) algorithm for detecting, describing, and matching local features in images. Measures average processing time in milliseconds.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench CudaSift", + "explicit_group": "" + }, + "Velocity-Bench Easywave": { + "type": "benchmark", + "description": "A tsunami wave simulator used for researching tsunami generation and wave propagation. Measures the elapsed time in milliseconds to simulate a specified tsunami event based on real-world data.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "simulation" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Easywave", + "explicit_group": "" + }, + "Velocity-Bench QuickSilver": { + "type": "benchmark", + "description": "Solves a simplified dynamic Monte Carlo particle-transport problem used in HPC. Replicates memory access patterns, communication patterns, and branching of Mercury workloads. 
Reports a figure of merit in MMS/CTT where higher values indicate better performance.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "simulation", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench QuickSilver", + "explicit_group": "" + }, + "Velocity-Bench Sobel Filter": { + "type": "benchmark", + "description": "Popular RGB-to-grayscale image conversion technique that applies a gaussian filter to reduce edge artifacts. Processes a large 32K x 32K image and measures the time required to apply the filter.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "image", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench Sobel Filter", + "explicit_group": "" + }, + "Velocity-Bench dl-cifar": { + "type": "benchmark", + "description": "Deep learning image classification workload based on the CIFAR-10 dataset of 60,000 32x32 color images in 10 classes. Uses neural networks to classify input images and measures total calculation time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench dl-cifar", + "explicit_group": "" + }, + "Velocity-Bench dl-mnist": { + "type": "benchmark", + "description": "Digit recognition based on the MNIST database, one of the oldest and most popular databases of handwritten digits. Uses neural networks to identify digits and measures total calculation time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "image" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench dl-mnist", + "explicit_group": "" + }, + "Velocity-Bench svm": { + "type": "benchmark", + "description": "Implementation of Support Vector Machine, a popular classical machine learning technique. Uses supervised learning models with associated algorithms to analyze data for classification and regression analysis. 
Measures total elapsed time.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference" + ], + "range_min": null, + "range_max": null, + "display_name": "Velocity-Bench svm", + "explicit_group": "" + }, + "SYCL-Bench IndependentDAGTaskThroughput_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench IndependentDAGTaskThroughput_multi", + "explicit_group": "" + }, + "SYCL-Bench DAGTaskThroughput_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench DAGTaskThroughput_multi", + "explicit_group": "" + }, + "SYCL-Bench HostDeviceBandwidth_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench HostDeviceBandwidth_multi", + "explicit_group": "" + }, + "SYCL-Bench LocalMem_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro", + "memory" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench LocalMem_multi", + "explicit_group": "" + }, + "SYCL-Bench ScalarProduct_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench ScalarProduct_multi", + "explicit_group": "" + }, + "SYCL-Bench Pattern_SegmentedReduction_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Pattern_SegmentedReduction_multi", + "explicit_group": "" + }, + "SYCL-Bench USM_Allocation_latency_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench USM_Allocation_latency_multi", + "explicit_group": "" + }, + "SYCL-Bench VectorAddition_multi": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench VectorAddition_multi", + "explicit_group": "" + }, + "SYCL-Bench 2mm": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench 2mm", + "explicit_group": "" + }, + "SYCL-Bench 3mm": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench 3mm", + "explicit_group": "" + }, + "SYCL-Bench Atax": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Atax", + "explicit_group": "" + }, + "SYCL-Bench Bicg": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Bicg", + "explicit_group": "" + }, + 
"SYCL-Bench Kmeans": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench Kmeans", + "explicit_group": "" + }, + "SYCL-Bench LinearRegressionCoeff": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench LinearRegressionCoeff", + "explicit_group": "" + }, + "SYCL-Bench MolecularDynamics": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench MolecularDynamics", + "explicit_group": "" + }, + "SYCL-Bench sf_16": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "micro" + ], + "range_min": null, + "range_max": null, + "display_name": "SYCL-Bench sf_16", + "explicit_group": "" + }, + "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf": { + "type": "benchmark", + "description": "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. Runs both prompt processing (initial context processing) and text generation benchmarks with different batch sizes. Higher values indicate better performance. Uses the DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf quantized model and leverages SYCL with oneDNN for acceleration.", + "notes": null, + "unstable": null, + "tags": [ + "SYCL", + "application", + "inference", + "throughput" + ], + "range_min": null, + "range_max": null, + "display_name": "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf", + "explicit_group": "" + }, + "umf-benchmark": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "umf-benchmark", + "explicit_group": "" + }, + "gromacs-0006-pme-graphs": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-pme-graphs", + "explicit_group": "" + }, + "gromacs-0006-pme-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-pme-eager", + "explicit_group": "" + }, + "gromacs-0006-rf-graphs": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-rf-graphs", + "explicit_group": "" + }, + "gromacs-0006-rf-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "gromacs-0006-rf-eager", + "explicit_group": "" + }, + "onednn-sum-f16-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f16-1-eager", + "explicit_group": "sum-f16-1" + }, + "onednn-sum-f16-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f16-2-eager", + "explicit_group": "sum-f16-2" + }, + "onednn-sum-f32-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": 
null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f32-1-eager", + "explicit_group": "sum-f32-1" + }, + "onednn-sum-f32-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-f32-2-eager", + "explicit_group": "sum-f32-2" + }, + "onednn-sum-padding-1-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-1-eager", + "explicit_group": "sum-padding-1" + }, + "onednn-sum-padding-1-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-1-graph", + "explicit_group": "sum-padding-1" + }, + "onednn-sum-padding-2-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-2-eager", + "explicit_group": "sum-padding-2" + }, + "onednn-sum-padding-2-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-sum-padding-2-graph", + "explicit_group": "sum-padding-2" + }, + "onednn-graph-sdpa-plain-f16-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f16-eager", + "explicit_group": "graph-sdpa-plain-f16" + }, + "onednn-graph-sdpa-plain-f16-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f16-graph", + "explicit_group": "graph-sdpa-plain-f16" + }, + "onednn-graph-sdpa-plain-f32-eager": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f32-eager", + "explicit_group": "graph-sdpa-plain-f32" + }, + "onednn-graph-sdpa-plain-f32-graph": { + "type": "benchmark", + "description": "", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "onednn-graph-sdpa-plain-f32-graph", + "explicit_group": "graph-sdpa-plain-f32" + }, + "Foo Group": { + "type": "group", + "description": "This is a test benchmark for Foo Group.", + "notes": "This is a test note for Foo Group.\nLook, multiple lines!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "Bar Group": { + "type": "group", + "description": "This is a test benchmark for Bar Group.", + "notes": null, + "unstable": "This is an unstable note for Bar Group.", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": null, + "explicit_group": null + }, + "Memory Bandwidth 1": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 1.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 1", + "explicit_group": "" + }, + "Memory Bandwidth 2": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 2.", + "notes": null, + 
"unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 2", + "explicit_group": "" + }, + "Memory Bandwidth 3": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 3.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 3", + "explicit_group": "" + }, + "Memory Bandwidth 4": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 4.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 4", + "explicit_group": "" + }, + "Memory Bandwidth 5": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 5.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 5", + "explicit_group": "" + }, + "Memory Bandwidth 6": { + "type": "benchmark", + "description": "This is a test benchmark for Memory Bandwidth 6.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Memory Bandwidth 6", + "explicit_group": "" + }, + "Latency 1": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 1.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 1", + "explicit_group": "" + }, + "Latency 2": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 2.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 2", + "explicit_group": "" + }, + "Latency 3": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 3.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 3", + "explicit_group": "" + }, + "Latency 4": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 4.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 4", + "explicit_group": "" + }, + "Latency 5": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 5.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 5", + "explicit_group": "" + }, + "Latency 6": { + "type": "benchmark", + "description": "This is a test benchmark for Latency 6.", + "notes": "A Latency test note!", + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Latency 6", + "explicit_group": "" + }, + "Throughput 1": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 1.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 1", + "explicit_group": "" + }, + "Throughput 2": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 2.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 2", + "explicit_group": "" + }, + "Throughput 3": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 3.", + "notes": null, + "unstable": null, + "tags": 
[], + "range_min": null, + "range_max": null, + "display_name": "Throughput 3", + "explicit_group": "" + }, + "Throughput 4": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 4.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 4", + "explicit_group": "" + }, + "Throughput 5": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 5.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 5", + "explicit_group": "" + }, + "Throughput 6": { + "type": "benchmark", + "description": "This is a test benchmark for Throughput 6.", + "notes": null, + "unstable": null, + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Throughput 6", + "explicit_group": "" + }, + "FLOPS 1": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 1.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 1", + "explicit_group": "" + }, + "FLOPS 2": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 2.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 2", + "explicit_group": "" + }, + "FLOPS 3": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 3.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 3", + "explicit_group": "" + }, + "FLOPS 4": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 4.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 4", + "explicit_group": "" + }, + "FLOPS 5": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 5.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 5", + "explicit_group": "" + }, + "FLOPS 6": { + "type": "benchmark", + "description": "This is a test benchmark for FLOPS 6.", + "notes": null, + "unstable": "Unstable FLOPS test!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "FLOPS 6", + "explicit_group": "" + }, + "Cache Miss Rate 1": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 1.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 1", + "explicit_group": "" + }, + "Cache Miss Rate 2": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 2.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 2", + "explicit_group": "" + }, + "Cache Miss Rate 3": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 3.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 3", + "explicit_group": "" + }, + "Cache Miss Rate 4": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 4.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": 
[], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 4", + "explicit_group": "" + }, + "Cache Miss Rate 5": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 5.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 5", + "explicit_group": "" + }, + "Cache Miss Rate 6": { + "type": "benchmark", + "description": "This is a test benchmark for Cache Miss Rate 6.", + "notes": "Test Note", + "unstable": "And another note!", + "tags": [], + "range_min": null, + "range_max": null, + "display_name": "Cache Miss Rate 6", + "explicit_group": "" + } +}; -defaultCompareNames = []; +benchmarkTags = { + "SYCL": { + "name": "SYCL", + "description": "Benchmark uses SYCL runtime" + }, + "UR": { + "name": "UR", + "description": "Benchmark uses Unified Runtime API" + }, + "L0": { + "name": "L0", + "description": "Benchmark uses Level Zero API directly" + }, + "UMF": { + "name": "UMF", + "description": "Benchmark uses Unified Memory Framework directly" + }, + "micro": { + "name": "micro", + "description": "Microbenchmark focusing on a specific functionality" + }, + "application": { + "name": "application", + "description": "Real application-based performance test" + }, + "proxy": { + "name": "proxy", + "description": "Benchmark that simulates real application use-cases" + }, + "submit": { + "name": "submit", + "description": "Tests kernel submission performance" + }, + "math": { + "name": "math", + "description": "Tests math computation performance" + }, + "memory": { + "name": "memory", + "description": "Tests memory transfer or bandwidth performance" + }, + "allocation": { + "name": "allocation", + "description": "Tests memory allocation performance" + }, + "graph": { + "name": "graph", + "description": "Tests graph-based execution performance" + }, + "latency": { + "name": "latency", + "description": "Measures operation latency" + }, + "throughput": { + "name": "throughput", + "description": "Measures operation throughput" + }, + "inference": { + "name": "inference", + "description": "Tests ML/AI inference performance" + }, + "image": { + "name": "image", + "description": "Image processing benchmark" + }, + "simulation": { + "name": "simulation", + "description": "Physics or scientific simulation benchmark" + } +}; +defaultCompareNames = [ + "This PR" +]; diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index 95fc7a6b28736..b330936d6c4a2 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -17,11 +17,13 @@ from output_markdown import generate_markdown from output_html import generate_html from history import BenchmarkHistory -from utils.utils import prepare_workdir +from utils.utils import prepare_workdir, git_clone, run from utils.compute_runtime import * from utils.validate import Validate from utils.detect_versions import DetectVersion from presets import enabled_suites, presets +from utils.oneapi import get_oneapi +from datetime import datetime, timezone import argparse import re @@ -32,16 +34,60 @@ INTERNAL_WORKDIR_VERSION = "2.0" +def download_and_build_unitrace(workdir): + repo_dir = git_clone( + workdir, + "pti-gpu-repo", + "https://github.com/intel/pti-gpu.git", + "master", + ) + build_dir = os.path.join(workdir, "unitrace-build") + unitrace_src = os.path.join(repo_dir, "tools", "unitrace") + os.makedirs(build_dir, exist_ok=True) + + unitrace_exe = 
os.path.join(build_dir, "unitrace") + if not os.path.isfile(unitrace_exe): + run( + [ + "cmake", + f"-S {unitrace_src}", + f"-B {build_dir}", + "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_CXX_COMPILER=clang++", + "-DCMAKE_C_COMPILER=clang", + "-DBUILD_WITH_L0=1", + "-DBUILD_WITH_OPENCL=0", + "-DBUILD_WITH_ITT=1", + "-DBUILD_WITH_XPTI=1", + "-DBUILD_WITH_MPI=0", + ], + ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], + add_sycl=True, + ) + run( + ["cmake", "--build", build_dir, "-j"], + ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], + add_sycl=True, + ) + print("Unitrace built successfully.") + + def run_iterations( benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]], failures: dict[str, str], + unitrace_timestamp: str = None, ): for iter in range(iters): - print(f"running {benchmark.name()}, iteration {iter}... ", flush=True) - bench_results = benchmark.run(env_vars) + if unitrace_timestamp is not None: + print(f"running {benchmark.name()} with Unitrace", flush=True) + else: + print(f"running {benchmark.name()}, iteration {iter}... ", flush=True) + bench_results = benchmark.run( + env_vars, unitrace_timestamp=unitrace_timestamp + ) if bench_results is None: failures[benchmark.name()] = "benchmark produced no results!" break @@ -158,6 +204,14 @@ def collect_metadata(suites): def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) + if options.unitrace or options.unitrace_inclusive: + print("Downloading and building Unitrace...") + download_and_build_unitrace(options.workdir) + if options.results_directory_override == None: + options.unitrace_res_dir = os.path.join(directory, "results") + else: + options.unitrace_res_dir = options.results_directory_override + if options.build_compute_runtime: print(f"Setting up Compute Runtime {options.compute_runtime_tag}") cr = get_compute_runtime() @@ -213,6 +267,12 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): print(f"{type(s).__name__} setup complete.") benchmarks += suite_benchmarks + timestamp = ( + datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + if options.timestamp_override is None + else options.timestamp_override + ) + for benchmark in benchmarks: try: if options.verbose: @@ -234,19 +294,31 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): merged_env_vars = {**additional_env_vars} intermediate_results: dict[str, list[Result]] = {} processed: list[Result] = [] - for _ in range(options.iterations_stddev): + if not options.unitrace: + for _ in range(options.iterations_stddev): + run_iterations( + benchmark, + merged_env_vars, + options.iterations, + intermediate_results, + failures, + unitrace_timestamp=None, + ) + valid, processed = process_results( + intermediate_results, benchmark.stddev_threshold() + ) + if valid: + break + if options.unitrace_inclusive or options.unitrace: + # run the benchmark with unitrace run_iterations( benchmark, merged_env_vars, - options.iterations, + 1, intermediate_results, failures, + unitrace_timestamp=timestamp, ) - valid, processed = process_results( - intermediate_results, benchmark.stddev_threshold() - ) - if valid: - break results += processed except Exception as e: if options.exit_on_failure: @@ -309,7 +381,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # we calculate historical averages or get latest results for compare. 
# Otherwise we might be comparing the results to themselves.
     if not options.dry_run:
-        history.save(saved_name, results, save_name is not None)
+        history.save(saved_name, timestamp, results, save_name is not None)
 
         if saved_name not in compare_names:
             compare_names.append(saved_name)
 
@@ -500,6 +572,17 @@ def validate_and_parse_env_args(env_args):
         help="HIP device architecture",
         default=None,
     )
+    parser.add_argument(
+        "--unitrace",
+        action="store_true",
+        help="Unitrace tracing for a single iteration of each benchmark",
+    )
+
+    parser.add_argument(
+        "--unitrace-inclusive",
+        action="store_true",
+        help="Regular benchmark iterations plus one additional run with Unitrace tracing",
+    )
 
     # Options intended for CI:
     parser.add_argument(
@@ -589,6 +672,8 @@ def validate_and_parse_env_args(env_args):
     options.results_directory_override = args.results_dir
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
+    options.unitrace = args.unitrace
+    options.unitrace_inclusive = args.unitrace_inclusive
 
     if args.build_igc and args.compute_runtime is None:
         parser.error("--build-igc requires --compute-runtime to be set")
@@ -599,6 +684,10 @@
     if not os.path.isdir(args.output_dir):
         parser.error("Specified --output-dir is not a valid path")
     options.output_directory = os.path.abspath(args.output_dir)
+    if args.unitrace_inclusive and args.unitrace:
+        parser.error(
+            "--unitrace-inclusive and --unitrace are mutually exclusive; please specify only one of them"
+        )
 
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index a7b65e752d450..73170e527a6d2 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -41,6 +41,7 @@ class DetectVersionsOptions:
     # Max amount of api calls permitted on each run of the benchmark scripts
     max_api_calls = 4
 
+
 @dataclass
 class Options:
     workdir: str = None
@@ -70,6 +71,9 @@ class Options:
     current_run_name: str = "This PR"
     preset: str = "Full"
     build_jobs: int = multiprocessing.cpu_count()
+    unitrace: bool = False
+    unitrace_inclusive: bool = False
+    unitrace_res_dir: str = None
 
     # Options intended for CI:
     regression_threshold: float = 0.05

From d8f6b2bfb76fb14e41d6dc7b7e7f6430559465de Mon Sep 17 00:00:00 2001
From: "Mateusz P. Nowak"
Date: Mon, 7 Jul 2025 09:12:39 +0000
Subject: [PATCH 3/6] Add list of Unitrace exclusions

Signed-off-by: Mateusz P.
Nowak --- devops/scripts/benchmarks/benches/base.py | 116 +++----------- devops/scripts/benchmarks/benches/benchdnn.py | 9 +- .../benchmarks/benches/benchdnn_list.py | 7 + devops/scripts/benchmarks/benches/compute.py | 68 +++++++++ devops/scripts/benchmarks/benches/velocity.py | 4 +- devops/scripts/benchmarks/main.py | 24 +-- devops/scripts/benchmarks/options.py | 3 +- devops/scripts/benchmarks/utils/unitrace.py | 143 ++++++++++++++++++ devops/scripts/benchmarks/utils/utils.py | 4 +- 9 files changed, 262 insertions(+), 116 deletions(-) create mode 100644 devops/scripts/benchmarks/utils/unitrace.py diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py index 215dc57f67f28..80854d7e17248 100644 --- a/devops/scripts/benchmarks/benches/base.py +++ b/devops/scripts/benchmarks/benches/base.py @@ -12,7 +12,7 @@ from options import options from utils.utils import download, run from abc import ABC, abstractmethod -import glob +import utils.unitrace as unitrace benchmark_tags = [ BenchmarkTag("SYCL", "Benchmark uses SYCL runtime"), @@ -71,9 +71,7 @@ def teardown(self): pass @abstractmethod - def run( - self, env_vars, unitrace_timestamp: str = None - ) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: pass @staticmethod @@ -110,42 +108,27 @@ def run_bench( ld_libraries.extend(ld_library) if unitrace_timestamp is not None: - unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace") - if not os.path.exists(unitrace_bin): - raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}. ") - if not os.path.exists(options.unitrace_res_dir): - os.makedirs(options.unitrace_res_dir) - bench_dir = f"{options.unitrace_res_dir}/{self.name()}" - os.makedirs(bench_dir, exist_ok=True) - - unitrace_output = f"{bench_dir}/{self.name()}_{unitrace_timestamp}" - command = ( - [ - str(unitrace_bin), - "--call-logging", - "--host-timing", - "--chrome-sycl-logging", - "--chrome-call-logging", - "--chrome-kernel-logging", - "--output", - unitrace_output, - ] - + extra_unitrace_opt - + command + bench_dir, unitrace_output, command = unitrace.unitrace_prepare( + self.name(), unitrace_timestamp, command, extra_unitrace_opt ) - if options.verbose: - print(f"Unitrace cmd: {' '.join(command)}") - result = run( - command=command, - env_vars=env_vars, - add_sycl=add_sycl, - cwd=options.benchmark_cwd, - ld_library=ld_libraries, - ) + try: + result = run( + command=command, + env_vars=env_vars, + add_sycl=add_sycl, + cwd=options.benchmark_cwd, + ld_library=ld_libraries, + ) + except subprocess.CalledProcessError as e: + if unitrace_timestamp is not None: + unitrace.unitrace_cleanup(options.benchmark_cwd, unitrace_output) + raise if unitrace_timestamp is not None: - handle_unitrace_output(bench_dir, unitrace_output, unitrace_timestamp) + unitrace.handle_unitrace_output( + bench_dir, unitrace_output, unitrace_timestamp + ) if use_stdout: return result.stdout.decode() @@ -233,64 +216,3 @@ def setup(self): def additional_metadata(self) -> dict[str, BenchmarkMetadata]: return {} - - -def handle_unitrace_output(bench_dir, unitrace_output, timestamp): - FILECNT = 20 # Set your desired max file count - - # 1. 
Handle unitrace_output.{pid} logs: rename to unitrace_output (remove pid)
-    for f in os.listdir(bench_dir):
-        if f.startswith(os.path.basename(unitrace_output) + "."):
-            parts = f.rsplit(".", 1)
-            if (
-                len(parts) == 2
-                and parts[1].isdigit()
-                and os.path.isfile(os.path.join(bench_dir, f))
-            ):
-                src = os.path.join(bench_dir, f)
-                dst = os.path.join(bench_dir, os.path.basename(unitrace_output))
-                shutil.move(src, dst)
-                break
-
-    # 2. Handle {name}.{pid}.json files: move and rename to {self.name()}.{timestamp}.json
-    pid_json_files = []
-    for f in os.listdir(options.benchmark_cwd):
-        parts = f.split(".")
-        l = len(parts)
-        if len(parts) >= 3 and parts[l - 1] == "json" and parts[l - 2].isdigit():
-            pid_json_files.append(f)
-
-    if len(pid_json_files) == 1:
-        # Extract benchmark name from bench_dir path
-        bench_name = os.path.basename(bench_dir)
-        dst = f"{bench_dir}/{bench_name}_{timestamp}.json"
-        shutil.move(os.path.join(options.benchmark_cwd, pid_json_files[0]), dst)
-    elif len(pid_json_files) > 1:
-        print(
-            f"Warning: Found {len(pid_json_files)} files matching the pattern. Expected 1."
-        )
-
-    # Count files in the dir and remove oldest if more than FILECNT
-    def extract_timestamp_from_name(filename):
-        # Example: onednn-sum-padding-2-graph_20250701_114551
-        base = os.path.basename(filename)
-        parts = base.rsplit("_", 1)
-        if len(parts) == 2:
-            ts = parts[1]
-            # Remove extension if present (for .json files)
-            ts = ts.split(".", 1)[0]
-            return ts
-        return ""
-
-    files = glob.glob(f"{bench_dir}/*")
-    files_with_ts = []
-    for f in files:
-        ts = extract_timestamp_from_name(f)
-        files_with_ts.append((f, ts))
-    # Sort by timestamp string (lexicographically, which works for YYYYMMDD_HHMMSS)
-    files_with_ts.sort(key=lambda x: x[1])
-    sorted_files = [f for f, ts in files_with_ts if ts]
-
-    if len(sorted_files) > FILECNT:
-        for f in sorted_files[: len(sorted_files) - FILECNT]:
-            os.remove(f)
diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py
index 2fb0596fd3f04..0b43f15fee687 100644
--- a/devops/scripts/benchmarks/benches/benchdnn.py
+++ b/devops/scripts/benchmarks/benches/benchdnn.py
@@ -9,7 +9,7 @@
 from utils.utils import git_clone, run, create_build_path
 from utils.result import Result
 from utils.oneapi import get_oneapi
-from .benchdnn_list import get_bench_dnn_list
+from .benchdnn_list import get_bench_dnn_list, unitrace_exclusion_list
 
 
 class OneDnnBench(Suite):
@@ -144,6 +144,13 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
         env_vars = dict(env_vars) if env_vars else {}
         env_vars["ONEAPI_DEVICE_SELECTOR"] = "level_zero:*"
 
+        if self.name() in unitrace_exclusion_list:
+            if options.verbose:
+                print(
+                    f"[{self.name()}] Skipping Unitrace tracing: benchmark is on the exclusion list."
+ ) + unitrace_timestamp = None + output = self.run_bench( command, env_vars, diff --git a/devops/scripts/benchmarks/benches/benchdnn_list.py b/devops/scripts/benchmarks/benches/benchdnn_list.py index 542858b33a343..bd84800efc75e 100644 --- a/devops/scripts/benchmarks/benches/benchdnn_list.py +++ b/devops/scripts/benchmarks/benches/benchdnn_list.py @@ -12,6 +12,12 @@ # if rungraph is True, both direct and graph execution modes will be run for the benchmark # if False, only direct execution mode will be run +unitrace_exclusion_list = [ + "onednn-graph-sdpa-plain-f32-eager", + "onednn-graph-sdpa-plain-f32-graph", +] + + # the final choice of benchmarks to run, used in CI and other environments benches_final_set = [ [ @@ -62,6 +68,7 @@ "graph", "sdpa-plain-f16", "--reset --dt=f16 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json", + False, # Do not run SYCL graph for this benchmark ], [ "graph", diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 26931e553a2d7..14c3f93833a01 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -224,6 +224,71 @@ def parse_unit_type(compute_unit): class ComputeBenchmark(Benchmark): + + # list of benchmarks to exclude from unitrace due to SIGSEGV, SIGABRT or timeouts + unitrace_exclusion_list = [ + "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20", + "api_overhead_benchmark_l0 SubmitKernel in order not using events", + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20", + "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events", + "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20", + "api_overhead_benchmark_sycl SubmitKernel in order not using events", + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20", + "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events", + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20", + "api_overhead_benchmark_syclpreview SubmitKernel in order not using events", + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20", + "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events", + "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20", + "api_overhead_benchmark_ur SubmitKernel in order not using events", + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20", + "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events", + "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20", + "api_overhead_benchmark_ur SubmitKernel out of order not using events", + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20", + "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events", + "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:5", + "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:100", + "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:5", + "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:100", + 
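+        # NOTE: entries must match Benchmark.name() exactly; run() below
+        # checks membership with a plain string comparison.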
"graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 1", + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 1", + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 1", + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 1", + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 1", + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 0", + "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 1", + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Gromacs", + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Llama", + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Gromacs", + "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Llama", + "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:100", + "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:5", + "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:100", + "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:5", + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 0", + "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 1", + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 0", + "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 1", + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 0", + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 1", + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 0", + "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 1", + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:1 dstUSM:1", + "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:0 dstUSM:1", + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events", + "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events without copy offload", + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events", + "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events with barrier", + "memory_benchmark_sycl StreamMemory, placement Device, type Triad, size 10240", + "miscellaneous_benchmark_sycl VectorSum", + ] + def __init__(self, bench, name, test, runtime: RUNTIMES = None): super().__init__(bench.directory, bench) self.bench = bench @@ -291,6 +356,9 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: command += self.bin_args() env_vars.update(self.extra_env_vars()) + if self.name() in self.unitrace_exclusion_list: + unitrace_timestamp = None + result = self.run_bench( command, env_vars, 
unitrace_timestamp=unitrace_timestamp ) diff --git a/devops/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py index 971229a1f0e51..0082f6d0d10e3 100644 --- a/devops/scripts/benchmarks/benches/velocity.py +++ b/devops/scripts/benchmarks/benches/velocity.py @@ -288,7 +288,7 @@ class QuickSilver(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("QuickSilver", "qs", vb, "MMS/CTT") - def run(self, env_vars) -> list[Result]: + def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0 if ( "UR_L0_USE_IMMEDIATE_COMMANDLISTS" in env_vars @@ -296,7 +296,7 @@ def run(self, env_vars) -> list[Result]: ): return None - return super().run(env_vars) + return super().run(env_vars, unitrace_timestamp=unitrace_timestamp) def name(self): return "Velocity-Bench QuickSilver" diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index 659c922ecfcc3..e50932d0c3803 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -85,9 +85,7 @@ def run_iterations( print(f"running {benchmark.name()} with Unitrace", flush=True) else: print(f"running {benchmark.name()}, iteration {iter}... ", flush=True) - bench_results = benchmark.run( - env_vars, unitrace_timestamp=unitrace_timestamp - ) + bench_results = benchmark.run(env_vars, unitrace_timestamp=unitrace_timestamp) if bench_results is None: failures[benchmark.name()] = "benchmark produced no results!" break @@ -201,10 +199,10 @@ def collect_metadata(suites): return metadata -def main(directory, additional_env_vars, save_name, compare_names, filter): +def main(directory, additional_env_vars, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) - if options.unitrace or options.unitrace_inclusive: + if options.unitrace_only or options.unitrace_inclusive: print("Downloading and building Unitrace...") download_and_build_unitrace(options.workdir) if options.results_directory_override == None: @@ -294,7 +292,8 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): merged_env_vars = {**additional_env_vars} intermediate_results: dict[str, list[Result]] = {} processed: list[Result] = [] - if not options.unitrace: + # regular run of the benchmark + if not options.unitrace_only: for _ in range(options.iterations_stddev): run_iterations( benchmark, @@ -309,8 +308,9 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): ) if valid: break - if options.unitrace_inclusive or options.unitrace: - # run the benchmark with unitrace + # unitrace run of the benchmark + if options.unitrace_inclusive or options.unitrace_only: + # set the timestamp to enable unitrace run and save results with proper file names run_iterations( benchmark, merged_env_vars, @@ -375,13 +375,13 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): f"Markdown with benchmark results has been written to {md_path}/benchmark_results.md" ) - saved_name = save_name if save_name is not None else this_name + saved_name = options.save_name if options.save_name is not None else this_name # It's important we don't save the current results into history before # we calculate historical averages or get latest results for compare. # Otherwise we might be comparing the results to themselves. 
    # Otherwise we might be comparing the results to themselves.
if not options.dry_run: - history.save(saved_name, timestamp, results, save_name is not None) + history.save(saved_name, timestamp, results, options.save_name is not None) if saved_name not in compare_names: compare_names.append(saved_name) @@ -673,6 +673,7 @@ def validate_and_parse_env_args(env_args): options.ur = args.ur options.ur_adapter = args.adapter options.exit_on_failure = args.exit_on_failure + options.save_name = args.save options.compare = Compare(args.compare_type) options.compare_max = args.compare_max options.output_markdown = args.output_markdown @@ -688,7 +689,7 @@ def validate_and_parse_env_args(env_args): options.results_directory_override = args.results_dir options.build_jobs = args.build_jobs options.hip_arch = args.hip_arch - options.unitrace = args.unitrace + options.unitrace_only = args.unitrace options.unitrace_inclusive = args.unitrace_inclusive if args.build_igc and args.compute_runtime is None: @@ -744,7 +745,6 @@ def validate_and_parse_env_args(env_args): main( args.benchmark_directory, additional_env_vars, - args.save, args.compare, benchmark_filter, ) diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index dc2852fa63665..f334f36b08c4e 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -53,6 +53,7 @@ class Options: timeout: float = 600 iterations: int = 3 verbose: bool = False + save_name: str = None compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results output_markdown: MarkdownSize = MarkdownSize.SHORT @@ -69,7 +70,7 @@ class Options: current_run_name: str = "This PR" preset: str = "Full" build_jobs: int = multiprocessing.cpu_count() - unitrace: bool = False + unitrace_only: bool = False unitrace_inclusive: bool = False unitrace_res_dir: str = None diff --git a/devops/scripts/benchmarks/utils/unitrace.py b/devops/scripts/benchmarks/utils/unitrace.py new file mode 100644 index 0000000000000..87458c9e96967 --- /dev/null +++ b/devops/scripts/benchmarks/utils/unitrace.py @@ -0,0 +1,143 @@ +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import os +import shutil +import re + +from options import options +import stat as statmod + + +def extract_save_name_and_timestamp(dirname): + """ + Extracts (save_name, timestamp) from a directory name of the form {save_name}_{timestamp}, + where timestamp is always 15 characters: YYYYMMDD_HHMMSS. + save_name may contain underscores. + """ + m = re.match(r"(.+)_(\d{8}_\d{6})$", dirname) + if m: + return m.group(1), m.group(2) + return None, None + + +def prune_unitrace_dirs(base_dir, FILECNT=10): + """ + Keeps only FILECNT newest directories for each save_name group in base_dir. 
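+    Directory names are expected to look like {save_name}_{YYYYMMDD_HHMMSS}
+    (see unitrace_prepare in this module); non-matching names are never pruned.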
+ """ + dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + groups = {} + for d in dirs: + save_name, ts = extract_save_name_and_timestamp(d) + if save_name and ts: + groups.setdefault(save_name, []).append((d, ts)) + for save_name, dirlist in groups.items(): + # Sort by timestamp string (lexicographically, works for YYYYMMDD_HHMMSS) + dirlist.sort(key=lambda x: x[1]) + if len(dirlist) > FILECNT: + for d, ts in dirlist[: len(dirlist) - FILECNT]: + full_path = os.path.join(base_dir, d) + print(f"Removing old unitrace dir: {full_path}") + shutil.rmtree(full_path) + + +def unitrace_cleanup(bench_cwd, unitrace_output): + # Remove .pid files from the benchmark directory and .json files from cwd + unitrace_dir = os.path.dirname(unitrace_output) + unitrace_base = os.path.basename(unitrace_output) + print(f"Cleanup unitrace output {unitrace_base} from {unitrace_dir}") + for f in os.listdir(unitrace_dir): + if f.startswith(unitrace_base + "."): + os.remove(os.path.join(unitrace_dir, f)) + print(f"Cleanup: Removed {f} from {unitrace_dir}") + if os.path.exists(bench_cwd): + for f in os.listdir(bench_cwd): + if f.endswith(".json"): + os.remove(os.path.join(bench_cwd, f)) + print(f"Cleanup: Removed {f} from {bench_cwd}") + + +def unitrace_prepare(name, unitrace_timestamp, command, extra_unitrace_opt=[]): + unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace") + if not os.path.exists(unitrace_bin): + raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}. ") + os.makedirs(options.unitrace_res_dir, exist_ok=True) + if not options.save_name: + raise ValueError( + "Unitrace requires a save name to be specified via --save option." + ) + bench_dir = f"{options.unitrace_res_dir}/{options.save_name}_{unitrace_timestamp}" + os.makedirs(bench_dir, exist_ok=True) + + unitrace_output = f"{bench_dir}/{name}_{unitrace_timestamp}" + unitrace_command = ( + [ + str(unitrace_bin), + "--call-logging", + "--host-timing", + "--device-timing", + "--chrome-sycl-logging", + "--chrome-call-logging", + "--chrome-kernel-logging", + "--output", + unitrace_output, + ] + + extra_unitrace_opt + + command + ) + if options.verbose: + print(f"Unitrace cmd: {' '.join(unitrace_command)}") + + return bench_dir, unitrace_output, unitrace_command + + +def handle_unitrace_output(bench_dir, unitrace_output, timestamp): + # Handle unitrace_output.{pid} logs: rename to unitrace_output (remove pid) + for f in os.listdir(bench_dir): + if f.startswith(os.path.basename(unitrace_output) + "."): + parts = f.rsplit(".", 1) + if ( + len(parts) == 2 + and parts[1].isdigit() + and os.path.isfile(os.path.join(bench_dir, f)) + ): + src = os.path.join(bench_dir, f) + dst = os.path.join(bench_dir, os.path.basename(unitrace_output)) + shutil.move(src, dst) + if options.verbose: + print(f"Renamed {src} to {dst}") + break + + # Handle {name}.{pid}.json files in cwd: move and rename to {self.name()}_{timestamp}.json + pid_json_files = [] + for f in os.listdir(options.benchmark_cwd): + parts = f.split(".") + l = len(parts) + if len(parts) >= 3 and parts[l - 1] == "json" and parts[l - 2].isdigit(): + pid_json_files.append(f) + + if len(pid_json_files) == 1: + dst = f"{unitrace_output}.json" + else: + print( + f"Warning: Found {len(pid_json_files)} files matching the pattern. Expected 1." 
+
+
+def handle_unitrace_output(bench_dir, unitrace_output, timestamp):
+    # Handle unitrace_output.{pid} logs: rename to unitrace_output (drop the pid)
+    for f in os.listdir(bench_dir):
+        if f.startswith(os.path.basename(unitrace_output) + "."):
+            parts = f.rsplit(".", 1)
+            if (
+                len(parts) == 2
+                and parts[1].isdigit()
+                and os.path.isfile(os.path.join(bench_dir, f))
+            ):
+                src = os.path.join(bench_dir, f)
+                dst = os.path.join(bench_dir, os.path.basename(unitrace_output))
+                shutil.move(src, dst)
+                if options.verbose:
+                    print(f"Renamed {src} to {dst}")
+                break
+
+    # Handle {name}.{pid}.json files in cwd: move and rename to {name}_{timestamp}.json
+    pid_json_files = []
+    for f in os.listdir(options.benchmark_cwd):
+        parts = f.split(".")
+        if len(parts) >= 3 and parts[-1] == "json" and parts[-2].isdigit():
+            pid_json_files.append(f)
+
+    if not pid_json_files:
+        # Nothing to move; unitrace produced no Chrome trace in the cwd
+        print("Warning: Found no files matching the pattern. Expected 1.")
+        return
+
+    if len(pid_json_files) > 1:
+        print(
+            f"Warning: Found {len(pid_json_files)} files matching the pattern. Expected 1."
+        )
+        # Keep only the newest file (by modification time) and remove the rest,
+        # so the move below picks up the trace from this benchmark run
+        newest_file = max(
+            pid_json_files,
+            key=lambda f: os.path.getmtime(os.path.join(options.benchmark_cwd, f)),
+        )
+        for f in pid_json_files:
+            if f != newest_file:
+                os.remove(os.path.join(options.benchmark_cwd, f))
+                if options.verbose:
+                    print(f"Removed extra file {f}")
+        pid_json_files = [newest_file]
+
+    dst = f"{unitrace_output}.json"
+    shutil.move(os.path.join(options.benchmark_cwd, pid_json_files[0]), dst)
+    if options.verbose:
+        print(f"Moved {pid_json_files[0]} to {dst}")
+
+    # Prune old unitrace directories
+    prune_unitrace_dirs(options.unitrace_res_dir, FILECNT=5)
diff --git a/devops/scripts/benchmarks/utils/utils.py b/devops/scripts/benchmarks/utils/utils.py
index 3780494e00584..ef2a1222a7183 100644
--- a/devops/scripts/benchmarks/utils/utils.py
+++ b/devops/scripts/benchmarks/utils/utils.py
@@ -56,9 +56,7 @@ def run(
     command_str = " ".join(command)
     env_str = " ".join(f"{key}={value}" for key, value in env_vars.items())
     full_command_str = f"{env_str} {command_str}".strip()
-    print(
-        f"Running: {full_command_str}\nLD_LIBRARY_PATH: {env.get('LD_LIBRARY_PATH', '')}"
-    )
+    print(f"Running: {full_command_str}")
 
     result = subprocess.run(
         command,

From be36447deffabf0773d88eb0db07230238df1ea7 Mon Sep 17 00:00:00 2001
From: "Mateusz P. Nowak"
Date: Mon, 7 Jul 2025 09:55:10 +0000
Subject: [PATCH 4/6] Final fixes

Signed-off-by: Mateusz P. Nowak
---
 devops/scripts/benchmarks/benches/benchdnn.py |    2 -
 .../benchmarks/benches/benchdnn_list.py       |   11 +-
 devops/scripts/benchmarks/history.py          |   13 +-
 devops/scripts/benchmarks/html/data.js        | 5107 +----------------
 devops/scripts/benchmarks/main.py             |   41 +-
 devops/scripts/benchmarks/utils/unitrace.py   |   41 +-
 6 files changed, 62 insertions(+), 5153 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py
index 0e251de96f393..76def7fde77ec 100644
--- a/devops/scripts/benchmarks/benches/benchdnn.py
+++ b/devops/scripts/benchmarks/benches/benchdnn.py
@@ -75,8 +75,6 @@ def setup(self):
             "-DCMAKE_CXX_COMPILER=clang++",
             "-DCMAKE_C_COMPILER=clang",
             "-DCMAKE_BUILD_TYPE=Release",
-            "-DCMAKE_CXX_COMPILER=clang++",
-            "-DCMAKE_C_COMPILER=clang",
             "-DDNNL_BUILD_TESTS=ON",
             "-DDNNL_BUILD_EXAMPLES=OFF",
             "-DDNNL_CPU_RUNTIME=NONE",  # Disable SYCL CPU support
diff --git a/devops/scripts/benchmarks/benches/benchdnn_list.py b/devops/scripts/benchmarks/benches/benchdnn_list.py
index bd84800efc75e..cc5929a5f07da 100644
--- a/devops/scripts/benchmarks/benches/benchdnn_list.py
+++ b/devops/scripts/benchmarks/benches/benchdnn_list.py
@@ -3,6 +3,11 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+unitrace_exclusion_list = [
+    "onednn-graph-sdpa-plain-f32-eager",
+    "onednn-graph-sdpa-plain-f32-graph",
+]
+
 # entry format:
 # [bench_driver, bench_name, bench_args, rungraph]
 # bench_driver is the name of the benchdnn driver, e.g. "sum", "graph", etc.
@@ -12,12 +17,6 @@ # if rungraph is True, both direct and graph execution modes will be run for the benchmark # if False, only direct execution mode will be run -unitrace_exclusion_list = [ - "onednn-graph-sdpa-plain-f32-eager", - "onednn-graph-sdpa-plain-f32-graph", -] - - # the final choice of benchmarks to run, used in CI and other environments benches_final_set = [ [ diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py index f91ab1951b270..1a2b40b26c91e 100644 --- a/devops/scripts/benchmarks/history.py +++ b/devops/scripts/benchmarks/history.py @@ -160,12 +160,13 @@ def save(self, save_name, timestamp, results: list[Result], to_file=True): results_dir = Path(os.path.join(self.dir, "results")) os.makedirs(results_dir, exist_ok=True) - # # Use formatted timestamp for the filename - # timestamp = ( - # datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") - # if options.timestamp_override is None - # else options.timestamp_override - # ) + # Use formatted timestamp for the filename + if timestamp is None: + timestamp = ( + datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + if options.timestamp_override is None + else options.timestamp_override + ) file_path = Path(os.path.join(results_dir, f"{save_name}_{timestamp}.json")) with file_path.open("w") as file: json.dump(serialized, file, indent=4) diff --git a/devops/scripts/benchmarks/html/data.js b/devops/scripts/benchmarks/html/data.js index f4ca859a904fd..eaa5dfdf8b375 100644 --- a/devops/scripts/benchmarks/html/data.js +++ b/devops/scripts/benchmarks/html/data.js @@ -1,5101 +1,10 @@ -benchmarkRuns = [ - { - "results": [ - { - "label": "onednn-sum-f16-1-eager", - "value": 0.00944, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--sdt=f16:f16:f16", - "--stag=abx:abx:abx", - "--scales=1.25:3:0.5", - "16x2x6x4x3" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16 --scales=1.25:3:0.5 16x2x6x4x3,0,2.52173,0.00944,0,0.0128609,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00944 avg(ms):0.0128609\ntotal: 0.27s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.0, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-f16-1-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-f16-2-eager", - "value": 0.60928, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--reset", - "--ddt=f16", - "--sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16", - "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", - "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", - "--scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2", - "16x32x48x5" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: 
perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,48.9631,0.06448,0,0.0676806,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,30.5063,0.05808,0,0.0612839,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.4368,0.05888,0,0.0620269,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.4478,0.06368,0,0.0671496,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,46.657,0.05984,0,0.0630586,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,45.9631,0.06448,0,0.0679256,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,41.9988,0.06208,0,0.0652478,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,47.5825,0.05808,0,0.061508,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,51.822,0.06288,0,0.0659863,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=f16:f16:f16:f16:f16:f16:f16:f16:f16:f16 --ddt=f16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=1.25:3:0.5:2:0.5:2:0.5:2:0.5:2 16x32x48x5,0,50.4551,0.0568,0,0.0609149,0\ntests:10 passed:10 skipped:0 mistrusted:0 
unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.60928 avg(ms):0.642782\ntotal: 2.47s; create_pd: 0.01s (0%); create_prim: 0.45s (18%); fill: 0.08s (3%); execute: 0.02s (1%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.0021996363335788104, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-f16-2-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-f32-1-eager", - "value": 0.0088, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--sdt=bf16:bf16:bf16", - "--stag=abx:abx:abx", - "--scales=0.5:2:0.5", - "16x2x6x4x3" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16 --scales=0.5:2:0.5 16x2x6x4x3,0,2.42236,0.0088,0,0.0129955,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.0088 avg(ms):0.0129955\ntotal: 0.28s; create_pd: 0.00s (0%); create_prim: 0.00s (1%); fill: 0.01s (3%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.00017486502731471965, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-f32-1-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-f32-2-eager", - "value": 0.6441600000000001, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--reset", - "--inplace=true,false", - "--ddt=bf16", - "--sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16", - "--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b", - "--dtag=abx,aBx16b,ABx16a16b,ABcd16b16a,BAcd16a16b,BAcd16b16a,aBCd16b16c,aBCd16c16b,aCBd16b16c,aCBd16c16b", - "--scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15", - "16x32x48x5" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,47.5034,0.0648,0,0.0682694,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=abx --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,0.0119629,0.06512,0,0.068156,0\n2:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 
16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBx16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,29.7659,0.0568,0,0.060283,0\n4:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABx16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.333,0.0568,0,0.0603173,0\n6:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=ABcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.2969,0.05808,0,0.0609991,0\n8:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b 
--scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16a16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.946,0.05248,0,0.055674,0\n10:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=BAcd16b16a --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,44.5508,0.05904,0,0.0620182,0\n12:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,41.147,0.06112,0,0.0643661,0\n14:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 
--stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aBCd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,46.8096,0.05728,0,0.0602824,0\n16:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16b16c --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,50.3113,0.05664,0,0.0600053,0\n18:SKIPPED (Invalid case) (0 ms) __REPRO: --mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5\nperf,gpu,,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 --inplace=true 16x32x48x5,0,0,0,0,0,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16:bf16 --ddt=bf16 --stag=abx:aBx16b:ABx16a16b:ABcd16b16a:BAcd16a16b:BAcd16b16a:aBCd16b16c:aBCd16c16b:aCBd16b16c:aCBd16c16b --dtag=aCBd16c16b --scales=0.25:0.15:0.25:0.25:0.25:0.25:0.15:0.25:0.25:0.15 16x32x48x5,0,49.8357,0.056,0,0.0593853,0\ntests:20 passed:11 skipped:9 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.64416 avg(ms):0.679756\ntotal: 2.68s; create_pd: 0.01s (0%); create_prim: 0.44s (16%); fill: 0.08s (3%); execute: 0.02s (1%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.004735567547823622, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-f32-2-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-padding-1-eager", - "value": 0.3904, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--ddt=f32", - "--sdt=f32:f32", - "--stag=aBcd16b", - "--dtag=aBcd16b", - "1x8x64x64", - "1x8x640x1024", - "1x24x640x1024" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b 
--dtag=aBcd16b --scales=1 1x8x64x64,0,1.58545,0.00192,0,0.00269551,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.890869,0.08528,0,0.0932233,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.29517,0.3032,0,0.32437,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.3904 avg(ms):0.420289\ntotal: 0.91s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.15s (16%); execute: 0.02s (2%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.001602664448140469, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-padding-1-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-padding-1-graph", - "value": 0.39216, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=graph", - "--ddt=f32", - "--sdt=f32:f32", - "--stag=aBcd16b", - "--dtag=aBcd16b", - "1x8x64x64", - "1x8x640x1024", - "1x24x640x1024" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x64x64,0,1.43994,0.00192,0,0.00268973,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x8x640x1024,0,0.874268,0.08656,0,0.094599,0\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --stag=aBcd16b:aBcd16b --dtag=aBcd16b --scales=1 1x24x640x1024,0,1.27124,0.30368,0,0.325998,0\ntests:3 passed:3 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.39216 avg(ms):0.423287\ntotal: 0.89s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.13s (15%); execute: 0.02s (2%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.0009097985124923661, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-padding-1-graph", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-padding-2-eager", - "value": 0.00336, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--sdt=bf16:bf16", - "--ddt=bf16", - "--stag=AB48a16b:AB48a16b", - "--dtag=AB48a16b", - "512x1024" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.21216,0.00336,0,0.00399908,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00336 avg(ms):0.00399908\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (6%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - 
"stddev": 8.262364471909155e-05, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-padding-2-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-sum-padding-2-graph", - "value": 0.00352, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--sum", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=graph", - "--sdt=bf16:bf16", - "--ddt=bf16", - "--stag=AB48a16b:AB48a16b", - "--dtag=AB48a16b", - "512x1024" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%impl%,%name%,%prb%,%Gops%,%+ctime%,%-time%,%-Gflops%,%0time%,%0Gflops%\nperf,gpu,multi_po_reorder_sum,,--mode=P --max-ms-per-prb=100 --sum --engine=gpu --execution-mode=graph --sdt=bf16:bf16 --ddt=bf16 --stag=AB48a16b:AB48a16b --dtag=AB48a16b --scales=1 512x1024,0,1.24072,0.00352,0,0.00398547,0\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.00352 avg(ms):0.00398547\ntotal: 0.33s; create_pd: 0.00s (0%); create_prim: 0.00s (0%); fill: 0.02s (7%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.0, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-sum-padding-2-graph", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-graph-sdpa-plain-f16-eager", - "value": 0.33968, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--graph", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--reset", - "--dt=f16", - "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f16 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.33968,0.342391\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.33968 avg(ms):0.342391\ntotal: 0.54s; create_pd: 0.00s (0%); create_prim: 0.07s (13%); fill: 0.00s (0%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.00855442631178792, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-graph-sdpa-plain-f16-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-graph-sdpa-plain-f32-eager", - "value": 0.38512, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--graph", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=direct", - "--reset", - "--dt=f32", - "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.38512,0.388208\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.38512 avg(ms):0.388208\ntotal: 0.60s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.0066990148529466635, - "git_url": 
"https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-graph-sdpa-plain-f32-eager", - "lower_is_better": true, - "suite": "BenchDNN" - }, - { - "label": "onednn-graph-sdpa-plain-f32-graph", - "value": 0.37952, - "command": [ - "/home/mateuszpn/workdir/onednn-build/tests/benchdnn/benchdnn", - "--graph", - "--mode=P", - "--engine=gpu", - "--max-ms-per-prb=100", - "--execution-mode=graph", - "--reset", - "--dt=f32", - "--case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json" - ], - "env": { - "ONEAPI_DEVICE_SELECTOR": "level_zero:*" - }, - "stdout": "Output template: perf,%engine%,%prb%,%-time%,%0time%\nperf,gpu,--mode=P --max-ms-per-prb=100 --graph --engine=gpu --execution-mode=graph --dt=f32 --case=complex_fusion/mha/sdpa-plain-implicit-causal-mask-fp32-bs1.json,0.37952,0.382662\ntests:1 passed:1 skipped:0 mistrusted:0 unimplemented:0 invalid_arguments:0 failed:0 listed:0\ntotal perf: min(ms):0.37952 avg(ms):0.382662\ntotal: 0.58s; create_pd: 0.00s (0%); create_prim: 0.07s (11%); fill: 0.00s (0%); execute: 0.00s (0%);\n", - "passed": true, - "unit": "ms", - "stddev": 0.011297102873450952, - "git_url": "https://github.com/uxlfoundation/oneDNN.git", - "git_hash": "v3.8", - "name": "onednn-graph-sdpa-plain-f32-graph", - "lower_is_better": true, - "suite": "BenchDNN" - } - ], - "name": "This PR", - "hostname": "gkdse-pre-dnp-02", - "git_hash": "1eb1026ad0ef", - "github_repo": "mateuszpn/llvm", - "date": "2025-06-27T09:56:15.698275+00:00", - "compute_runtime": "unknown" - } -]; +// This file serves as a placeholder for loading data locally: If +// `remoteDataUrl` (etc.) is not defined in config.js, the dashboard will +// attempt to load data from variables defined here instead. +// +// These variables are empty by default, and are populated by main.py if +// `--output-html local` is specified. 
-benchmarkMetadata = { - "SubmitKernel": { - "type": "group", - "description": "Measures CPU time overhead of submitting kernels through different APIs.", - "notes": "Each layer builds on top of the previous layer, adding functionality and overhead.\nThe first layer is the Level Zero API, the second is the Unified Runtime API, and the third is the SYCL API.\nThe UR v2 adapter noticeably reduces UR layer overhead, also improving SYCL performance.\nWork is ongoing to reduce the overhead of the SYCL API\n", - "unstable": null, - "tags": [ - "submit", - "micro", - "SYCL", - "UR", - "L0" - ], - "range_min": 0.0, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "SinKernelGraph": { - "type": "group", - "description": null, - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "submit", - "memory", - "proxy", - "SYCL", - "UR", - "L0", - "graph" - ], - "range_min": null, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "SubmitGraph": { - "type": "group", - "description": null, - "notes": null, - "unstable": null, - "tags": [ - "submit", - "micro", - "SYCL", - "UR", - "L0", - "graph" - ], - "range_min": null, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "FinalizeGraph": { - "type": "group", - "description": null, - "notes": null, - "unstable": null, - "tags": [ - "finalize", - "micro", - "SYCL", - "graph" - ], - "range_min": null, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10", - "explicit_group": "SubmitKernel out of order" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10", - "explicit_group": "SubmitKernel out of order using events" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order using events, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion using events" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion using events, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through SYCL Preview API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10", - "explicit_group": "SubmitKernel in order" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10", - "explicit_group": "SubmitKernel in order using events" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order using events, CPU count" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "SYCLPREVIEW SubmitKernel in order using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order using events KernelExecTime=20" - }, - "api_overhead_benchmark_syclpreview SubmitKernel in order KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through SYCL Preview API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. Each kernel executes for approximately 20 microseconds.",
  [remainder of the auto-generated benchmark-metadata hunk in devops/scripts/benchmarks/html/data.js elided: every entry carries the same fields ("type": "benchmark", "description", "notes", "unstable", "tags", "range_min", "range_max", "display_name", "explicit_group"); the entries enumerate api_overhead_benchmark_{syclpreview,sycl,l0} SubmitKernel in each combination of in-order/out-of-order, using/not using events, with/without measured completion, KernelExecTime=1 or 20, and an optional "CPU count" counter; graph_api_benchmark_{syclpreview,sycl} SinKernelGraph (graphs 0/1, numKernels 5/100, flagged unstable for mixing eager and graph execution) and SubmitGraph (numKernels 4/10/32, ioq 0/1, measureCompletion 0/1); and ulls_benchmark_{syclpreview,sycl} EmptyKernel (wgc 1000, wgs 256) and KernelSwitch (count 8, kernelTime 200)]
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "L0", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "L0 SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "L0", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion using events" - }, - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "L0", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "L0 SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion using events, CPU count" - }, - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "L0", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" - }, - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Level Zero API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "L0", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "L0 SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" - }, - "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:5": { - "type": "benchmark", - "description": "Measures L0 performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "L0", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SinKernelGraph, graphs 0, numKernels 5", - "explicit_group": "SinKernelGraph, numKernels: 5" - }, - "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:100": { - "type": "benchmark", - "description": "Measures L0 performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "L0", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SinKernelGraph, graphs 0, numKernels 100", - "explicit_group": "SinKernelGraph, numKernels: 100" - }, - "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:5": { - "type": "benchmark", - "description": "Measures L0 performance when executing 5 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "L0", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SinKernelGraph, graphs 1, numKernels 5", - "explicit_group": "SinKernelGraph, numKernels: 5" - }, - "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:100": { - "type": "benchmark", - "description": "Measures L0 performance when executing 100 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "L0", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SinKernelGraph, graphs 1, numKernels 100", - "explicit_group": "SinKernelGraph, numKernels: 100" - }, - "ulls_benchmark_l0 EmptyKernel wgc:1000, wgs:256": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "L0", - "micro", - "latency", - "submit" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 EmptyKernel, wgc 1000, wgs 256", - "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" - }, - "ulls_benchmark_l0 KernelSwitch count 8 kernelTime 200": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "L0", - "micro", - "latency", - "submit" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 KernelSwitch, count 8, kernelTime 200", - "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 32 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures L0 performance when executing 32 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures L0 performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "L0", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "L0 SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "api_overhead_benchmark_ur SubmitKernel out of order not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order, NumKernels 10", - "explicit_group": "SubmitKernel out of order" - }, - "api_overhead_benchmark_ur SubmitKernel out of order not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order using events, NumKernels 10", - "explicit_group": "SubmitKernel out of order using events" - }, - "api_overhead_benchmark_ur SubmitKernel out of order CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order using events, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order using events KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel out of order KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order using events KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion using events" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion using events, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting out-of-order kernels through Unified Runtime API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel out of order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel out of order with completion using events KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order, NumKernels 10", - "explicit_group": "SubmitKernel in order" - }, - "api_overhead_benchmark_ur SubmitKernel in order not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order using events, NumKernels 10", - "explicit_group": "SubmitKernel in order using events" - }, - "api_overhead_benchmark_ur SubmitKernel in order CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order using events, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order using events KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel in order KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, excluding kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order using events KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion KernelExecTime=20, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion using events" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. 
Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 1 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion using events, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion using events, CPU count" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10", - "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20" - }, - "api_overhead_benchmark_ur SubmitKernel in order with measure completion KernelExecTime=20 CPU count": { - "type": "benchmark", - "description": "Measures CPU time overhead of submitting in-order kernels through Unified Runtime API, including kernel completion time. Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.Each kernel executes for approximately 20 micro seconds.", - "notes": null, - "unstable": null, - "tags": [ - "submit", - "latency", - "UR", - "micro" - ], - "range_min": 0.0, - "range_max": null, - "display_name": "UR SubmitKernel in order with measure completion using events KernelExecTime=20, NumKernels 10, CPU count", - "explicit_group": "SubmitKernel in order with completion using events KernelExecTime=20, CPU count" - }, - "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:5": { - "type": "benchmark", - "description": "Measures UR performance when executing 5 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "UR", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SinKernelGraph, graphs 0, numKernels 5", - "explicit_group": "SinKernelGraph, numKernels: 5" - }, - "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:100": { - "type": "benchmark", - "description": "Measures UR performance when executing 100 sin kernels without graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "UR", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SinKernelGraph, graphs 0, numKernels 100", - "explicit_group": "SinKernelGraph, numKernels: 100" - }, - "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:5": { - "type": "benchmark", - "description": "Measures UR performance when executing 5 sin kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "UR", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SinKernelGraph, graphs 1, numKernels 5", - "explicit_group": "SinKernelGraph, numKernels: 5" - }, - "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:100": { - "type": "benchmark", - "description": "Measures UR performance when executing 100 sin kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": "This benchmark combines both eager and graph execution, and may not be representative of real use cases.", - "tags": [ - "graph", - "UR", - "proxy", - "submit", - "memory", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SinKernelGraph, graphs 1, numKernels 100", - "explicit_group": "SinKernelGraph, numKernels: 100" - }, - "ulls_benchmark_ur EmptyKernel wgc:1000, wgs:256": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "submit" - ], - "range_min": null, - "range_max": null, - "display_name": "UR EmptyKernel, wgc 1000, wgs 256", - "explicit_group": "EmptyKernel, wgc: 1000, wgs: 256" - }, - "ulls_benchmark_ur KernelSwitch count 8 kernelTime 200": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "submit" - ], - "range_min": null, - "range_max": null, - "display_name": "UR KernelSwitch, count 8, kernelTime 200", - "explicit_group": "KernelSwitch, count: 8, kernelTime: 200" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 4, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 10, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 32, ioq 0, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 4 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 4, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 4" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 10 trivial kernels using graphs. 
Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 10 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 10, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 10" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 0": { - "type": "benchmark", - "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 0", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 1": { - "type": "benchmark", - "description": "Measures UR performance when executing 32 trivial kernels using graphs. Tests overhead and benefits of graph-based execution.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "UR", - "micro", - "submit", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "UR SubmitGraph, numKernels 32, ioq 1, measureCompletion 1", - "explicit_group": "SubmitGraph, numKernels: 32" - }, - "memory_benchmark_sycl QueueInOrderMemcpy from Device to Device, size 1024": { - "type": "benchmark", - "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Device to Device with 1024 bytes, executed 100 times per iteration.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL QueueInOrderMemcpy from Device to Device, size 1024", - "explicit_group": "" - }, - "memory_benchmark_sycl QueueInOrderMemcpy from Host to Device, size 1024": { - "type": "benchmark", - "description": "Measures SYCL in-order queue memory copy performance for copy and command submission from Host to Device with 1024 bytes, executed 100 times per iteration.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL QueueInOrderMemcpy from Host to Device, size 1024", - "explicit_group": "" - }, - "memory_benchmark_sycl QueueMemcpy from Device to Device, size 1024": { - "type": "benchmark", - "description": "Measures general SYCL queue memory copy performance from Device to Device with 1024 bytes per operation.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL QueueMemcpy from Device to Device, size 1024", - "explicit_group": "" - }, - "memory_benchmark_sycl StreamMemory, placement Device, type Triad, size 10240": { - "type": 
"benchmark", - "description": "Measures Device memory bandwidth using Triad pattern with 10240 bytes. Higher values (GB/s) indicate better performance.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "throughput", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL StreamMemory, placement Device, type Triad, size 10240", - "explicit_group": "" - }, - "api_overhead_benchmark_sycl ExecImmediateCopyQueue out of order from Device to Device, size 1024": { - "type": "benchmark", - "description": "Measures SYCL out-of-order queue overhead for copy-only from Device to Device memory with 1024 bytes. Tests immediate execution overheads.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL ExecImmediateCopyQueue out of order from Device to Device, size 1024", - "explicit_group": "" - }, - "api_overhead_benchmark_sycl ExecImmediateCopyQueue in order from Device to Host, size 1024": { - "type": "benchmark", - "description": "Measures SYCL in-order queue overhead for copy-only from Device to Host memory with 1024 bytes. Tests immediate execution overheads.", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "submit", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL ExecImmediateCopyQueue in order from Device to Host, size 1024", - "explicit_group": "" - }, - "miscellaneous_benchmark_sycl VectorSum": { - "type": "benchmark", - "description": "Measures performance of vector addition across 3D grid (512x256x256 elements) using SYCL.", - "notes": null, - "unstable": null, - "tags": [ - "math", - "throughput", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL VectorSum", - "explicit_group": "" - }, - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Gromacs": { - "type": "benchmark", - "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing the same modifiable graph repeatedly over multiple iterations.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "SYCL", - "micro", - "finalize", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Gromacs", - "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" - }, - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Gromacs": { - "type": "benchmark", - "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Gromacs. It measures finalizing a unique modifiable graph per iteration.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "SYCL", - "micro", - "finalize", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Gromacs", - "explicit_group": "FinalizeGraph, GraphStructure: Gromacs" - }, - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Llama": { - "type": "benchmark", - "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. 
It measures finalizing the same modifiable graph repeatedly over multiple iterations.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "SYCL", - "micro", - "finalize", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 0, graphStructure Llama", - "explicit_group": "FinalizeGraph, GraphStructure: Llama" - }, - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Llama": { - "type": "benchmark", - "description": "Measures the time taken to finalize a SYCL graph, using a graph structure based on the usage of graphs in Llama. It measures finalizing a unique modifiable graph per iteration.", - "notes": null, - "unstable": null, - "tags": [ - "graph", - "SYCL", - "micro", - "finalize", - "latency" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL FinalizeGraph, rebuildGraphEveryIter 1, graphStructure Llama", - "explicit_group": "FinalizeGraph, GraphStructure: Llama" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:1 dstUSM:1": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 1 threads each performing 400 operations on 102400 bytes from device to device memory with events with driver copy offload without barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 1, dstUSM 1", - "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:0 dstUSM:1": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 1 threads each performing 400 operations on 102400 bytes from host to device memory with events with driver copy offload without barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 400, numThreads 1, allocSize 102400, srcUSM 0, dstUSM 1", - "explicit_group": "MemcpyExecute, opsPerThread: 400, numThreads: 1, allocSize: 102400" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events with driver copy offload without barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events without copy offload": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 100 operations on 102400 bytes from device to device memory without events without driver copy offload without barrier. 
", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 100, numThreads 4, allocSize 102400, srcUSM 1, dstUSM 1, without events without copy offload", - "explicit_group": "MemcpyExecute, opsPerThread: 100, numThreads: 4, allocSize: 102400" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload without barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" - }, - "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events with barrier": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from host to device memory without events with driver copy offload with barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "UR", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 0, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" - }, - "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:256 measureMode:Both": { - "type": "benchmark", - "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory and free'ing it immediately. Both memory allocation and memory free are timed. ", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 256, measureMode Both", - "explicit_group": "UsmMemoryAllocation" - }, - "api_overhead_benchmark_ur UsmMemoryAllocation usmMemoryPlacement:Device size:262144 measureMode:Both": { - "type": "benchmark", - "description": "Measures memory allocation overhead by allocating 262144 bytes of usm Device memory and free'ing it immediately. Both memory allocation and memory free are timed. ", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "UR UsmMemoryAllocation, usmMemoryPlacement Device, size 262144, measureMode Both", - "explicit_group": "UsmMemoryAllocation" - }, - "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:256 measureMode:Both": { - "type": "benchmark", - "description": "Measures memory allocation overhead by allocating 256 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. 
", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 256, measureMode Both", - "explicit_group": "UsmBatchMemoryAllocation" - }, - "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:16384 measureMode:Both": { - "type": "benchmark", - "description": "Measures memory allocation overhead by allocating 16384 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. ", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 16384, measureMode Both", - "explicit_group": "UsmBatchMemoryAllocation" - }, - "api_overhead_benchmark_ur UsmBatchMemoryAllocation usmMemoryPlacement:Device allocationCount:128 size:131072 measureMode:Both": { - "type": "benchmark", - "description": "Measures memory allocation overhead by allocating 131072 bytes of usm Device memory 128 times, then free'ing it all at once. Both memory allocation and memory free are timed. ", - "notes": null, - "unstable": null, - "tags": [ - "UR", - "micro", - "latency", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "UR UsmBatchMemoryAllocation, usmMemoryPlacement Device, allocationCount 128, size 131072, measureMode Both", - "explicit_group": "UsmBatchMemoryAllocation" - }, - "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 1 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" - }, - "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:1, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 1 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 1, allocSize 1024, srcUSM 1, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 1, allocSize: 1024" - }, - "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload without barrier. 
", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" - }, - "multithread_benchmark_syclpreview MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:1 dstUSM:1 without events with barrier": { - "type": "benchmark", - "description": "Measures multithreaded memory copy performance with 4 threads each performing 4096 operations on 1024 bytes from device to device memory without events with driver copy offload with barrier. ", - "notes": null, - "unstable": null, - "tags": [ - "memory", - "latency", - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "UR MemcpyExecute, opsPerThread 4096, numThreads 4, allocSize 1024, srcUSM 1, dstUSM 1, without events", - "explicit_group": "MemcpyExecute, opsPerThread: 4096, numThreads: 4, allocSize: 1024" - }, - "Velocity-Bench Hashtable": { - "type": "benchmark", - "description": "Measures hash table search performance using an efficient lock-free algorithm with linear probing. Reports throughput in millions of keys processed per second. Higher values indicate better performance.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "throughput" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench Hashtable", - "explicit_group": "" - }, - "Velocity-Bench Bitcracker": { - "type": "benchmark", - "description": "Password-cracking application for BitLocker-encrypted memory units. Uses dictionary attack to find user or recovery passwords. Measures total time required to process 60000 passwords.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "throughput" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench Bitcracker", - "explicit_group": "" - }, - "Velocity-Bench CudaSift": { - "type": "benchmark", - "description": "Implementation of the SIFT (Scale Invariant Feature Transform) algorithm for detecting, describing, and matching local features in images. Measures average processing time in milliseconds.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "image" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench CudaSift", - "explicit_group": "" - }, - "Velocity-Bench Easywave": { - "type": "benchmark", - "description": "A tsunami wave simulator used for researching tsunami generation and wave propagation. Measures the elapsed time in milliseconds to simulate a specified tsunami event based on real-world data.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "simulation" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench Easywave", - "explicit_group": "" - }, - "Velocity-Bench QuickSilver": { - "type": "benchmark", - "description": "Solves a simplified dynamic Monte Carlo particle-transport problem used in HPC. Replicates memory access patterns, communication patterns, and branching of Mercury workloads. 
Reports a figure of merit in MMS/CTT where higher values indicate better performance.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "simulation", - "throughput" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench QuickSilver", - "explicit_group": "" - }, - "Velocity-Bench Sobel Filter": { - "type": "benchmark", - "description": "Popular RGB-to-grayscale image conversion technique that applies a gaussian filter to reduce edge artifacts. Processes a large 32K x 32K image and measures the time required to apply the filter.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "image", - "throughput" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench Sobel Filter", - "explicit_group": "" - }, - "Velocity-Bench dl-cifar": { - "type": "benchmark", - "description": "Deep learning image classification workload based on the CIFAR-10 dataset of 60,000 32x32 color images in 10 classes. Uses neural networks to classify input images and measures total calculation time.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "inference", - "image" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench dl-cifar", - "explicit_group": "" - }, - "Velocity-Bench dl-mnist": { - "type": "benchmark", - "description": "Digit recognition based on the MNIST database, one of the oldest and most popular databases of handwritten digits. Uses neural networks to identify digits and measures total calculation time.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "inference", - "image" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench dl-mnist", - "explicit_group": "" - }, - "Velocity-Bench svm": { - "type": "benchmark", - "description": "Implementation of Support Vector Machine, a popular classical machine learning technique. Uses supervised learning models with associated algorithms to analyze data for classification and regression analysis. 
Measures total elapsed time.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "inference" - ], - "range_min": null, - "range_max": null, - "display_name": "Velocity-Bench svm", - "explicit_group": "" - }, - "SYCL-Bench IndependentDAGTaskThroughput_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench IndependentDAGTaskThroughput_multi", - "explicit_group": "" - }, - "SYCL-Bench DAGTaskThroughput_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench DAGTaskThroughput_multi", - "explicit_group": "" - }, - "SYCL-Bench HostDeviceBandwidth_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench HostDeviceBandwidth_multi", - "explicit_group": "" - }, - "SYCL-Bench LocalMem_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro", - "memory" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench LocalMem_multi", - "explicit_group": "" - }, - "SYCL-Bench ScalarProduct_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench ScalarProduct_multi", - "explicit_group": "" - }, - "SYCL-Bench Pattern_SegmentedReduction_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench Pattern_SegmentedReduction_multi", - "explicit_group": "" - }, - "SYCL-Bench USM_Allocation_latency_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench USM_Allocation_latency_multi", - "explicit_group": "" - }, - "SYCL-Bench VectorAddition_multi": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench VectorAddition_multi", - "explicit_group": "" - }, - "SYCL-Bench 2mm": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench 2mm", - "explicit_group": "" - }, - "SYCL-Bench 3mm": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench 3mm", - "explicit_group": "" - }, - "SYCL-Bench Atax": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench Atax", - "explicit_group": "" - }, - "SYCL-Bench Bicg": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench Bicg", - "explicit_group": "" - }, - 
"SYCL-Bench Kmeans": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench Kmeans", - "explicit_group": "" - }, - "SYCL-Bench LinearRegressionCoeff": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench LinearRegressionCoeff", - "explicit_group": "" - }, - "SYCL-Bench MolecularDynamics": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench MolecularDynamics", - "explicit_group": "" - }, - "SYCL-Bench sf_16": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "micro" - ], - "range_min": null, - "range_max": null, - "display_name": "SYCL-Bench sf_16", - "explicit_group": "" - }, - "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf": { - "type": "benchmark", - "description": "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. Runs both prompt processing (initial context processing) and text generation benchmarks with different batch sizes. Higher values indicate better performance. Uses the DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf quantized model and leverages SYCL with oneDNN for acceleration.", - "notes": null, - "unstable": null, - "tags": [ - "SYCL", - "application", - "inference", - "throughput" - ], - "range_min": null, - "range_max": null, - "display_name": "llama.cpp DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf", - "explicit_group": "" - }, - "umf-benchmark": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "umf-benchmark", - "explicit_group": "" - }, - "gromacs-0006-pme-graphs": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "gromacs-0006-pme-graphs", - "explicit_group": "" - }, - "gromacs-0006-pme-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "gromacs-0006-pme-eager", - "explicit_group": "" - }, - "gromacs-0006-rf-graphs": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "gromacs-0006-rf-graphs", - "explicit_group": "" - }, - "gromacs-0006-rf-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "gromacs-0006-rf-eager", - "explicit_group": "" - }, - "onednn-sum-f16-1-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-f16-1-eager", - "explicit_group": "sum-f16-1" - }, - "onednn-sum-f16-2-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-f16-2-eager", - "explicit_group": "sum-f16-2" - }, - "onednn-sum-f32-1-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": 
null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-f32-1-eager", - "explicit_group": "sum-f32-1" - }, - "onednn-sum-f32-2-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-f32-2-eager", - "explicit_group": "sum-f32-2" - }, - "onednn-sum-padding-1-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-padding-1-eager", - "explicit_group": "sum-padding-1" - }, - "onednn-sum-padding-1-graph": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-padding-1-graph", - "explicit_group": "sum-padding-1" - }, - "onednn-sum-padding-2-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-padding-2-eager", - "explicit_group": "sum-padding-2" - }, - "onednn-sum-padding-2-graph": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-sum-padding-2-graph", - "explicit_group": "sum-padding-2" - }, - "onednn-graph-sdpa-plain-f16-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-graph-sdpa-plain-f16-eager", - "explicit_group": "graph-sdpa-plain-f16" - }, - "onednn-graph-sdpa-plain-f16-graph": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-graph-sdpa-plain-f16-graph", - "explicit_group": "graph-sdpa-plain-f16" - }, - "onednn-graph-sdpa-plain-f32-eager": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-graph-sdpa-plain-f32-eager", - "explicit_group": "graph-sdpa-plain-f32" - }, - "onednn-graph-sdpa-plain-f32-graph": { - "type": "benchmark", - "description": "", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "onednn-graph-sdpa-plain-f32-graph", - "explicit_group": "graph-sdpa-plain-f32" - }, - "Foo Group": { - "type": "group", - "description": "This is a test benchmark for Foo Group.", - "notes": "This is a test note for Foo Group.\nLook, multiple lines!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "Bar Group": { - "type": "group", - "description": "This is a test benchmark for Bar Group.", - "notes": null, - "unstable": "This is an unstable note for Bar Group.", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": null, - "explicit_group": null - }, - "Memory Bandwidth 1": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 1.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 1", - "explicit_group": "" - }, - "Memory Bandwidth 2": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 2.", - "notes": null, - 
"unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 2", - "explicit_group": "" - }, - "Memory Bandwidth 3": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 3.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 3", - "explicit_group": "" - }, - "Memory Bandwidth 4": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 4.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 4", - "explicit_group": "" - }, - "Memory Bandwidth 5": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 5.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 5", - "explicit_group": "" - }, - "Memory Bandwidth 6": { - "type": "benchmark", - "description": "This is a test benchmark for Memory Bandwidth 6.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Memory Bandwidth 6", - "explicit_group": "" - }, - "Latency 1": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 1.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 1", - "explicit_group": "" - }, - "Latency 2": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 2.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 2", - "explicit_group": "" - }, - "Latency 3": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 3.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 3", - "explicit_group": "" - }, - "Latency 4": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 4.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 4", - "explicit_group": "" - }, - "Latency 5": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 5.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 5", - "explicit_group": "" - }, - "Latency 6": { - "type": "benchmark", - "description": "This is a test benchmark for Latency 6.", - "notes": "A Latency test note!", - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Latency 6", - "explicit_group": "" - }, - "Throughput 1": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 1.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Throughput 1", - "explicit_group": "" - }, - "Throughput 2": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 2.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Throughput 2", - "explicit_group": "" - }, - "Throughput 3": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 3.", - "notes": null, - "unstable": null, - "tags": 
[], - "range_min": null, - "range_max": null, - "display_name": "Throughput 3", - "explicit_group": "" - }, - "Throughput 4": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 4.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Throughput 4", - "explicit_group": "" - }, - "Throughput 5": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 5.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Throughput 5", - "explicit_group": "" - }, - "Throughput 6": { - "type": "benchmark", - "description": "This is a test benchmark for Throughput 6.", - "notes": null, - "unstable": null, - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Throughput 6", - "explicit_group": "" - }, - "FLOPS 1": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 1.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 1", - "explicit_group": "" - }, - "FLOPS 2": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 2.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 2", - "explicit_group": "" - }, - "FLOPS 3": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 3.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 3", - "explicit_group": "" - }, - "FLOPS 4": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 4.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 4", - "explicit_group": "" - }, - "FLOPS 5": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 5.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 5", - "explicit_group": "" - }, - "FLOPS 6": { - "type": "benchmark", - "description": "This is a test benchmark for FLOPS 6.", - "notes": null, - "unstable": "Unstable FLOPS test!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "FLOPS 6", - "explicit_group": "" - }, - "Cache Miss Rate 1": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 1.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 1", - "explicit_group": "" - }, - "Cache Miss Rate 2": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 2.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 2", - "explicit_group": "" - }, - "Cache Miss Rate 3": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 3.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 3", - "explicit_group": "" - }, - "Cache Miss Rate 4": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 4.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": 
[], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 4", - "explicit_group": "" - }, - "Cache Miss Rate 5": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 5.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 5", - "explicit_group": "" - }, - "Cache Miss Rate 6": { - "type": "benchmark", - "description": "This is a test benchmark for Cache Miss Rate 6.", - "notes": "Test Note", - "unstable": "And another note!", - "tags": [], - "range_min": null, - "range_max": null, - "display_name": "Cache Miss Rate 6", - "explicit_group": "" - } -}; +benchmarkRuns = []; -benchmarkTags = { - "SYCL": { - "name": "SYCL", - "description": "Benchmark uses SYCL runtime" - }, - "UR": { - "name": "UR", - "description": "Benchmark uses Unified Runtime API" - }, - "L0": { - "name": "L0", - "description": "Benchmark uses Level Zero API directly" - }, - "UMF": { - "name": "UMF", - "description": "Benchmark uses Unified Memory Framework directly" - }, - "micro": { - "name": "micro", - "description": "Microbenchmark focusing on a specific functionality" - }, - "application": { - "name": "application", - "description": "Real application-based performance test" - }, - "proxy": { - "name": "proxy", - "description": "Benchmark that simulates real application use-cases" - }, - "submit": { - "name": "submit", - "description": "Tests kernel submission performance" - }, - "math": { - "name": "math", - "description": "Tests math computation performance" - }, - "memory": { - "name": "memory", - "description": "Tests memory transfer or bandwidth performance" - }, - "allocation": { - "name": "allocation", - "description": "Tests memory allocation performance" - }, - "graph": { - "name": "graph", - "description": "Tests graph-based execution performance" - }, - "latency": { - "name": "latency", - "description": "Measures operation latency" - }, - "throughput": { - "name": "throughput", - "description": "Measures operation throughput" - }, - "inference": { - "name": "inference", - "description": "Tests ML/AI inference performance" - }, - "image": { - "name": "image", - "description": "Image processing benchmark" - }, - "simulation": { - "name": "simulation", - "description": "Physics or scientific simulation benchmark" - } -}; - -defaultCompareNames = [ - "This PR" -]; +defaultCompareNames = []; diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index 13a5830df7a04..f262e4470ee6f 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -17,8 +17,9 @@ from output_markdown import generate_markdown from output_html import generate_html from history import BenchmarkHistory -from utils.utils import prepare_workdir, git_clone, run +from utils.utils import prepare_workdir from utils.compute_runtime import * +from utils.unitrace import download_and_build_unitrace from utils.validate import Validate from utils.detect_versions import DetectVersion from presets import enabled_suites, presets @@ -34,44 +35,6 @@ INTERNAL_WORKDIR_VERSION = "2.0" -def download_and_build_unitrace(workdir): - repo_dir = git_clone( - workdir, - "pti-gpu-repo", - "https://github.com/intel/pti-gpu.git", - "master", - ) - build_dir = os.path.join(workdir, "unitrace-build") - unitrace_src = os.path.join(repo_dir, "tools", "unitrace") - os.makedirs(build_dir, exist_ok=True) - - unitrace_exe = os.path.join(build_dir, "unitrace") - if not 
os.path.isfile(unitrace_exe): - run( - [ - "cmake", - f"-S {unitrace_src}", - f"-B {build_dir}", - "-DCMAKE_BUILD_TYPE=Release", - "-DCMAKE_CXX_COMPILER=clang++", - "-DCMAKE_C_COMPILER=clang", - "-DBUILD_WITH_L0=1", - "-DBUILD_WITH_OPENCL=0", - "-DBUILD_WITH_ITT=1", - "-DBUILD_WITH_XPTI=1", - "-DBUILD_WITH_MPI=0", - ], - ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], - add_sycl=True, - ) - run( - ["cmake", "--build", build_dir, "-j"], - ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], - add_sycl=True, - ) - print("Unitrace built successfully.") - - def run_iterations( benchmark: Benchmark, env_vars, diff --git a/devops/scripts/benchmarks/utils/unitrace.py b/devops/scripts/benchmarks/utils/unitrace.py index 87458c9e96967..1597d2c23dbd5 100644 --- a/devops/scripts/benchmarks/utils/unitrace.py +++ b/devops/scripts/benchmarks/utils/unitrace.py @@ -7,7 +7,8 @@ import re from options import options -import stat as statmod +from utils.utils import run, git_clone +from utils.oneapi import get_oneapi def extract_save_name_and_timestamp(dirname): @@ -141,3 +142,41 @@ def handle_unitrace_output(bench_dir, unitrace_output, timestamp): # Prune old unitrace directories prune_unitrace_dirs(options.unitrace_res_dir, FILECNT=5) + + +def download_and_build_unitrace(workdir): + repo_dir = git_clone( + workdir, + "pti-gpu-repo", + "https://github.com/intel/pti-gpu.git", + "master", + ) + build_dir = os.path.join(workdir, "unitrace-build") + unitrace_src = os.path.join(repo_dir, "tools", "unitrace") + os.makedirs(build_dir, exist_ok=True) + + unitrace_exe = os.path.join(build_dir, "unitrace") + if not os.path.isfile(unitrace_exe): + run( + [ + "cmake", + f"-S {unitrace_src}", + f"-B {build_dir}", + "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_CXX_COMPILER=clang++", + "-DCMAKE_C_COMPILER=clang", + "-DBUILD_WITH_L0=1", + "-DBUILD_WITH_OPENCL=0", + "-DBUILD_WITH_ITT=1", + "-DBUILD_WITH_XPTI=1", + "-DBUILD_WITH_MPI=0", + ], + ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], + add_sycl=True, + ) + run( + ["cmake", "--build", build_dir, "-j"], + ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"], + add_sycl=True, + ) + print("Unitrace built successfully.") From deb9aaf90f40535ed0f86da282a592393a8846ab Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 8 Jul 2025 12:47:37 +0000 Subject: [PATCH 5/6] apply comments Signed-off-by: Mateusz P. 
Nowak --- devops/scripts/benchmarks/benches/base.py | 29 +- devops/scripts/benchmarks/benches/benchdnn.py | 13 +- .../benchmarks/benches/benchdnn_list.py | 5 - devops/scripts/benchmarks/benches/compute.py | 73 +--- devops/scripts/benchmarks/benches/gromacs.py | 4 +- devops/scripts/benchmarks/benches/llamacpp.py | 4 +- .../scripts/benchmarks/benches/syclbench.py | 8 +- devops/scripts/benchmarks/benches/test.py | 2 +- devops/scripts/benchmarks/benches/umf.py | 4 +- devops/scripts/benchmarks/benches/velocity.py | 8 +- devops/scripts/benchmarks/history.py | 15 +- devops/scripts/benchmarks/main.py | 72 ++-- devops/scripts/benchmarks/options.py | 6 +- devops/scripts/benchmarks/utils/oneapi.py | 1 - devops/scripts/benchmarks/utils/unitrace.py | 341 +++++++++--------- 15 files changed, 254 insertions(+), 331 deletions(-) diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py index 80854d7e17248..32aa3ab220d9c 100644 --- a/devops/scripts/benchmarks/benches/base.py +++ b/devops/scripts/benchmarks/benches/base.py @@ -3,7 +3,6 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from dataclasses import dataclass import os import shutil import subprocess @@ -12,7 +11,7 @@ from options import options from utils.utils import download, run from abc import ABC, abstractmethod -import utils.unitrace as unitrace +from utils.unitrace import get_unitrace benchmark_tags = [ BenchmarkTag("SYCL", "Benchmark uses SYCL runtime"), @@ -71,7 +70,7 @@ def teardown(self): pass @abstractmethod - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars: dict, run_unitrace: bool = False) -> list[Result]: pass @staticmethod @@ -93,8 +92,8 @@ def run_bench( ld_library=[], add_sycl=True, use_stdout=True, - unitrace_timestamp: str = None, - extra_unitrace_opt=[], + run_unitrace=False, + extra_unitrace_opt=None, ): env_vars = env_vars.copy() if options.ur is not None: @@ -107,9 +106,11 @@ def run_bench( ld_libraries = options.extra_ld_libraries.copy() ld_libraries.extend(ld_library) - if unitrace_timestamp is not None: - bench_dir, unitrace_output, command = unitrace.unitrace_prepare( - self.name(), unitrace_timestamp, command, extra_unitrace_opt + if run_unitrace: + if extra_unitrace_opt is None: + extra_unitrace_opt = [] + unitrace_output, command = get_unitrace().setup( + self.name(), command, extra_unitrace_opt ) try: @@ -120,15 +121,13 @@ def run_bench( cwd=options.benchmark_cwd, ld_library=ld_libraries, ) - except subprocess.CalledProcessError as e: - if unitrace_timestamp is not None: - unitrace.unitrace_cleanup(options.benchmark_cwd, unitrace_output) + except subprocess.CalledProcessError: + if run_unitrace: + get_unitrace().cleanup(options.benchmark_cwd, unitrace_output) raise - if unitrace_timestamp is not None: - unitrace.handle_unitrace_output( - bench_dir, unitrace_output, unitrace_timestamp - ) + if run_unitrace: + get_unitrace().handle_output(unitrace_output) if use_stdout: return result.stdout.decode() diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py index 76def7fde77ec..d349789b0ce6f 100644 --- a/devops/scripts/benchmarks/benches/benchdnn.py +++ b/devops/scripts/benchmarks/benches/benchdnn.py @@ -9,7 +9,7 @@ from utils.utils import git_clone, run, create_build_path from utils.result import Result from utils.oneapi import get_oneapi -from .benchdnn_list import get_bench_dnn_list, unitrace_exclusion_list +from .benchdnn_list import 
get_bench_dnn_list class OneDnnBench(Suite): @@ -129,7 +129,7 @@ def setup(self): if not self.bench_bin.exists(): raise FileNotFoundError(f"Benchmark binary not found: {self.bench_bin}") - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars, run_unitrace: bool = False) -> list[Result]: command = [ str(self.bench_bin), *self.bench_args.split(), @@ -142,20 +142,13 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: env_vars = dict(env_vars) if env_vars else {} env_vars["ONEAPI_DEVICE_SELECTOR"] = "level_zero:*" - if self.name() in unitrace_exclusion_list: - if options.verbose: - print( - f"[{self.name()}] Skipping benchmark due to unitrace exclusion list." - ) - unitrace_timestamp = None - output = self.run_bench( command, env_vars, add_sycl=True, ld_library=ld_library, use_stdout=True, - unitrace_timestamp=unitrace_timestamp, + run_unitrace=run_unitrace, extra_unitrace_opt=["--chrome-dnn-logging"], ) result_value = self._extract_time(output) diff --git a/devops/scripts/benchmarks/benches/benchdnn_list.py b/devops/scripts/benchmarks/benches/benchdnn_list.py index cc5929a5f07da..53721ec1fa178 100644 --- a/devops/scripts/benchmarks/benches/benchdnn_list.py +++ b/devops/scripts/benchmarks/benches/benchdnn_list.py @@ -3,11 +3,6 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -unitrace_exclusion_list = [ - "onednn-graph-sdpa-plain-f32-eager", - "onednn-graph-sdpa-plain-f32-graph", -] - # entry format: # [bench_driver, bench_name, bench_args, rungraph] # bench_driver is the name of the benchdnn driver, e.g. "sum", "graph", etc. diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 2da64c9a0e371..3748079ed10da 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -225,70 +225,6 @@ def parse_unit_type(compute_unit): class ComputeBenchmark(Benchmark): - # list of benchmarks to exclude from unitrace due to SIGSEGV, SIGABRT or timeouts - unitrace_exclusion_list = [ - "api_overhead_benchmark_l0 SubmitKernel in order not using events KernelExecTime=20", - "api_overhead_benchmark_l0 SubmitKernel in order not using events", - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events KernelExecTime=20", - "api_overhead_benchmark_l0 SubmitKernel in order with measure completion not using events", - "api_overhead_benchmark_sycl SubmitKernel in order not using events KernelExecTime=20", - "api_overhead_benchmark_sycl SubmitKernel in order not using events", - "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events KernelExecTime=20", - "api_overhead_benchmark_sycl SubmitKernel in order with measure completion not using events", - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events KernelExecTime=20", - "api_overhead_benchmark_syclpreview SubmitKernel in order not using events", - "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events KernelExecTime=20", - "api_overhead_benchmark_syclpreview SubmitKernel in order with measure completion not using events", - "api_overhead_benchmark_ur SubmitKernel in order not using events KernelExecTime=20", - "api_overhead_benchmark_ur SubmitKernel in order not using events", - "api_overhead_benchmark_ur SubmitKernel in order with measure completion not using events KernelExecTime=20", - "api_overhead_benchmark_ur SubmitKernel in 
order with measure completion not using events", - "api_overhead_benchmark_ur SubmitKernel out of order not using events KernelExecTime=20", - "api_overhead_benchmark_ur SubmitKernel out of order not using events", - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events KernelExecTime=20", - "api_overhead_benchmark_ur SubmitKernel out of order with measure completion not using events", - "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:5", - "graph_api_benchmark_l0 SinKernelGraph graphs:0, numKernels:100", - "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:5", - "graph_api_benchmark_l0 SinKernelGraph graphs:1, numKernels:100", - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 0 measureCompletion 1", - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:10 ioq 1 measureCompletion 1", - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 0 measureCompletion 1", - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:32 ioq 1 measureCompletion 1", - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 0 measureCompletion 1", - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 0", - "graph_api_benchmark_l0 SubmitGraph numKernels:4 ioq 1 measureCompletion 1", - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Gromacs", - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:0 graphStructure:Llama", - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Gromacs", - "graph_api_benchmark_sycl FinalizeGraph rebuildGraphEveryIter:1 graphStructure:Llama", - "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:100", - "graph_api_benchmark_ur SinKernelGraph graphs:0, numKernels:5", - "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:100", - "graph_api_benchmark_ur SinKernelGraph graphs:1, numKernels:5", - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 0", - "graph_api_benchmark_ur SubmitGraph numKernels:4 ioq 1 measureCompletion 1", - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 0", - "graph_api_benchmark_ur SubmitGraph numKernels:10 ioq 0 measureCompletion 1", - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 0", - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 0 measureCompletion 1", - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 0", - "graph_api_benchmark_ur SubmitGraph numKernels:32 ioq 1 measureCompletion 1", - "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:1 dstUSM:1", - "multithread_benchmark_ur MemcpyExecute opsPerThread:400, numThreads:1, allocSize:102400 srcUSM:0 dstUSM:1", - "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events", - "multithread_benchmark_ur MemcpyExecute opsPerThread:100, numThreads:4, allocSize:102400 srcUSM:1 dstUSM:1 without events without copy offload", - "multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events", - 
"multithread_benchmark_ur MemcpyExecute opsPerThread:4096, numThreads:4, allocSize:1024 srcUSM:0 dstUSM:1 without events with barrier", - "memory_benchmark_sycl StreamMemory, placement Device, type Triad, size 10240", - "miscellaneous_benchmark_sycl VectorSum", - ] - def __init__(self, bench, name, test, runtime: RUNTIMES = None): super().__init__(bench.directory, bench) self.bench = bench @@ -345,7 +281,7 @@ def explicit_group(self): def description(self) -> str: return "" - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars, run_unitrace: bool = False) -> list[Result]: command = [ f"{self.benchmark_bin}", f"--test={self.test}", @@ -356,11 +292,10 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: command += self.bin_args() env_vars.update(self.extra_env_vars()) - if self.name() in self.unitrace_exclusion_list: - unitrace_timestamp = None - result = self.run_bench( - command, env_vars, unitrace_timestamp=unitrace_timestamp + command, + env_vars, + run_unitrace=run_unitrace, ) parsed_results = self.parse_output(result) ret = [] diff --git a/devops/scripts/benchmarks/benches/gromacs.py b/devops/scripts/benchmarks/benches/gromacs.py index 0ccf6b0738037..5e1843876f7fa 100644 --- a/devops/scripts/benchmarks/benches/gromacs.py +++ b/devops/scripts/benchmarks/benches/gromacs.py @@ -162,7 +162,7 @@ def setup(self): ld_library=self.suite.oneapi.ld_libraries(), ) - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars, run_unitrace: bool = False) -> list[Result]: model_dir = self.grappa_dir / self.model env_vars.update({"SYCL_CACHE_PERSISTENT": "1"}) @@ -201,7 +201,7 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: add_sycl=True, use_stdout=False, ld_library=self.suite.oneapi.ld_libraries(), - unitrace_timestamp=unitrace_timestamp, + run_unitrace=run_unitrace, ) if not self._validate_correctness(options.benchmark_cwd + "/md.log"): diff --git a/devops/scripts/benchmarks/benches/llamacpp.py b/devops/scripts/benchmarks/benches/llamacpp.py index 58d5d72d4f510..d025357fcd8f6 100644 --- a/devops/scripts/benchmarks/benches/llamacpp.py +++ b/devops/scripts/benchmarks/benches/llamacpp.py @@ -115,7 +115,7 @@ def get_tags(self): def lower_is_better(self): return False - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars, run_unitrace: bool = False) -> list[Result]: command = [ f"{self.benchmark_bin}", "--output", @@ -144,7 +144,7 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: command, env_vars, ld_library=self.bench.oneapi.ld_libraries(), - unitrace_timestamp=unitrace_timestamp, + run_unitrace=run_unitrace, ) parsed = self.parse_output(result) results = [] diff --git a/devops/scripts/benchmarks/benches/syclbench.py b/devops/scripts/benchmarks/benches/syclbench.py index 152d23223f1c2..65bc26ac46d18 100644 --- a/devops/scripts/benchmarks/benches/syclbench.py +++ b/devops/scripts/benchmarks/benches/syclbench.py @@ -137,7 +137,7 @@ def setup(self): self.directory, "sycl-bench-build", self.bench_name ) - def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: + def run(self, env_vars, run_unitrace: bool = False) -> list[Result]: self.outputfile = os.path.join(self.bench.directory, self.test + ".csv") command = [ @@ -151,7 +151,11 @@ def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]: env_vars.update(self.extra_env_vars()) # no output to stdout, all in outputfile 
-        self.run_bench(command, env_vars, unitrace_timestamp=unitrace_timestamp)
+        self.run_bench(
+            command,
+            env_vars,
+            run_unitrace=run_unitrace,
+        )

         with open(self.outputfile, "r") as f:
             reader = csv.reader(f)
diff --git a/devops/scripts/benchmarks/benches/test.py b/devops/scripts/benchmarks/benches/test.py
index 6736b6a9f499d..e7d40f5380e06 100644
--- a/devops/scripts/benchmarks/benches/test.py
+++ b/devops/scripts/benchmarks/benches/test.py
@@ -88,7 +88,7 @@ def notes(self) -> str:
     def unstable(self) -> str:
         return self.unstable_text

-    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
+    def run(self, env_vars, run_unitrace: bool = False) -> list[Result]:
         random_value = self.value + random.uniform(-1 * (self.diff), self.diff)
         return [
             Result(
diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py
index fb8703e892476..0cb2a97f3ed9d 100644
--- a/devops/scripts/benchmarks/benches/umf.py
+++ b/devops/scripts/benchmarks/benches/umf.py
@@ -138,7 +138,7 @@ def get_names_of_benchmarks_to_be_run(self, command, env_vars):

         return all_names

-    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
+    def run(self, env_vars, run_unitrace: bool = False) -> list[Result]:
         command = [f"{self.benchmark_bin}"]

         all_names = self.get_names_of_benchmarks_to_be_run(command, env_vars)
@@ -156,7 +156,7 @@
             env_vars,
             add_sycl=False,
             ld_library=[self.umf_lib],
-            unitrace_timestamp=unitrace_timestamp,
+            run_unitrace=run_unitrace,
         )

         parsed = self.parse_output(result)
diff --git a/devops/scripts/benchmarks/benches/velocity.py b/devops/scripts/benchmarks/benches/velocity.py
index 01d10128f0b10..a810674c3f984 100644
--- a/devops/scripts/benchmarks/benches/velocity.py
+++ b/devops/scripts/benchmarks/benches/velocity.py
@@ -130,7 +130,7 @@ def description(self) -> str:
     def get_tags(self):
         return ["SYCL", "application"]

-    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
+    def run(self, env_vars, run_unitrace: bool = False) -> list[Result]:
         env_vars.update(self.extra_env_vars())

         command = [
@@ -142,7 +142,7 @@
             command,
             env_vars,
             ld_library=self.ld_libraries(),
-            unitrace_timestamp=unitrace_timestamp,
+            run_unitrace=run_unitrace,
         )

         return [
@@ -287,7 +287,7 @@ class QuickSilver(VelocityBase):
     def __init__(self, vb: VelocityBench):
         super().__init__("QuickSilver", "qs", vb, "MMS/CTT")

-    def run(self, env_vars, unitrace_timestamp: str = None) -> list[Result]:
+    def run(self, env_vars, run_unitrace: bool = False) -> list[Result]:
         # TODO: fix the crash in QuickSilver when UR_L0_USE_IMMEDIATE_COMMANDLISTS=0
         if (
             "UR_L0_USE_IMMEDIATE_COMMANDLISTS" in env_vars
@@ -295,7 +295,7 @@
         ):
             return None

-        return super().run(env_vars, unitrace_timestamp=unitrace_timestamp)
+        return super().run(env_vars, run_unitrace=run_unitrace)

     def name(self):
         return "Velocity-Bench QuickSilver"
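The same mechanical change repeats across every suite above: run() gains a run_unitrace flag and simply forwards it to run_bench(), which performs the actual unitrace wrapping. For anyone adding a new suite against this revision, the pattern is a few lines. This is a minimal sketch only; FooBench, its binary path, and parse_output() are hypothetical and not part of this patch:

    # Sketch: a hypothetical suite benchmark forwarding the new flag.
    from .base import Benchmark
    from utils.result import Result

    class FooBench(Benchmark):
        def name(self):
            return "foo-bench"

        def run(self, env_vars, run_unitrace: bool = False) -> list[Result]:
            command = [f"{self.benchmark_bin}", "--reps=100"]
            output = self.run_bench(
                command,
                env_vars,
                run_unitrace=run_unitrace,  # run_bench() wraps the command when set
            )
            return self.parse_output(output)  # suite-specific parsing, assumed here

The benchmark itself never touches unitrace state; it only threads the flag through, which keeps tracing concerns confined to the base class.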
             compute_runtime=compute_runtime,
         )
 
-    def save(self, save_name, timestamp, results: list[Result], to_file=True):
+    def save(self, save_name, results: list[Result]):
         benchmark_data = self.create_run(save_name, results)
         self.runs.append(benchmark_data)
 
-        if not to_file:
+        if options.save_name is None:
             return
 
-        serialized = benchmark_data.to_json()
+        serialized = benchmark_data.to_json()  # type: ignore
         results_dir = Path(os.path.join(self.dir, "results"))
         os.makedirs(results_dir, exist_ok=True)
 
-        # Use formatted timestamp for the filename
-        if timestamp is None:
+        if get_unitrace() is not None:
+            timestamp = get_unitrace().timestamp  # type: ignore
+        elif options.timestamp_override is not None:
+            timestamp = options.timestamp_override
+        else:
             timestamp = (
                 datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
                 if options.timestamp_override is None
                 else options.timestamp_override
             )
+
         file_path = Path(os.path.join(results_dir, f"{save_name}_{timestamp}.json"))
         with file_path.open("w") as file:
             json.dump(serialized, file, indent=4)
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index f262e4470ee6f..06b72dde2e46b 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -19,13 +19,10 @@
 from history import BenchmarkHistory
 from utils.utils import prepare_workdir
 from utils.compute_runtime import *
-from utils.unitrace import download_and_build_unitrace
 from utils.validate import Validate
 from utils.detect_versions import DetectVersion
+from utils.unitrace import get_unitrace, create_unitrace
 from presets import enabled_suites, presets
-from utils.oneapi import get_oneapi
-from datetime import datetime, timezone
-
 import argparse
 import re
 import statistics
@@ -41,14 +38,15 @@ def run_iterations(
     iters: int,
     results: dict[str, list[Result]],
     failures: dict[str, str],
-    unitrace_timestamp: str = None,
+    run_unitrace: bool = False,
 ):
     for iter in range(iters):
-        if unitrace_timestamp is not None:
-            print(f"running {benchmark.name()} with Unitrace", flush=True)
+        if run_unitrace:
+            print(f"running {benchmark.name()} with Unitrace... ", flush=True)
         else:
             print(f"running {benchmark.name()}, iteration {iter}... ", flush=True)
-        bench_results = benchmark.run(env_vars, unitrace_timestamp=unitrace_timestamp)
+
+        bench_results = benchmark.run(env_vars, run_unitrace=run_unitrace)
         if bench_results is None:
             if options.exit_on_failure:
                 raise RuntimeError(f"Benchmark {benchmark.name()} produced no results!")
@@ -173,13 +171,19 @@ def collect_metadata(suites):
 def main(directory, additional_env_vars, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)
 
-    if options.unitrace_only or options.unitrace_inclusive:
-        print("Downloading and building Unitrace...")
-        download_and_build_unitrace(options.workdir)
-        if options.results_directory_override == None:
-            options.unitrace_res_dir = os.path.join(directory, "results")
-        else:
-            options.unitrace_res_dir = options.results_directory_override
+    if args.unitrace == "inclusive":
+        create_unitrace(inclusive=True)
+    elif args.unitrace is True:
+        create_unitrace(inclusive=False)
+    elif args.unitrace is not None:
+        parser.error(
+            "Invalid value for --unitrace: pass 'inclusive' to trace alongside the regular benchmark runs, or pass the flag without a value to run tracing only."
+        )
+
+    if get_unitrace() is not None and options.save_name is None:
+        raise ValueError(
+            "Unitrace requires a save name to be specified via the --save option."
+        )
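# For reference, the three accepted invocations (a sketch of the intended
# semantics; --save supplies options.save_name):
#
#     main.py --save baseline --unitrace             ->  args.unitrace is True        (traced run only)
#     main.py --save baseline --unitrace inclusive   ->  args.unitrace == "inclusive" (regular runs plus one traced run)
#     main.py --save baseline                        ->  args.unitrace is None        (tracing disabled)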
 
     if options.build_compute_runtime:
         print(f"Setting up Compute Runtime {options.compute_runtime_tag}")
@@ -236,12 +240,6 @@ def main(directory, additional_env_vars, compare_names, filter):
             print(f"{type(s).__name__} setup complete.")
         benchmarks += suite_benchmarks
 
-    timestamp = (
-        datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
-        if options.timestamp_override is None
-        else options.timestamp_override
-    )
-
     for benchmark in benchmarks:
         try:
             if options.verbose:
@@ -263,7 +261,7 @@ def main(directory, additional_env_vars, compare_names, filter):
             intermediate_results: dict[str, list[Result]] = {}
             processed: list[Result] = []
             # regular run of the benchmark
-            if not options.unitrace_only:
+            if get_unitrace() is None or get_unitrace().inclusive:
                 for _ in range(options.iterations_stddev):
                     run_iterations(
                         benchmark,
@@ -271,23 +269,22 @@ def main(directory, additional_env_vars, compare_names, filter):
                         options.iterations,
                         intermediate_results,
                         failures,
-                        unitrace_timestamp=None,
+                        run_unitrace=False,
                     )
                     valid, processed = process_results(
                         intermediate_results, benchmark.stddev_threshold()
                     )
                     if valid:
                         break
-            # unitrace run of the benchmark
-            if options.unitrace_inclusive or options.unitrace_only:
-                # set the timestamp to enable unitrace run and save results with proper file names
+            # single unitrace run independent of benchmark iterations
+            if get_unitrace() is not None:
                 run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
                     intermediate_results,
                     failures,
-                    unitrace_timestamp=timestamp,
+                    run_unitrace=True,
                 )
             results += processed
         except Exception as e:
@@ -351,7 +348,7 @@ def main(directory, additional_env_vars, compare_names, filter):
     # we calculate historical averages or get latest results for compare.
     # Otherwise we might be comparing the results to themselves.
     if not options.dry_run:
-        history.save(saved_name, timestamp, results, options.save_name is not None)
+        history.save(saved_name, results)
 
     if saved_name not in compare_names:
         compare_names.append(saved_name)
@@ -546,14 +543,11 @@ def validate_and_parse_env_args(env_args):
     )
     parser.add_argument(
         "--unitrace",
-        action="store_true",
-        help="Unitrace tracing for sigle iteration of benchmarks",
-    )
-
-    parser.add_argument(
-        "--unitrace-inclusive",
-        action="store_true",
-        help="Regular run of benchmarks iterations and unitrace tracing in single additional run",
+        nargs="?",
+        const=True,
+        default=None,
+        help="Run a single traced iteration of each benchmark under Unitrace. Pass 'inclusive' to keep the regular benchmark iterations and add the traced run on top.",
+        choices=["inclusive", True],
     )
 
     # Options intended for CI:
@@ -661,8 +655,6 @@ def validate_and_parse_env_args(env_args):
     options.results_directory_override = args.results_dir
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
-    options.unitrace_only = args.unitrace
-    options.unitrace_inclusive = args.unitrace_inclusive
 
     if args.build_igc and args.compute_runtime is None:
         parser.error("--build-igc requires --compute-runtime to be set")
@@ -673,10 +665,6 @@ def validate_and_parse_env_args(env_args):
         if not os.path.isdir(args.output_dir):
             parser.error("Specified --output-dir is not a valid path")
         options.output_directory = os.path.abspath(args.output_dir)
-    if args.unitrace_inclusive and args.unitrace:
-        parser.error(
-            "--unitrace-inclusive and --unitrace are mutually exclusive, please specify only one of them"
-        )
 
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index a609e9998fd3d..7cc954b0d3775 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
 from enum import Enum
 import multiprocessing
+import os
 
 
 class Compare(Enum):
@@ -69,11 +70,8 @@ class Options:
     build_igc: bool = False
     current_run_name: str = "This PR"
     preset: str = "Full"
-    build_jobs: int = multiprocessing.cpu_count()
+    build_jobs: int = len(os.sched_getaffinity(0))  # Cores available to this process (Linux-only API).
     exit_on_failure: bool = False
-    unitrace_only: bool = False
-    unitrace_inclusive: bool = False
-    unitrace_res_dir: str = None
 
     # Options intended for CI:
     regression_threshold: float = 0.05
diff --git a/devops/scripts/benchmarks/utils/oneapi.py b/devops/scripts/benchmarks/utils/oneapi.py
index 0a477f6e246b1..80049149810ad 100644
--- a/devops/scripts/benchmarks/utils/oneapi.py
+++ b/devops/scripts/benchmarks/utils/oneapi.py
@@ -108,7 +108,6 @@ def ld_libraries(self):
 
 oneapi_instance = None
 
-
 def get_oneapi() -> OneAPI:  # oneAPI singleton
     if not hasattr(get_oneapi, "instance"):
         get_oneapi.instance = OneAPI()
diff --git a/devops/scripts/benchmarks/utils/unitrace.py b/devops/scripts/benchmarks/utils/unitrace.py
index 1597d2c23dbd5..399908898ddd7 100644
--- a/devops/scripts/benchmarks/utils/unitrace.py
+++ b/devops/scripts/benchmarks/utils/unitrace.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2024-2025 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -8,175 +8,182 @@
 
 from options import options
 from utils.utils import run, git_clone
-from utils.oneapi import get_oneapi
-
-
-def extract_save_name_and_timestamp(dirname):
-    """
-    Extracts (save_name, timestamp) from a directory name of the form {save_name}_{timestamp},
-    where timestamp is always 15 characters: YYYYMMDD_HHMMSS.
-    save_name may contain underscores.
-    """
-    m = re.match(r"(.+)_(\d{8}_\d{6})$", dirname)
-    if m:
-        return m.group(1), m.group(2)
-    return None, None
-
-
-def prune_unitrace_dirs(base_dir, FILECNT=10):
-    """
-    Keeps only FILECNT newest directories for each save_name group in base_dir.
- """ - dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] - groups = {} - for d in dirs: - save_name, ts = extract_save_name_and_timestamp(d) - if save_name and ts: - groups.setdefault(save_name, []).append((d, ts)) - for save_name, dirlist in groups.items(): - # Sort by timestamp string (lexicographically, works for YYYYMMDD_HHMMSS) - dirlist.sort(key=lambda x: x[1]) - if len(dirlist) > FILECNT: - for d, ts in dirlist[: len(dirlist) - FILECNT]: - full_path = os.path.join(base_dir, d) - print(f"Removing old unitrace dir: {full_path}") - shutil.rmtree(full_path) - - -def unitrace_cleanup(bench_cwd, unitrace_output): - # Remove .pid files from the benchmark directory and .json files from cwd - unitrace_dir = os.path.dirname(unitrace_output) - unitrace_base = os.path.basename(unitrace_output) - print(f"Cleanup unitrace output {unitrace_base} from {unitrace_dir}") - for f in os.listdir(unitrace_dir): - if f.startswith(unitrace_base + "."): - os.remove(os.path.join(unitrace_dir, f)) - print(f"Cleanup: Removed {f} from {unitrace_dir}") - if os.path.exists(bench_cwd): - for f in os.listdir(bench_cwd): - if f.endswith(".json"): - os.remove(os.path.join(bench_cwd, f)) - print(f"Cleanup: Removed {f} from {bench_cwd}") - - -def unitrace_prepare(name, unitrace_timestamp, command, extra_unitrace_opt=[]): - unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace") - if not os.path.exists(unitrace_bin): - raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}. ") - os.makedirs(options.unitrace_res_dir, exist_ok=True) - if not options.save_name: - raise ValueError( - "Unitrace requires a save name to be specified via --save option." - ) - bench_dir = f"{options.unitrace_res_dir}/{options.save_name}_{unitrace_timestamp}" - os.makedirs(bench_dir, exist_ok=True) - - unitrace_output = f"{bench_dir}/{name}_{unitrace_timestamp}" - unitrace_command = ( - [ - str(unitrace_bin), - "--call-logging", - "--host-timing", - "--device-timing", - "--chrome-sycl-logging", - "--chrome-call-logging", - "--chrome-kernel-logging", - "--output", - unitrace_output, - ] - + extra_unitrace_opt - + command - ) - if options.verbose: - print(f"Unitrace cmd: {' '.join(unitrace_command)}") - - return bench_dir, unitrace_output, unitrace_command - - -def handle_unitrace_output(bench_dir, unitrace_output, timestamp): - # Handle unitrace_output.{pid} logs: rename to unitrace_output (remove pid) - for f in os.listdir(bench_dir): - if f.startswith(os.path.basename(unitrace_output) + "."): - parts = f.rsplit(".", 1) - if ( - len(parts) == 2 - and parts[1].isdigit() - and os.path.isfile(os.path.join(bench_dir, f)) - ): - src = os.path.join(bench_dir, f) - dst = os.path.join(bench_dir, os.path.basename(unitrace_output)) - shutil.move(src, dst) - if options.verbose: - print(f"Renamed {src} to {dst}") - break - - # Handle {name}.{pid}.json files in cwd: move and rename to {self.name()}_{timestamp}.json - pid_json_files = [] - for f in os.listdir(options.benchmark_cwd): - parts = f.split(".") - l = len(parts) - if len(parts) >= 3 and parts[l - 1] == "json" and parts[l - 2].isdigit(): - pid_json_files.append(f) - - if len(pid_json_files) == 1: - dst = f"{unitrace_output}.json" - else: - print( - f"Warning: Found {len(pid_json_files)} files matching the pattern. Expected 1." 
+from datetime import datetime, timezone
+
+
+class Unitrace:
+
+    inclusive: bool = False
+
+    def __init__(self, inclusive: bool = False):
+        self.inclusive = inclusive
+        self.timestamp = (
+            datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
+            if options.timestamp_override is None
+            else options.timestamp_override
         )
-        # Find the newest file by modification time
-        newest_file = max(
-            pid_json_files,
-            key=lambda f: os.path.getmtime(os.path.join(options.benchmark_cwd, f)),
+
+        print("Downloading and building Unitrace...")
+        repo_dir = git_clone(
+            options.workdir,
+            "pti-gpu-repo",
+            "https://github.com/intel/pti-gpu.git",
+            "master",
         )
-        dst = f"{unitrace_output}.json"
-        for f in pid_json_files:
-            if f != newest_file:
-                os.remove(os.path.join(options.benchmark_cwd, f))
+        build_dir = os.path.join(options.workdir, "unitrace-build")
+        unitrace_src = os.path.join(repo_dir, "tools", "unitrace")
+        os.makedirs(build_dir, exist_ok=True)
+
+        unitrace_exe = os.path.join(build_dir, "unitrace")
+        if not os.path.isfile(unitrace_exe):
+            run(
+                [
+                    "cmake",
+                    f"-S {unitrace_src}",
+                    f"-B {build_dir}",
+                    "-DCMAKE_BUILD_TYPE=Release",
+                    "-DCMAKE_CXX_COMPILER=clang++",
+                    "-DCMAKE_C_COMPILER=clang",
+                    "-DBUILD_WITH_L0=1",
+                    "-DBUILD_WITH_OPENCL=0",
+                    "-DBUILD_WITH_ITT=1",
+                    "-DBUILD_WITH_XPTI=1",
+                    "-DBUILD_WITH_MPI=0",
+                ],
+                add_sycl=True,
+            )
+            run(
+                ["cmake", "--build", build_dir, "-j", str(options.build_jobs)],
+                add_sycl=True,
+            )
+            print("Unitrace built successfully.")
+
+        if options.results_directory_override is None:
+            self.traces_dir = os.path.join(options.workdir, "results", "traces")
+        else:
+            self.traces_dir = os.path.join(options.results_directory_override, "traces")
+
+    def _prune_unitrace_dirs(self, trace_dir: str, FILECNT: int = 10):
+        files = os.listdir(trace_dir)
+        files.sort()  # Lexicographical sort matches timestamp order
+        if len(files) > 2 * FILECNT:  # each traced run leaves two files (.out and .json)
+            for f in files[: len(files) - 2 * FILECNT]:
+                full_path = os.path.join(trace_dir, f)
+                if os.path.isdir(full_path):
+                    shutil.rmtree(full_path)
+                else:
+                    os.remove(full_path)
                 if options.verbose:
-                    print(f"Removed extra file {f}")
-
-        shutil.move(os.path.join(options.benchmark_cwd, pid_json_files[0]), dst)
-        if options.verbose:
-            print(f"Moved {pid_json_files[0]} to {dst}")
-
-        # Prune old unitrace directories
-        prune_unitrace_dirs(options.unitrace_res_dir, FILECNT=5)
-
-
-def download_and_build_unitrace(workdir):
-    repo_dir = git_clone(
-        workdir,
-        "pti-gpu-repo",
-        "https://github.com/intel/pti-gpu.git",
-        "master",
-    )
-    build_dir = os.path.join(workdir, "unitrace-build")
-    unitrace_src = os.path.join(repo_dir, "tools", "unitrace")
-    os.makedirs(build_dir, exist_ok=True)
-
-    unitrace_exe = os.path.join(build_dir, "unitrace")
-    if not os.path.isfile(unitrace_exe):
-        run(
+                    print(f"Removing old unitrace file: {full_path}")
+
+    def cleanup(self, bench_cwd: str, unitrace_output: str):
+        # Remove .pid files from the benchmark directory and .json files from cwd
+        unitrace_dir = os.path.dirname(unitrace_output)
+        unitrace_base = os.path.basename(unitrace_output)
+        print(f"Cleanup unitrace output {unitrace_base} from {unitrace_dir}")
+        for f in os.listdir(unitrace_dir):
+            if f.startswith(unitrace_base + "."):
+                os.remove(os.path.join(unitrace_dir, f))
+                print(f"Cleanup: Removed {f} from {unitrace_dir}")
+        if os.path.exists(bench_cwd):
+            for f in os.listdir(bench_cwd):
+                if f.endswith(".json"):
+                    os.remove(os.path.join(bench_cwd, f))
+                    print(f"Cleanup: Removed {f} from {bench_cwd}")
+
+    def setup(
+        self, bench_name: str, command: list[str], extra_unitrace_opt: list[str] | None = None
+    ):
+        unitrace_bin = os.path.join(options.workdir, "unitrace-build", "unitrace")
+        if not os.path.exists(unitrace_bin):
+            raise FileNotFoundError(f"Unitrace binary not found: {unitrace_bin}.")
+        os.makedirs(self.traces_dir, exist_ok=True)
+        bench_dir = os.path.join(self.traces_dir, bench_name)
+
+        os.makedirs(bench_dir, exist_ok=True)
+
+        unitrace_output = os.path.join(
+            bench_dir, f"{self.timestamp}_{options.save_name}.out"
+        )
+
+        if extra_unitrace_opt is None:
+            extra_unitrace_opt = []
+
+        unitrace_command = (
             [
-                "cmake",
-                f"-S {unitrace_src}",
-                f"-B {build_dir}",
-                "-DCMAKE_BUILD_TYPE=Release",
-                "-DCMAKE_CXX_COMPILER=clang++",
-                "-DCMAKE_C_COMPILER=clang",
-                "-DBUILD_WITH_L0=1",
-                "-DBUILD_WITH_OPENCL=0",
-                "-DBUILD_WITH_ITT=1",
-                "-DBUILD_WITH_XPTI=1",
-                "-DBUILD_WITH_MPI=0",
-            ],
-            ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"],
-            add_sycl=True,
+                str(unitrace_bin),
+                "--call-logging",
+                "--host-timing",
+                "--device-timing",
+                "--chrome-sycl-logging",
+                "--chrome-call-logging",
+                "--chrome-kernel-logging",
+                "--output",
+                unitrace_output,
+            ]
+            + extra_unitrace_opt
+            + command
         )
-        run(
-            ["cmake", "--build", build_dir, "-j"],
-            ld_library=get_oneapi().ld_libraries() + [f"{options.sycl}/lib"],
-            add_sycl=True,
+        if options.verbose:
+            print(f"Unitrace cmd: {' '.join(unitrace_command)}")
+
+        return unitrace_output, unitrace_command
+
+    def handle_output(self, unitrace_output: str):
+        # Handle {name}.{pid}.json files in cwd: move and rename to {timestamp}_{save_name}.{pid}.json
+        pid_json_files = []
+        pid = ""
+        for f in os.listdir(options.benchmark_cwd):
+            parts = f.split(".")
+            l = len(parts)
+            if len(parts) >= 3 and parts[l - 1] == "json" and parts[l - 2].isdigit():
+                pid_json_files.append(f)
+                pid = parts[l - 2]
+
+        if len(pid_json_files) == 0:
+            raise FileNotFoundError(
+                f"No .pid.json files found in {options.benchmark_cwd}."
+            )
+        elif len(pid_json_files) > 1:
+            # If there are multiple .pid.json files due to previous failures, keep only the most recent one
+            pid_json_files.sort(
+                key=lambda f: os.path.getmtime(os.path.join(options.benchmark_cwd, f))
+            )
+            for f in pid_json_files[:-1]:
+                os.remove(os.path.join(options.benchmark_cwd, f))
+            pid_json_files = [pid_json_files[-1]]
+            pid = pid_json_files[0].split(".")[-2]  # keep pid in sync with the surviving file
+
+        dst = (
+            unitrace_output[:-4] + f".{pid}.json"
+            if unitrace_output.endswith(".out")
+            else unitrace_output + f".{pid}.json"
         )
-    print("Unitrace built successfully.")
+
+        shutil.move(os.path.join(options.benchmark_cwd, pid_json_files[0]), dst)
+        if options.verbose:
+            print(f"Moved {pid_json_files[0]} to {dst}")
+
+        # Prune old unitrace directories
+        self._prune_unitrace_dirs(os.path.dirname(unitrace_output))
+
+
+_unitrace_instance = None
+
+
+def create_unitrace(inclusive: bool) -> None:
+    global _unitrace_instance
+    if _unitrace_instance is not None:
+        raise ValueError("Unitrace instance already created")
+    try:
+        _unitrace_instance = Unitrace(inclusive)
+    except Exception as e:
+        print(f"Failed to build Unitrace: {e}")
+        _unitrace_instance = None
+    if _unitrace_instance is not None:
+        print("Unitrace instance created successfully.")
+
+
+def get_unitrace() -> Unitrace | None:
+    return _unitrace_instance
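With the class above, each traced run leaves two files per benchmark under
the traces directory, named {timestamp}_{save_name}.out plus a matching
.{pid}.json, and _prune_unitrace_dirs() caps every benchmark directory at
FILECNT runs (2 * FILECNT files). A sketch of the resulting layout (the
benchmark name, save names, timestamps and pids are hypothetical):

    results/traces/
        MyBench/
            20250708_132310_baseline.out
            20250708_132310_baseline.12345.json
            20250709_091500_candidate.out
            20250709_091500_candidate.23456.json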
From a5b1fd1717b91d3924397d514c4af656d3b6b37c Mon Sep 17 00:00:00 2001
From: "Mateusz P. Nowak"
Date: Tue, 8 Jul 2025 13:23:10 +0000
Subject: [PATCH 6/6] add traceable() to Benchmark class

Signed-off-by: Mateusz P. Nowak
---
 devops/scripts/benchmarks/benches/base.py    | 10 ++++++++--
 devops/scripts/benchmarks/benches/compute.py |  5 +++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/base.py b/devops/scripts/benchmarks/benches/base.py
index 32aa3ab220d9c..26a5516e2e88d 100644
--- a/devops/scripts/benchmarks/benches/base.py
+++ b/devops/scripts/benchmarks/benches/base.py
@@ -61,6 +61,12 @@ def enabled(self) -> bool:
         By default, it returns True, but can be overridden to disable a benchmark."""
         return True
 
+    def traceable(self) -> bool:
+        """Returns whether this benchmark should be traced by Unitrace.
+        By default, it returns True, but can be overridden to disable tracing for a benchmark.
+        """
+        return True
+
     @abstractmethod
     def setup(self):
         pass
@@ -106,7 +112,7 @@ def run_bench(
         ld_libraries = options.extra_ld_libraries.copy()
         ld_libraries.extend(ld_library)
 
-        if run_unitrace:
+        if self.traceable() and run_unitrace:
             if extra_unitrace_opt is None:
                 extra_unitrace_opt = []
             unitrace_output, command = get_unitrace().setup(
@@ -126,7 +132,7 @@ def run_bench(
                 get_unitrace().cleanup(options.benchmark_cwd, unitrace_output)
                 raise
 
-        if run_unitrace:
+        if self.traceable() and run_unitrace:
             get_unitrace().handle_output(unitrace_output)
 
         if use_stdout:
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 3748079ed10da..723d6719a2869 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -225,6 +225,8 @@ def parse_unit_type(compute_unit):
 
 class ComputeBenchmark(Benchmark):
 
+    not_traceable = []  # List of benchmarks that should not be traced by Unitrace
+
     def __init__(self, bench, name, test, runtime: RUNTIMES = None):
         super().__init__(bench.directory, bench)
         self.bench = bench
@@ -264,6 +266,9 @@ def enabled(self) -> bool:
         # Check if the specific runtime is enabled (or no specific runtime required)
         return self.runtime is None or self.runtime in self.enabled_runtimes()
 
+    def traceable(self) -> bool:
+        return self.bench_name not in self.not_traceable
+
     def bin_args(self) -> list[str]:
         return []
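Taken together, a benchmark opts out of tracing either by overriding
traceable() directly or, for ComputeBenchmark subclasses, by listing its
bench_name in not_traceable; run_bench() then skips the Unitrace wrapper
while leaving regular iterations untouched. A minimal sketch of the override
route (the class and the stated reason are illustrative):

    class MyLongRunningBench(Benchmark):
        def traceable(self) -> bool:
            # Unitrace logs for this workload grow too large to be useful,
            # so skip the traced run; regular iterations are unaffected.
            return False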