@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }
 
-    def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
-        # all runtimes in the RUNTIMES enum
-        runtimes = supported_runtimes or list(RUNTIMES)
-
-        # filter out SYCL_PREVIEW which is not supported by default in all benchmarks
-        runtimes = [r for r in runtimes if r != RUNTIMES.SYCL_PREVIEW]
-
-        if extra_runtimes is not None:
-            runtimes.extend(extra_runtimes)
-
-        # Filter out UR if not available
-        if options.ur is None:
-            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
-
-        # Filter out L0 if cuda backend
-        if options.ur_adapter == "cuda":
-            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
-
-        return runtimes
-
     def benchmarks(self) -> list[Benchmark]:
-        if options.sycl is None:
-            return []
-
-        if options.ur_adapter == "hip":
-            return []
-
         benches = []
 
-        # Add SubmitKernel benchmarks using loops
-        for runtime in self.enabled_runtimes(extra_runtimes=[RUNTIMES.SYCL_PREVIEW]):
+        for runtime in list(RUNTIMES):
+            # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
                 for measure_completion in [0, 1]:
                     for use_events in [0, 1]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
                             )
                         )
 
-        # Add SinKernelGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
                 for num_kernels in [5, 100]:
                     benches.append(
                         GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
                     )
 
-        # Add ULLS benchmarks
-        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            # Add ULLS benchmarks
             benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
-        # Add GraphApiSubmitGraph benchmarks
-        for runtime in self.enabled_runtimes():
+            # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
                 for num_kernels in [4, 10, 32]:
                     for measure_completion_time in [0, 1]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
         ]
 
         # Add UR-specific benchmarks
-        if options.ur is not None:
-            benches += [
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
-                MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
-                UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
-                UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
-                ),
-                UsmBatchMemoryAllocation(
-                    self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
-                ),
-            ]
+        benches += [
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 1, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 400, 1, 102400, 10, 0, 1, 1, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 100, 4, 102400, 10, 1, 1, 0, 0, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 0),
+            MemcpyExecute(self, RUNTIMES.UR, 4096, 4, 1024, 10, 0, 1, 0, 1, 1),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256, "Both"),
+            UsmMemoryAllocation(self, RUNTIMES.UR, "Device", 256 * 1024, "Both"),
+            UsmBatchMemoryAllocation(self, RUNTIMES.UR, "Device", 128, 256, "Both"),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 16 * 1024, "Both"
+            ),
+            UsmBatchMemoryAllocation(
+                self, RUNTIMES.UR, "Device", 128, 128 * 1024, "Both"
+            ),
+        ]
+
         benches += [
             MemcpyExecute(
                 self, RUNTIMES.SYCL_PREVIEW, 4096, 1, 1024, 40, 1, 1, 0, 1, 0
@@ -246,11 +217,44 @@ def parse_unit_type(compute_unit):
 
 
 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test):
+    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
+        self.runtime = runtime
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        """Base runtimes supported by this benchmark, can be overridden."""
+        # By default, support all runtimes except SYCL_PREVIEW
+        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]
+
+    def enabled_runtimes(self) -> list[RUNTIMES]:
+        """Runtimes available given the current configuration."""
+        # Start with all supported runtimes and apply configuration filters
+        runtimes = self.supported_runtimes()
+
+        # Remove UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        # Remove Level Zero if using CUDA backend
+        if options.ur_adapter == "cuda":
+            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
+
+        return runtimes
+
+    def enabled(self) -> bool:
+        # SYCL is required for all benchmarks
+        if options.sycl is None:
+            return False
+
+        # HIP adapter is not supported
+        if options.ur_adapter == "hip":
+            return False
+
+        # Check if the specific runtime is enabled (or no specific runtime required)
+        return self.runtime is None or self.runtime in self.enabled_runtimes()
 
     def bin_args(self) -> list[str]:
         return []
@@ -338,15 +342,17 @@ def __init__(
         KernelExecTime=1,
     ):
         self.ioq = ioq
-        self.runtime = runtime
         self.MeasureCompletion = MeasureCompletion
         self.UseEvents = UseEvents
         self.KernelExecTime = KernelExecTime
         self.NumKernels = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
         )
 
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
+
     def get_tags(self):
         return ["submit", "latency", runtime_to_tag_name(self.runtime), "micro"]
 
@@ -619,7 +625,6 @@ def __init__(
         useCopyOffload,
         useBarrier,
     ):
-        self.runtime = runtime
         self.numOpsPerThread = numOpsPerThread
         self.numThreads = numThreads
         self.allocSize = allocSize
@@ -630,7 +635,7 @@ def __init__(
         self.useCopyOffload = useCopyOffload
         self.useBarrier = useBarrier
         super().__init__(
-            bench, f"multithread_benchmark_{self.runtime.value}", "MemcpyExecute"
+            bench, f"multithread_benchmark_{runtime.value}", "MemcpyExecute", runtime
         )
 
     def extra_env_vars(self) -> dict:
@@ -706,9 +711,8 @@ class GraphApiSinKernelGraph(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         self.withGraphs = withGraphs
         self.numKernels = numKernels
-        self.runtime = runtime
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph"
+            bench, f"graph_api_benchmark_{runtime.value}", "SinKernelGraph", runtime
         )
 
     def explicit_group(self):
@@ -759,9 +763,10 @@ def __init__(
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
-        self.runtime = runtime
         self.measureCompletionTime = measureCompletionTime
-        super().__init__(bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph")
+        super().__init__(
+            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+        )
 
     def explicit_group(self):
         return f"SubmitGraph, numKernels: {self.numKernels}"
@@ -804,8 +809,12 @@ class UllsEmptyKernel(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
        self.wgc = wgc
        self.wgs = wgs
-        self.runtime = runtime
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+        )
+
+    def supported_runtimes(self) -> list[RUNTIMES]:
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
     def explicit_group(self):
         return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
@@ -849,9 +858,13 @@ def __init__(
         self.barrier = barrier
         self.hostVisible = hostVisible
         self.ctrBasedEvents = ctrBasedEvents
-        self.runtime = runtime
         self.ioq = ioq
-        super().__init__(bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch")
+        super().__init__(
+            bench, f"ulls_benchmark_{runtime.value}", "KernelSwitch", runtime
+        )
+
+    def supported_runtimes(self):
+        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
     def explicit_group(self):
         return f"KernelSwitch, count: {self.count}, kernelTime: {self.kernelTime}"
@@ -884,12 +897,14 @@ class UsmMemoryAllocation(ComputeBenchmark):
     def __init__(
         self, bench, runtime: RUNTIMES, usm_memory_placement, size, measure_mode
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmMemoryAllocation",
+            runtime,
         )
 
     def get_tags(self):
@@ -941,13 +956,15 @@ def __init__(
         size,
         measure_mode,
     ):
-        self.runtime = runtime
         self.usm_memory_placement = usm_memory_placement
         self.allocation_count = allocation_count
         self.size = size
         self.measure_mode = measure_mode
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "UsmBatchMemoryAllocation"
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "UsmBatchMemoryAllocation",
+            runtime,
         )
 
     def get_tags(self):
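Taken together, these hunks move runtime filtering out of the suite's benchmarks() method and into ComputeBenchmark itself: each benchmark now stores its runtime, declares which runtimes it supports via supported_runtimes(), and reports availability via enabled(). Below is a minimal, self-contained sketch of that pattern. The RUNTIMES values and the options stub are simplified stand-ins for illustration only, not the suite's real definitions.

from enum import Enum
from types import SimpleNamespace


class RUNTIMES(Enum):
    # Values are illustrative; the suite defines its own RUNTIMES enum.
    SYCL_PREVIEW = "syclpreview"
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


# Stand-in for the suite's global options object (assumption for illustration).
options = SimpleNamespace(sycl="/opt/sycl", ur=None, ur_adapter=None)


class ComputeBenchmark:
    def __init__(self, name, test, runtime=None):
        self.name = name
        self.test = test
        self.runtime = runtime

    def supported_runtimes(self):
        # Default: everything except SYCL_PREVIEW; subclasses override as needed.
        return [r for r in RUNTIMES if r != RUNTIMES.SYCL_PREVIEW]

    def enabled_runtimes(self):
        # Apply configuration filters on top of the supported set.
        runtimes = self.supported_runtimes()
        if options.ur is None:
            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
        if options.ur_adapter == "cuda":
            runtimes = [r for r in runtimes if r != RUNTIMES.LEVEL_ZERO]
        return runtimes

    def enabled(self):
        if options.sycl is None:
            return False
        if options.ur_adapter == "hip":
            return False
        return self.runtime is None or self.runtime in self.enabled_runtimes()


class UllsEmptyKernel(ComputeBenchmark):
    # Mirrors the override added in the diff: only SYCL and Level Zero apply here.
    def supported_runtimes(self):
        return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]


if __name__ == "__main__":
    # benchmarks() can now construct every (benchmark, runtime) pair unconditionally
    # and let a later filter drop the ones whose enabled() is False.
    benches = [
        UllsEmptyKernel(f"ulls_benchmark_{r.value}", "EmptyKernel", r)
        for r in RUNTIMES
    ]
    print([b.name for b in benches if b.enabled()])

With the stub configuration above this prints only the SYCL and Level Zero variants, matching the intent of the refactor: construction is unconditional, filtering happens per benchmark.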