@@ -314,6 +314,7 @@ def __init__(
314
314
)
315
315
else :
316
316
raise ValueError ("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet" )
317
+ self .dev = dev
317
318
318
319
# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
319
320
self .use_nvml = False
@@ -332,53 +333,58 @@ def __init__(
332
333
if isinstance (obs , PrologueObserver ):
333
334
self .prologue_observers .append (obs )
334
335
336
+ # Take list of observers from self.dev because Backends tend to add their own observer
337
+ self .benchmark_observers = [
338
+ obs for obs in self .dev .observers if not isinstance (obs , (ContinuousObserver , PrologueObserver ))
339
+ ]
340
+
335
341
self .iterations = iterations
336
342
337
343
self .lang = lang
338
- self .dev = dev
339
344
self .units = dev .units
340
345
self .name = dev .name
341
346
self .max_threads = dev .max_threads
342
347
if not quiet :
343
348
print ("Using: " + self .dev .name )
344
349
345
- def benchmark_default (self , func , gpu_args , threads , grid , result ):
346
- """Benchmark one kernel execution at a time"""
347
- observers = [
348
- obs for obs in self .dev .observers if not isinstance (obs , ContinuousObserver )
349
- ]
350
def benchmark_prologue(self, func, gpu_args, threads, grid, result):
    """Execute the kernel once for each prologue observer and collect its output.

    For every observer in ``self.prologue_observers`` the following happens,
    in this order: the device is synchronized, the observer's
    ``before_start`` hook is called, the kernel is launched through
    ``self.dev.run_kernel``, the device is synchronized again, the
    observer's ``after_finish`` hook is called, and the mapping returned by
    the observer's ``get_results`` is merged into ``result``.

    :param func: kernel handle, passed unchanged to ``self.dev.run_kernel``
    :param gpu_args: kernel arguments, passed unchanged to ``run_kernel``
    :param threads: thread block dimensions, passed unchanged to ``run_kernel``
    :param grid: grid dimensions, passed unchanged to ``run_kernel``
    :param result: dict-like object updated in place with observer results
    """
    launch_args = (func, gpu_args, threads, grid)

    for observer in self.prologue_observers:
        device = self.dev

        # Synchronize before handing control to the observer.
        device.synchronize()
        observer.before_start()

        # One kernel execution per observer, waited for before after_finish.
        device.run_kernel(*launch_args)
        device.synchronize()
        observer.after_finish()

        measurements = observer.get_results()
        result.update(measurements)
360
+
361
def benchmark_default(self, func, gpu_args, threads, grid, result):
    """Run the kernel ``self.iterations`` times, one execution at a time.

    The device is synchronized once up front. Each iteration then calls, in
    order: ``before_start`` on every observer in
    ``self.benchmark_observers``, ``self.dev.synchronize``,
    ``self.dev.start_event``, ``self.dev.run_kernel``,
    ``self.dev.stop_event``, and ``after_start`` on every observer. While
    ``self.dev.kernel_finished()`` is false, every observer's ``during``
    hook is called, followed by a one microsecond sleep. The iteration ends
    with another ``synchronize`` and ``after_finish`` on every observer.

    After all iterations, the mapping returned by each observer's
    ``get_results`` is merged into ``result``.

    :param func: kernel handle, passed unchanged to ``self.dev.run_kernel``
    :param gpu_args: kernel arguments, passed unchanged to ``run_kernel``
    :param threads: thread block dimensions, passed unchanged to ``run_kernel``
    :param grid: grid dimensions, passed unchanged to ``run_kernel``
    :param result: dict-like object updated in place with observer results
    """
    poll_interval = 1e-6  # one microsecond

    device = self.dev
    device.synchronize()
    watchers = self.benchmark_observers

    for _run in range(self.iterations):
        for watcher in watchers:
            watcher.before_start()

        # start_event / run_kernel / stop_event stay back to back.
        device.synchronize()
        device.start_event()
        device.run_kernel(func, gpu_args, threads, grid)
        device.stop_event()

        for watcher in watchers:
            watcher.after_start()

        # Poll for completion, calling the 'during' hooks between polls.
        while True:
            if device.kernel_finished():
                break
            for watcher in watchers:
                watcher.during()
            time.sleep(poll_interval)

        device.synchronize()
        for watcher in watchers:
            watcher.after_finish()

    for watcher in watchers:
        result.update(watcher.get_results())
377
384
378
385
def benchmark_continuous (self , func , gpu_args , threads , grid , result , duration ):
379
386
"""Benchmark continuously for at least 'duration' seconds"""
380
387
iterations = int (np .ceil (duration / (result ["time" ] / 1000 )))
381
- # print(f"{iterations=} {(result['time']/1000)=}")
382
388
self .dev .synchronize ()
383
389
for obs in self .continuous_observers :
384
390
obs .before_start ()
@@ -423,9 +429,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
423
429
424
430
result = {}
425
431
try :
426
- self .benchmark_default (
427
- func , gpu_args , instance .threads , instance .grid , result
428
- )
432
+ self .benchmark_prologue (func , gpu_args , instance .threads , instance .grid , result )
433
+ self .benchmark_default (func , gpu_args , instance .threads , instance .grid , result )
429
434
430
435
if self .continuous_observers :
431
436
duration = 1
0 commit comments