Skip to content

Commit 35910f5

Browse files
suggestion to organize prologue observers differently
1 parent 3402808 commit 35910f5

File tree

3 files changed

+26
-21
lines changed

3 files changed

+26
-21
lines changed

kernel_tuner/core.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ def __init__(
314314
)
315315
else:
316316
raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
317+
self.dev = dev
317318

318319
# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
319320
self.use_nvml = False
@@ -332,53 +333,58 @@ def __init__(
332333
if isinstance(obs, PrologueObserver):
333334
self.prologue_observers.append(obs)
334335

336+
# Take the list of observers from self.dev because backends tend to add their own observers
337+
self.benchmark_observers = [
338+
obs for obs in self.dev.observers if not isinstance(obs, (ContinuousObserver, PrologueObserver))
339+
]
340+
335341
self.iterations = iterations
336342

337343
self.lang = lang
338-
self.dev = dev
339344
self.units = dev.units
340345
self.name = dev.name
341346
self.max_threads = dev.max_threads
342347
if not quiet:
343348
print("Using: " + self.dev.name)
344349

345-
def benchmark_default(self, func, gpu_args, threads, grid, result):
346-
"""Benchmark one kernel execution at a time"""
347-
observers = [
348-
obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
349-
]
350+
def benchmark_prologue(self, func, gpu_args, threads, grid, result):
351+
"""Run one separate kernel execution per PrologueObserver, before the normal benchmark"""
350352

351353
for obs in self.prologue_observers:
352-
obs.prologue_start()
354+
self.dev.synchronize()
355+
obs.before_start()
353356
self.dev.run_kernel(func, gpu_args, threads, grid)
354357
self.dev.synchronize()
355-
obs.prologue_finish()
358+
obs.after_finish()
359+
result.update(obs.get_results())
360+
361+
def benchmark_default(self, func, gpu_args, threads, grid, result):
362+
"""Benchmark the kernel 'iterations' times, one kernel execution at a time"""
356363

357364
self.dev.synchronize()
358365
for _ in range(self.iterations):
359-
for obs in observers:
366+
for obs in self.benchmark_observers:
360367
obs.before_start()
361368
self.dev.synchronize()
362369
self.dev.start_event()
363370
self.dev.run_kernel(func, gpu_args, threads, grid)
364371
self.dev.stop_event()
365-
for obs in observers:
372+
for obs in self.benchmark_observers:
366373
obs.after_start()
367374
while not self.dev.kernel_finished():
368-
for obs in observers:
375+
for obs in self.benchmark_observers:
369376
obs.during()
370377
time.sleep(1e-6) # one microsecond
371378
self.dev.synchronize()
372-
for obs in observers:
379+
for obs in self.benchmark_observers:
373380
obs.after_finish()
374381

375-
for obs in observers:
382+
for obs in self.benchmark_observers:
376383
result.update(obs.get_results())
377384

378385
def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
379386
"""Benchmark continuously for at least 'duration' seconds"""
380387
iterations = int(np.ceil(duration / (result["time"] / 1000)))
381-
# print(f"{iterations=} {(result['time']/1000)=}")
382388
self.dev.synchronize()
383389
for obs in self.continuous_observers:
384390
obs.before_start()
@@ -423,9 +429,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
423429

424430
result = {}
425431
try:
426-
self.benchmark_default(
427-
func, gpu_args, instance.threads, instance.grid, result
428-
)
432+
self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result)
433+
self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)
429434

430435
if self.continuous_observers:
431436
duration = 1

kernel_tuner/observers/ncu.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ def __init__(self, metrics=None, device=0):
3131
self.device = device
3232
self.results = dict()
3333

34-
def prologue_start(self):
34+
def before_start(self):
3535
nvmetrics.measureMetricsStart(self.metrics, self.device)
3636

37-
def prologue_finish(self):
37+
def after_finish(self):
3838
self.results = nvmetrics.measureMetricsStop()
3939

4040
def get_results(self):

kernel_tuner/observers/observer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,11 @@ class PrologueObserver(BenchmarkObserver):
6161
"""Observer that measures something in a separate kernel invocation prior to the normal benchmark."""
6262

6363
@abstractmethod
64-
def prologue_start(self):
64+
def before_start(self):
6565
"""before_start is called before the kernel starts"""
6666
pass
6767

6868
@abstractmethod
69-
def prologue_finish(self):
69+
def after_finish(self):
7070
"""after_finish is called after the kernel has finished execution"""
7171
pass

0 commit comments

Comments
 (0)