Skip to content

Commit 99b5c90

Browse files
committed
Merge branch 'master' into directives
2 parents 8336cd0 + 41ae1d2 commit 99b5c90

File tree

7 files changed

+147
-17
lines changed

7 files changed

+147
-17
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
</div>
66

77
---
8-
[![Build Status](https://github.com/KernelTuner/kernel_tuner/actions/workflows/build-test-python-package.yml/badge.svg)](https://github.com/KernelTuner/kernel_tuner/actions/workflows/build-test-python-package.yml)
8+
[![Build Status](https://github.com/KernelTuner/kernel_tuner/actions/workflows/test-python-package.yml/badge.svg)](https://github.com/KernelTuner/kernel_tuner/actions/workflows/test-python-package.yml)
99
[![CodeCov Badge](https://codecov.io/gh/KernelTuner/kernel_tuner/branch/master/graph/badge.svg)](https://codecov.io/gh/KernelTuner/kernel_tuner)
1010
[![PyPi Badge](https://img.shields.io/pypi/v/kernel_tuner.svg?colorB=blue)](https://pypi.python.org/pypi/kernel_tuner/)
1111
[![Zenodo Badge](https://zenodo.org/badge/54894320.svg)](https://zenodo.org/badge/latestdoi/54894320)

doc/source/observers.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,11 @@ More information about PMT can be found here: https://git.astron.nl/RD/pmt/
112112

113113

114114

115+
NCUObserver
116+
~~~~~~~~~~~
117+
118+
The NCUObserver can be used to automatically extract performance counters during tuning using NVIDIA's Nsight Compute profiler.
119+
The NCUObserver relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics
120+
121+
.. autoclass:: kernel_tuner.observers.ncu.NCUObserver
122+
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env python
"""Example that tunes a vector_add kernel while collecting Nsight Compute counters."""
import json  # NOTE(review): unused here; kept to avoid breaking any external use

import numpy

from kernel_tuner import tune_kernel
from kernel_tuner.observers.ncu import NCUObserver


def tune():
    """Tune vector_add and derive GFLOP/s and GB/s from NCU performance counters.

    An :class:`NCUObserver` collects DRAM traffic and FP32 instruction counters
    during tuning; user-defined metrics turn those raw counters into achieved
    versus expected throughput numbers.

    :returns: the list of benchmark results, one dict per tested configuration
    """

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 80000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    # Counters to request from Nsight Compute (see `ncu --query-metrics`)
    ncu_metrics = [
        "dram__bytes.sum",        # Counter byte # of bytes accessed in DRAM
        "dram__bytes_read.sum",   # Counter byte # of bytes read from DRAM
        "dram__bytes_write.sum",  # Counter byte # of bytes written to DRAM
        "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum",  # Counter inst # of FADD thread instructions executed where all predicates were true
        "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum",  # Counter inst # of FFMA thread instructions executed where all predicates were true
        "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum",  # Counter inst # of FMUL thread instructions executed where all predicates were true
    ]

    ncuobserver = NCUObserver(metrics=ncu_metrics)

    def total_fp32_flops(p):
        # an FFMA counts as two floating-point operations
        return (p["smsp__sass_thread_inst_executed_op_fadd_pred_on.sum"]
                + 2 * p["smsp__sass_thread_inst_executed_op_ffma_pred_on.sum"]
                + p["smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"])

    # derived metrics; p["time"] is in milliseconds, hence the /1e3 to seconds
    metrics = dict()
    metrics["GFLOP/s"] = lambda p: (total_fp32_flops(p) / 1e9) / (p["time"] / 1e3)
    metrics["Expected GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1e3)
    metrics["GB/s"] = lambda p: (p["dram__bytes.sum"] / 1e9) / (p["time"] / 1e3)
    # 3 arrays of 4-byte floats are streamed per element
    metrics["Expected GB/s"] = lambda p: (size * 4 * 3 / 1e9) / (p["time"] / 1e3)

    results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                               observers=[ncuobserver], metrics=metrics, iterations=7)

    return results


if __name__ == "__main__":
    tune()

kernel_tuner/core.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from kernel_tuner.backends.opencl import OpenCLFunctions
2424
from kernel_tuner.backends.hip import HipFunctions
2525
from kernel_tuner.observers.nvml import NVMLObserver
26-
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver
26+
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
2727

2828
try:
2929
import torch
@@ -314,11 +314,13 @@ def __init__(
314314
)
315315
else:
316316
raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet")
317+
self.dev = dev
317318

318319
# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
319320
self.use_nvml = False
320321
self.continuous_observers = []
321322
self.output_observers = []
323+
self.prologue_observers = []
322324
if observers:
323325
for obs in observers:
324326
if isinstance(obs, NVMLObserver):
@@ -328,49 +330,61 @@ def __init__(
328330
self.continuous_observers.append(obs.continuous_observer)
329331
if isinstance(obs, OutputObserver):
330332
self.output_observers.append(obs)
333+
if isinstance(obs, PrologueObserver):
334+
self.prologue_observers.append(obs)
331335

336+
# Take list of observers from self.dev because Backends tend to add their own observer
337+
self.benchmark_observers = [
338+
obs for obs in self.dev.observers if not isinstance(obs, (ContinuousObserver, PrologueObserver))
339+
]
332340

333341
self.iterations = iterations
334342

335343
self.lang = lang
336-
self.dev = dev
337344
self.units = dev.units
338345
self.name = dev.name
339346
self.max_threads = dev.max_threads
340347
if not quiet:
341348
print("Using: " + self.dev.name)
342349

350+
def benchmark_prologue(self, func, gpu_args, threads, grid, result):
    """Run one extra kernel execution for each PrologueObserver.

    Each prologue observer gets its own dedicated launch: we synchronize,
    notify the observer, launch the kernel, wait for completion, notify the
    observer again, and merge whatever it measured into ``result``.
    """

    for observer in self.prologue_observers:
        # make sure no prior work is still in flight before measuring
        self.dev.synchronize()
        observer.before_start()
        self.dev.run_kernel(func, gpu_args, threads, grid)
        # wait for the kernel to finish so the observation covers the full run
        self.dev.synchronize()
        observer.after_finish()
        result.update(observer.get_results())
343361
def benchmark_default(self, func, gpu_args, threads, grid, result):
    """Benchmark one kernel execution for 'iterations' at a time.

    For every iteration each benchmark observer is notified around the
    launch, polled while the kernel runs, and notified on completion.
    Afterwards all observer results are merged into ``result``.
    """
    observers = self.benchmark_observers  # hoist the attribute lookup

    self.dev.synchronize()
    for _ in range(self.iterations):
        for observer in observers:
            observer.before_start()
        self.dev.synchronize()
        self.dev.start_event()
        self.dev.run_kernel(func, gpu_args, threads, grid)
        self.dev.stop_event()
        for observer in observers:
            observer.after_start()
        # poll until the kernel completes, letting observers sample while
        # it runs; the tiny sleep keeps the host from spinning at 100%
        while not self.dev.kernel_finished():
            for observer in observers:
                observer.during()
            time.sleep(1e-6)  # one microsecond
        self.dev.synchronize()
        for observer in observers:
            observer.after_finish()

    for observer in observers:
        result.update(observer.get_results())
369384

370385
def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
371386
"""Benchmark continuously for at least 'duration' seconds"""
372387
iterations = int(np.ceil(duration / (result["time"] / 1000)))
373-
# print(f"{iterations=} {(result['time']/1000)=}")
374388
self.dev.synchronize()
375389
for obs in self.continuous_observers:
376390
obs.before_start()
@@ -420,9 +434,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_sett
420434

421435
result = {}
422436
try:
423-
self.benchmark_default(
424-
func, gpu_args, instance.threads, instance.grid, result
425-
)
437+
self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result)
438+
self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result)
426439

427440
if self.continuous_observers:
428441
duration = 1

kernel_tuner/observers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver
1+
from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver, PrologueObserver

kernel_tuner/observers/ncu.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from kernel_tuner.observers import PrologueObserver

try:
    import nvmetrics
except ImportError:
    nvmetrics = None


class NCUObserver(PrologueObserver):
    """``NCUObserver`` measures performance counters.

    The exact performance counters supported differ per GPU, some examples:

    * "dram__bytes.sum", # Counter byte # of bytes accessed in DRAM
    * "dram__bytes_read.sum", # Counter byte # of bytes read from DRAM
    * "dram__bytes_write.sum", # Counter byte # of bytes written to DRAM
    * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter inst # of FADD thread instructions executed where all predicates were true
    * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter inst # of FFMA thread instructions executed where all predicates were true
    * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter inst # of FMUL thread instructions executed where all predicates were true

    :param metrics: The metrics to observe. This should be a list of strings.
        You can use ``ncu --query-metrics`` to get a list of valid metrics.
    :type metrics: list[str]

    :param device: Ordinal of the device to measure, defaults to 0.
    :type device: int
    """

    def __init__(self, metrics=None, device=0):
        """Create the observer; raises ImportError when nvmetrics is unavailable."""
        if not nvmetrics:
            raise ImportError("could not import nvmetrics")

        # normalize None to an empty list so before_start/get_results
        # never crash when the caller omits the metrics argument
        self.metrics = metrics if metrics is not None else []
        self.device = device
        # measureMetricsStop() returns a sequence of values, so start
        # with an empty list (get_results then yields an empty dict)
        self.results = []

    def before_start(self):
        """Start counter collection just before the prologue kernel launch."""
        nvmetrics.measureMetricsStart(self.metrics, self.device)

    def after_finish(self):
        """Stop collection once the prologue kernel has finished."""
        self.results = nvmetrics.measureMetricsStop()

    def get_results(self):
        """Return the measured values as a dict keyed by metric name."""
        return dict(zip(self.metrics, self.results))

kernel_tuner/observers/observer.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,15 @@ def process_output(self, answer, output):
5757
"""
5858
pass
5959

60+
class PrologueObserver(BenchmarkObserver):
    """Observer that measures something in a separate kernel invocation prior to the normal benchmark."""

    @abstractmethod
    def before_start(self):
        """prologue start is called before the prologue kernel starts"""
        pass

    @abstractmethod
    def after_finish(self):
        """prologue finish is called after the prologue kernel has finished execution"""
        pass

0 commit comments

Comments
 (0)