Skip to content

Commit 225dae7

Browse files
add example for NCUObserver
1 parent 2183d13 commit 225dae7

File tree

1 file changed

+57
-0
lines changed

1 file changed

+57
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env python
2+
"""Example of tuning a vector_add kernel while collecting profiler metrics with NCUObserver"""
3+
import json
4+
5+
import numpy
6+
from kernel_tuner import tune_kernel
7+
from kernel_tuner.observers.ncu import NCUObserver
8+
9+
def tune():
    """Tune ``block_size_x`` of a vector-add kernel with an NCUObserver attached.

    Builds random float32 input vectors, asks ``tune_kernel`` to try a range
    of thread-block sizes, and registers an ``NCUObserver`` so the listed
    profiler counters can be used in user-defined metrics.

    Returns the first element of the pair returned by ``tune_kernel``.
    """
    # Counter names requested from the observer; the derived metrics below
    # look them up by exactly these keys.
    dram_total = "dram__bytes.sum"
    dram_read = "dram__bytes_read.sum"
    dram_write = "dram__bytes_write.sum"
    fadd_count = "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum"
    ffma_count = "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum"
    fmul_count = "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum"

    # block_size_x is not declared in the kernel source itself; presumably the
    # tuner supplies it at compile time -- confirm against Kernel Tuner's docs.
    kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
int i = blockIdx.x * block_size_x + threadIdx.x;
if (i<n) {
c[i] = a[i] + b[i];
}
}
"""

    size = 80000000

    # Inputs are drawn in the same order as before (a, then b).
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)
    args = [c, a, b, n]

    # 15 candidate block sizes: 128, 192, ..., 1024.
    tune_params = {"block_size_x": list(range(128, 1024 + 1, 64))}

    ncu_metrics = [
        dram_total,  # Counter byte # of bytes accessed in DRAM
        dram_read,   # Counter byte # of bytes read from DRAM
        dram_write,  # Counter byte # of bytes written to DRAM
        fadd_count,  # Counter inst # of FADD thread instructions executed where all predicates were true
        ffma_count,  # Counter inst # of FFMA thread instructions executed where all predicates were true
        fmul_count,  # Counter inst # of FMUL thread instructions executed where all predicates were true
    ]
    ncuobserver = NCUObserver(metrics=ncu_metrics)

    def seconds(p):
        # p["time"] is divided by 1e3 to get the denominator of the "/s"
        # metrics; presumably it is reported in milliseconds -- TODO confirm.
        return p["time"] / 1e3

    def measured_gflops(p):
        # An FFMA is counted as two floating-point operations.
        flops = p[fadd_count] + 2 * p[ffma_count] + p[fmul_count]
        return (flops / 1e9) / seconds(p)

    def expected_gflops(p):
        # One addition per vector element.
        return (size / 1e9) / seconds(p)

    def measured_gbs(p):
        return (p[dram_total] / 1e9) / seconds(p)

    def expected_gbs(p):
        # Three arrays (a, b, c) of 4-byte floats.
        return (size * 4 * 3 / 1e9) / seconds(p)

    # Insertion order is kept identical to the original metric definitions.
    metrics = {
        "GFLOP/s": measured_gflops,
        "Expected GFLOP/s": expected_gflops,
        "GB/s": measured_gbs,
        "Expected GB/s": expected_gbs,
    }

    results, _env = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        args,
        tune_params,
        observers=[ncuobserver],
        metrics=metrics,
        iterations=7,
    )
    return results
54+
55+
56+
if __name__ == "__main__":
    # Script entry point: run the tuning example when executed directly.
    tune()

0 commit comments

Comments
 (0)