Skip to content

Commit bb1f701

Browse files
committed
First draft of the matrix multiply example.
1 parent b8d4e82 commit bb1f701

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env python
"""Tune a naive OpenACC matrix multiplication with the simplified directives interface.

The C++ source below contains one region delimited by ``#pragma tuner start`` /
``#pragma tuner stop``. ``process_directives`` extracts that region (named ``mm``)
together with its arguments, and ``tune_kernel`` then benchmarks it for every
value of the ``nthreads`` tunable parameter, which is used as the OpenACC
``vector_length``.
"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
    Code,
    OpenACC,
    Cxx,
    process_directives,
)

# Dimension of the square matrices.
# NOTE: must be kept in sync with the `#define N` in the kernel source below.
N = 4096

# The matrices are passed as flat `float *` buffers of NN = N * N elements, so
# every element is addressed in row-major order as `(row * N) + column`.
code = """
#include <stdlib.h>

#define N 4096

void matrix_multiply(float *A, float *B, float *C) {
    #pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
    #pragma acc parallel vector_length(nthreads)
    #pragma acc loop gang collapse(2)
    for ( int i = 0; i < N; i++ ) {
        for ( int j = 0; j < N; j++ ) {
            float temp_sum = 0.0f;
            #pragma acc loop vector reduction(+:temp_sum)
            for ( int k = 0; k < N; k++ ) {
                temp_sum += A[(i * N) + k] * B[(k * N) + j];
            }
            C[(i * N) + j] = temp_sum;
        }
    }
    #pragma tuner stop
}
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
# "NN" is the symbolic array length used in the `#pragma tuner start` line.
dims = {"NN": N * N}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

# Candidate vector lengths: every multiple of 32 from 32 up to 1024.
tune_params = {"nthreads": [32 * i for i in range(1, 33)]}

# User-defined metrics. The `/ 10**3` converts the measured time to seconds
# (the measured time is assumed to be reported in milliseconds).
metrics = {
    # Memory traffic of the naive algorithm: two 4-byte loads per inner
    # iteration (N**3 iterations) plus one 4-byte store per element of C.
    "GB/s": lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / (x["time"] / 10**3) / 10**9,
    # One multiply and one add per inner iteration.
    "GFLOP/s": lambda x: (N**3 * 2) / (x["time"] / 10**3) / 10**9,
}

tune_kernel(
    "mm",
    kernel_string["mm"],
    0,  # problem size placeholder; the loop bounds are fixed inside the kernel
    kernel_args["mm"],
    tune_params,
    metrics=metrics,
    compiler_options=["-fast", "-acc=gpu"],
    compiler="nvc++",
)

0 commit comments

Comments
 (0)