Skip to content

Commit f307f50

Browse files
authored
Merge pull request #269 from KernelTuner/directives
Directives: summer refactoring
2 parents 6ad115c + a950997 commit f307f50

File tree

3 files changed

+132
-35
lines changed

3 files changed

+132
-35
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env python
2+
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""
3+
4+
from kernel_tuner import tune_kernel
5+
from kernel_tuner.utils.directives import (
6+
Code,
7+
OpenACC,
8+
Cxx,
9+
process_directives
10+
)
11+
12+
# Matrix dimension; must stay in sync with the `#define N` inside `code` below.
N = 4096

# OpenACC C++ source. The region between `#pragma tuner start mm ...` and
# `#pragma tuner stop` is the tunable section named "mm"; A, B and C are
# declared as float arrays of NN elements each.
code = """
#define N 4096

void matrix_multiply(float *A, float *B, float *C) {
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
float temp_sum = 0.0f;
#pragma acc parallel vector_length(nthreads)
#pragma acc loop gang collapse(2)
for ( int i = 0; i < N; i++) {
for ( int j = 0; j < N; j++ ) {
temp_sum = 0.0f;
#pragma acc loop vector reduction(+:temp_sum)
for ( int k = 0; k < N; k++ ) {
temp_sum += A[(i * N) + k] * B[(k * N) + j];
}
C[(i * N) + j] = temp_sum;
}
}
#pragma tuner stop
}
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
# NN is the symbolic array size used in the tuner pragma above.
dims = {"NN": N**2}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

# `nthreads` is substituted into `vector_length(nthreads)`: 32, 64, ..., 1024.
tune_params = {"nthreads": [32 * i for i in range(1, 33)]}

# Metrics are inserted in dependency order: "time_s" is read by the two
# metrics defined after it.
metrics = {}
# NOTE(review): assumes Kernel Tuner reports "time" in milliseconds -- confirm.
metrics["time_s"] = lambda x: x["time"] / 10**3
# Two 4-byte loads (A and B) per inner iteration plus one 4-byte store per
# element of C.
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
# NOTE(review): counts 3 operations per inner iteration; a multiply-add is
# commonly counted as 2 FLOPs -- confirm the intended convention.
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

tune_kernel(
    "mm",
    kernel_string["mm"],
    0,  # third positional argument; presumably the problem size -- TODO confirm
    kernel_args["mm"],
    tune_params,
    metrics=metrics,
    compiler_options=["-fast", "-acc=gpu"],
    compiler="nvc++",
)

kernel_tuner/utils/directives.py

Lines changed: 72 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,37 @@
1-
from typing import Any
1+
from typing import Any, Tuple
22
from abc import ABC, abstractmethod
33
import numpy as np
44

5+
# Function templates
6+
# Skeleton of the generated C++ function. Every `<!?...?!>` marker is a
# placeholder that `generate_directive_function` fills in with `str.replace`:
# PREPROCESSOR, USER_DEFINES, SIGNATURE, INITIALIZATION, BODY and
# DEINITIALIZATION. `<chrono>` is always included here, so callers no longer
# add it conditionally.
cpp_template: str = """
<!?PREPROCESSOR?!>
<!?USER_DEFINES?!>
#include <chrono>

extern "C" <!?SIGNATURE?!> {
<!?INITIALIZATION?!>
<!?BODY?!>
<!?DEINITIALIZATION?!>
}
"""
17+
18+
# Skeleton of the generated Fortran function, wrapped in module `kt`. Every
# `<!?...?!>` marker is a placeholder that `generate_directive_function`
# fills in with `str.replace`: PREPROCESSOR, USER_DEFINES, SIGNATURE,
# INITIALIZATION, BODY, DEINITIALIZATION and NAME (the function name used in
# the closing `end function` statement).
f90_template: str = """
<!?PREPROCESSOR?!>
<!?USER_DEFINES?!>

module kt
use iso_c_binding
contains

<!?SIGNATURE?!>
<!?INITIALIZATION?!>
<!?BODY?!>
<!?DEINITIALIZATION?!>
end function <!?NAME?!>

end module kt
"""
34+
535

636
class Directive(ABC):
737
"""Base class for all directives"""
@@ -339,7 +369,7 @@ def wrap_timing_fortran(code: str) -> str:
339369

340370
def end_timing_cxx(code: str) -> str:
341371
"""In C++ we need to return the measured time"""
342-
return code + "\nreturn elapsed_time.count();\n"
372+
return "\n".join([code, "return elapsed_time.count();\n"])
343373

344374

345375
def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str:
@@ -355,7 +385,7 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use
355385
elif is_openacc(langs.directive) and is_fortran(langs.language):
356386
intro += create_data_directive_openacc_fortran(name, size)
357387
outro += exit_data_directive_openacc_fortran(name, size)
358-
return intro + code + outro
388+
return "\n".join([intro, code, outro])
359389

360390

361391
def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict:
@@ -529,42 +559,34 @@ def generate_directive_function(
529559
) -> str:
530560
"""Generate tunable function for one directive"""
531561

532-
code = "\n".join(preprocessor) + "\n"
533-
if user_dimensions is not None:
534-
# add user dimensions to preprocessor
535-
for key, value in user_dimensions.items():
536-
code += f"#define {key} {value}\n"
537-
if is_cxx(langs.language) and "#include <chrono>" not in preprocessor:
538-
code += "\n#include <chrono>\n"
539-
if is_cxx(langs.language):
540-
code += 'extern "C" ' + signature + "{\n"
541-
elif is_fortran(langs.language):
542-
code += "\nmodule kt\nuse iso_c_binding\ncontains\n"
543-
code += "\n" + signature
544-
if len(initialization) > 1:
545-
code += initialization + "\n"
546-
if data is not None:
547-
body = add_present_openacc(body, langs, data, preprocessor, user_dimensions)
548562
if is_cxx(langs.language):
563+
code = cpp_template
549564
body = start_timing_cxx(body)
550565
if data is not None:
551-
code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
552-
else:
553-
code += body
554-
code = end_timing_cxx(code)
555-
if len(deinitialization) > 1:
556-
code += deinitialization + "\n"
557-
code += "\n}"
566+
body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
567+
body = end_timing_cxx(body)
558568
elif is_fortran(langs.language):
569+
code = f90_template
559570
body = wrap_timing(body, langs.language)
560571
if data is not None:
561-
code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
562-
else:
563-
code += body + "\n"
564-
if len(deinitialization) > 1:
565-
code += deinitialization + "\n"
572+
body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
566573
name = signature.split(" ")[1].split("(")[0]
567-
code += f"\nend function {name}\nend module kt\n"
574+
code = code.replace("<!?NAME?!>", name)
575+
code = code.replace("<!?PREPROCESSOR?!>", "\n".join(preprocessor))
576+
# if present, add user specific dimensions as defines
577+
if user_dimensions is not None:
578+
user_defines = ""
579+
for key, value in user_dimensions.items():
580+
user_defines += f"#define {key} {value}\n"
581+
code = code.replace("<!?USER_DEFINES?!>", user_defines)
582+
else:
583+
code = code.replace("<!?USER_DEFINES?!>", "")
584+
code = code.replace("<!?SIGNATURE?!>", signature)
585+
code = code.replace("<!?INITIALIZATION?!>", initialization)
586+
code = code.replace("<!?DEINITIALIZATION?!>", deinitialization)
587+
if data is not None:
588+
body = add_present_openacc(body, langs, data, preprocessor, user_dimensions)
589+
code = code.replace("<!?BODY?!>", body)
568590

569591
return code
570592

@@ -662,3 +684,21 @@ def add_present_openacc_fortran(name: str, size: ArraySize) -> str:
662684
else:
663685
md_size = fortran_md_size(size)
664686
return f" present({name}({','.join(md_size)})) "
687+
688+
689+
def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> Tuple[dict, dict]:
    """Helper function to process all the directives in the code and create tunable functions.

    Runs the individual extraction steps on ``source`` and, for every
    directive signature found, generates the tunable function and allocates
    the memory for its arguments.

    :param langs: directive/language pair forwarded to the extraction helpers
    :param source: the annotated source code to process
    :param user_dimensions: optional mapping of dimension names to values,
        forwarded to code generation and memory allocation
    :returns: a tuple ``(kernel_strings, kernel_args)``; both are dictionaries
        keyed by the names found by ``extract_directive_signature``
    """
    kernel_strings = {}
    kernel_args = {}
    preprocessor = extract_preprocessor(source)
    signatures = extract_directive_signature(source, langs)
    bodies = extract_directive_code(source, langs)
    data = extract_directive_data(source, langs)
    # Initialization and deinitialization code is not per-kernel: the same
    # values are passed to every generated function.
    init = extract_initialization_code(source, langs)
    deinit = extract_deinitialization_code(source, langs)
    for kernel, signature in signatures.items():
        kernel_strings[kernel] = generate_directive_function(
            preprocessor, signature, bodies[kernel], langs, data[kernel], init, deinit, user_dimensions
        )
        kernel_args[kernel] = allocate_signature_memory(data[kernel], preprocessor, user_dimensions)
    return (kernel_strings, kernel_args)

test/utils/test_directives.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,13 @@ def test_wrap_data():
105105
code_f90 = "! this is a comment\n"
106106
data = {"array": ["int*", "size"]}
107107
preprocessor = ["#define size 42"]
108-
expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n// this is a comment\n#pragma acc exit data copyout(array[:42])\n"
108+
expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n"
109109
assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx
110-
expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n! this is a comment\n!$acc exit data copyout(array(:42))\n"
110+
expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n"
111111
assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90
112112
data = {"matrix": ["float*", "rows,cols"]}
113113
preprocessor = ["#define rows 42", "#define cols 84"]
114-
expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n! this is a comment\n!$acc exit data copyout(matrix(:42,:84))\n"
114+
expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n"
115115
assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90
116116
dimensions = {"rows": 42, "cols": 84}
117117
assert wrap_data(code_f90, acc_f90, data, user_dimensions=dimensions) == expected_f90

0 commit comments

Comments
 (0)