Skip to content

Commit 9e8a59a

Browse files
Merge pull request #248 from KernelTuner/directives
Improved OpenACC support
2 parents 902340d + 88bde19 commit 9e8a59a

File tree

9 files changed

+798
-327
lines changed

9 files changed

+798
-327
lines changed

examples/README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ Below we list the example applications and the features they illustrate.
1818

1919
Vector Add
2020
----------
21-
[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__]
21+
[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__] [`OpenACC-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_c_openacc.py>`__] [`OpenACC-Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_fortran_openacc.py>`__]
2222
- use Kernel Tuner to tune a simple kernel
2323

2424
Stencil

examples/c/vector_add_openacc.py renamed to examples/directives/vector_add_c_openacc.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,21 @@
33

44
from kernel_tuner import tune_kernel
55
from kernel_tuner.utils.directives import (
6+
Code,
7+
OpenACC,
8+
Cxx,
69
extract_directive_signature,
710
extract_directive_code,
811
extract_preprocessor,
912
generate_directive_function,
1013
extract_directive_data,
1114
allocate_signature_memory,
1215
)
13-
from collections import OrderedDict
1416

1517
code = """
1618
#include <stdlib.h>
1719
18-
#define VECTOR_SIZE 65536
20+
#define VECTOR_SIZE 1000000
1921
2022
int main(void) {
2123
int size = VECTOR_SIZE;
@@ -24,7 +26,7 @@
2426
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
2527
2628
#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
27-
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
29+
#pragma acc parallel vector_length(nthreads)
2830
#pragma acc loop
2931
for ( int i = 0; i < size; i++ ) {
3032
c[i] = a[i] + b[i];
@@ -37,21 +39,23 @@
3739
}
3840
"""
3941

40-
# Extract tunable directive and generate kernel_string
42+
# Extract tunable directive
43+
app = Code(OpenACC(), Cxx())
4144
preprocessor = extract_preprocessor(code)
42-
signature = extract_directive_signature(code)
43-
body = extract_directive_code(code)
44-
kernel_string = generate_directive_function(
45-
preprocessor, signature["vector_add"], body["vector_add"]
46-
)
47-
45+
signature = extract_directive_signature(code, app)
46+
body = extract_directive_code(code, app)
4847
# Allocate memory on the host
49-
data = extract_directive_data(code)
48+
data = extract_directive_data(code, app)
5049
args = allocate_signature_memory(data["vector_add"], preprocessor)
50+
# Generate kernel string
51+
kernel_string = generate_directive_function(
52+
preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
53+
)
5154

52-
tune_params = OrderedDict()
53-
tune_params["ngangs"] = [2**i for i in range(0, 15)]
54-
tune_params["nthreads"] = [2**i for i in range(0, 11)]
55+
tune_params = dict()
56+
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
57+
metrics = dict()
58+
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9
5559

5660
answer = [None, None, args[0] + args[1], None]
5761

@@ -61,6 +65,7 @@
6165
0,
6266
args,
6367
tune_params,
68+
metrics=metrics,
6469
answer=answer,
6570
compiler_options=["-fast", "-acc=gpu"],
6671
compiler="nvc++",

examples/fortran/vector_add_openacc.py renamed to examples/directives/vector_add_fortran_openacc.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,28 @@
33

44
from kernel_tuner import tune_kernel
55
from kernel_tuner.utils.directives import (
6+
Code,
7+
OpenACC,
8+
Fortran,
69
extract_directive_signature,
710
extract_directive_code,
811
extract_preprocessor,
912
generate_directive_function,
1013
extract_directive_data,
1114
allocate_signature_memory,
1215
)
13-
from collections import OrderedDict
1416

1517
code = """
16-
#define VECTOR_SIZE 65536
18+
#define VECTOR_SIZE 1000000
1719
1820
subroutine vector_add(A, B, C, n)
1921
use iso_c_binding
2022
real (c_float), intent(out), dimension(VECTOR_SIZE) :: C
2123
real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B
2224
integer (c_int), intent(in) :: n
2325
24-
!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)
25-
!$acc parallel loop num_gangs(ngangs) vector_length(vlength)
26+
!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) i(int:VECTOR_SIZE)
27+
!$acc parallel loop vector_length(nthreads)
2628
do i = 1, n
2729
C(i) = A(i) + B(i)
2830
end do
@@ -32,30 +34,33 @@
3234
end subroutine vector_add
3335
"""
3436

35-
# Extract tunable directive and generate kernel_string
37+
# Extract tunable directive
38+
app = Code(OpenACC(), Fortran())
3639
preprocessor = extract_preprocessor(code)
37-
signature = extract_directive_signature(code)
38-
body = extract_directive_code(code)
39-
kernel_string = generate_directive_function(
40-
preprocessor, signature["vector_add"], body["vector_add"]
41-
)
42-
40+
signature = extract_directive_signature(code, app)
41+
body = extract_directive_code(code, app)
4342
# Allocate memory on the host
44-
data = extract_directive_data(code)
43+
data = extract_directive_data(code, app)
4544
args = allocate_signature_memory(data["vector_add"], preprocessor)
45+
# Generate kernel string
46+
kernel_string = generate_directive_function(
47+
preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
48+
)
4649

47-
tune_params = OrderedDict()
48-
tune_params["ngangs"] = [2**i for i in range(0, 15)]
49-
tune_params["vlength"] = [2**i for i in range(0, 11)]
50+
tune_params = dict()
51+
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
52+
metrics = dict()
53+
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9
5054

51-
answer = [None, None, args[0] + args[1], None]
55+
answer = [None, None, args[0] + args[1], None, None]
5256

5357
tune_kernel(
5458
"vector_add",
5559
kernel_string,
5660
0,
5761
args,
5862
tune_params,
63+
metrics=metrics,
5964
answer=answer,
6065
compiler_options=["-fast", "-acc=gpu"],
6166
compiler="nvfortran",

kernel_tuner/backends/compiler.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -146,9 +146,7 @@ def ready_argument_list(self, arguments):
146146

147147
for i, arg in enumerate(arguments):
148148
if not (isinstance(arg, (np.ndarray, np.number)) or is_cupy_array(arg)):
149-
raise TypeError(
150-
f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}"
151-
)
149+
raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}")
152150
dtype_str = str(arg.dtype)
153151
if isinstance(arg, np.ndarray):
154152
if dtype_str in dtype_map.keys():
@@ -210,11 +208,7 @@ def compile(self, kernel_instance):
210208

211209
# detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
212210
if (
213-
(
214-
(suffix == ".cu")
215-
or ("#include <cuda" in kernel_string)
216-
or ("cudaMemcpy" in kernel_string)
217-
)
211+
((suffix == ".cu") or ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string))
218212
and self.compiler == "g++"
219213
and self.nvcc_available
220214
):
@@ -271,11 +265,7 @@ def compile(self, kernel_instance):
271265
if platform.system() == "Darwin":
272266
lib_extension = ".dylib"
273267

274-
subprocess.check_call(
275-
[self.compiler, "-c", source_file]
276-
+ compiler_options
277-
+ ["-o", filename + ".o"]
278-
)
268+
subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
279269
subprocess.check_call(
280270
[self.compiler, filename + ".o"]
281271
+ compiler_options
@@ -319,7 +309,7 @@ def synchronize(self):
319309
C backend does not support asynchronous launches"""
320310
pass
321311

322-
def run_kernel(self, func, c_args, threads, grid):
312+
def run_kernel(self, func, c_args, threads, grid, stream=None):
323313
"""runs the kernel once, returns whatever the kernel returns
324314
325315
:param func: A C function compiled for this specific configuration
@@ -331,11 +321,15 @@ def run_kernel(self, func, c_args, threads, grid):
331321
:type c_args: list(Argument)
332322
333323
:param threads: Ignored, but left as argument for now to have the same
334-
interface as CudaFunctions and OpenCLFunctions.
324+
interface as Backend.
335325
:type threads: any
336326
337327
:param grid: Ignored, but left as argument for now to have the same
338-
interface as CudaFunctions and OpenCLFunctions.
328+
interface as Backend.
329+
:type grid: any
330+
331+
:param stream: Ignored, but left as argument for now to have the same
332+
interface as Backend.
339333
:type stream: any
340334
341335
:returns: A robust average of values returned by the C function.

0 commit comments

Comments
 (0)