Skip to content

Commit 7e5e6b2

Browse files
committed
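Fixing code: declare the loop indices, index A/B/C as flat 1-D arrays, accumulate each element in temp_sum, and update the GB/s metric.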
1 parent bb1f701 commit 7e5e6b2

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

examples/directives/matrix_multiply_c_openacc.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,20 @@
1010
)
1111

1212
code = """
13-
#include <stdlib.h>
14-
1513
#define N 4096
1614
1715
void matrix_multiply(float *A, float *B, float *C) {
1816
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
17+
float temp_sum = 0.0f;
1918
#pragma acc parallel vector_length(nthreads)
20-
#pragma acc loop
21-
for ( i = 0; i < N; i++) {
22-
for ( j = 0; j < N; j++ ) {
23-
for ( k = 0; k < N; k++ ) {
24-
C[i][j] += A[i][k] * B[k][j];
19+
#pragma acc loop collapse(2) reduction(+:temp_sum)
20+
for ( int i = 0; i < N; i++) {
21+
for ( int j = 0; j < N; j++ ) {
22+
temp_sum = 0.0f;
23+
for ( int k = 0; k < N; k++ ) {
24+
temp_sum += A[(i * N) + k] * B[(k * N) + j];
2525
}
26+
C[(i * N) + j] = temp_sum;
2627
}
2728
}
2829
#pragma tuner stop
@@ -37,7 +38,7 @@
3738
tune_params = dict()
3839
tune_params["nthreads"] = [32 * i for i in range(1, 33)]
3940
metrics = dict()
40-
metrics["GB/s"] = lambda x: (4096 * 4096 * 4) / (x["time"] / 10**3) / 10**9
41+
metrics["GB/s"] = lambda x: ((4096 * 4096 * 4096 * 2 * 4) + (4096 * 4096 * 4)) / (x["time"] / 10**3) / 10**9
4142

4243
tune_kernel(
4344
"mm",

0 commit comments

Comments
 (0)