File tree Expand file tree Collapse file tree 1 file changed +9
-8
lines changed Expand file tree Collapse file tree 1 file changed +9
-8
lines changed Original file line number Diff line number Diff line change 10
10
)
11
11
12
12
code = """
13
- #include <stdlib.h>
14
-
15
13
#define N 4096
16
14
17
15
void matrix_multiply(float *A, float *B, float *C) {
18
16
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
17
+ float temp_sum = 0.0f;
19
18
#pragma acc parallel vector_length(nthreads)
20
- #pragma acc loop
21
- for ( i = 0; i < N; i++) {
22
- for ( j = 0; j < N; j++ ) {
23
- for ( k = 0; k < N; k++ ) {
24
- C[i][j] += A[i][k] * B[k][j];
19
+ #pragma acc loop collapse(2) reduction(+:temp_sum)
20
+ for ( int i = 0; i < N; i++) {
21
+ for ( int j = 0; j < N; j++ ) {
22
+ temp_sum = 0.0f;
23
+ for ( int k = 0; k < N; k++ ) {
24
+ temp_sum += A[(i * N) + k] * B[(k * N) + j];
25
25
}
26
+ C[(i * N) + j] = temp_sum;
26
27
}
27
28
}
28
29
#pragma tuner stop
37
38
tune_params = dict ()
38
39
tune_params ["nthreads" ] = [32 * i for i in range (1 , 33 )]
39
40
metrics = dict ()
40
- metrics ["GB/s" ] = lambda x : (4096 * 4096 * 4 ) / (x ["time" ] / 10 ** 3 ) / 10 ** 9
41
+ metrics ["GB/s" ] = lambda x : (( 4096 * 4096 * 4096 * 2 * 4 ) + ( 4096 * 4096 * 4 ) ) / (x ["time" ] / 10 ** 3 ) / 10 ** 9
41
42
42
43
tune_kernel (
43
44
"mm" ,
You can’t perform that action at this time.
0 commit comments