@@ -33,24 +33,12 @@ void benchmark_device(const Device_Info& device_info) {
33
33
;
34
34
print (" | Compiling ... |" );
35
35
Device device (device_info, defines+get_opencl_c_code ());
36
-
37
36
Memory<float > buffer (device, N, M);
38
- Kernel kernel_double (device, N, " kernel_double" , buffer);
39
- Kernel kernel_float (device, N, " kernel_float" , buffer);
40
- Kernel kernel_half (device, N, " kernel_half" , buffer);
41
- Kernel kernel_long (device, N, " kernel_long" , buffer);
42
- Kernel kernel_int (device, N, " kernel_int" , buffer);
43
- Kernel kernel_short (device, N, " kernel_short" , buffer);
44
- Kernel kernel_char (device, N, " kernel_char" , buffer);
45
- Kernel kernel_coalesced_write (device, N, " kernel_coalesced_write" , buffer);
46
- Kernel kernel_coalesced_read (device, N, " kernel_coalesced_read" , buffer);
47
- Kernel kernel_misaligned_write (device, N, " kernel_misaligned_write" , buffer);
48
- Kernel kernel_misaligned_read (device, N, " kernel_misaligned_read" , buffer);
49
-
50
37
// print_info("Device memory usage: "+to_string(device.info.memory_used)+" MB");
51
38
52
39
if (device.info .is_fp64_capable ) {
53
40
print (" | Benchmarking ... |" );
41
+ Kernel kernel_double (device, N, " kernel_double" , buffer);
54
42
for (uint i=0u ; i<N_kernel; i++) {
55
43
clock.start ();
56
44
kernel_double.run ();
@@ -63,6 +51,7 @@ void benchmark_device(const Device_Info& device_info) {
63
51
}
64
52
65
53
print (" | Benchmarking ... |" );
54
+ Kernel kernel_float (device, N, " kernel_float" , buffer);
66
55
for (uint i=0u ; i<N_kernel; i++) {
67
56
clock.start ();
68
57
kernel_float.run ();
@@ -73,6 +62,7 @@ void benchmark_device(const Device_Info& device_info) {
73
62
74
63
if (device.info .is_fp16_capable ) {
75
64
print (" | Benchmarking ... |" );
65
+ Kernel kernel_half (device, N, " kernel_half" , buffer);
76
66
for (uint i=0u ; i<N_kernel; i++) {
77
67
clock.start ();
78
68
kernel_half.run ();
@@ -85,6 +75,7 @@ void benchmark_device(const Device_Info& device_info) {
85
75
}
86
76
87
77
print (" | Benchmarking ... |" );
78
+ Kernel kernel_long (device, N, " kernel_long" , buffer);
88
79
for (uint i=0u ; i<N_kernel; i++) {
89
80
clock.start ();
90
81
kernel_long.run ();
@@ -94,6 +85,7 @@ void benchmark_device(const Device_Info& device_info) {
94
85
println (" \r | INT64 compute " +alignr (45u , to_string (flops_long, 3u ))+" TIOPs/s " +fraction (100 .0f *flops_long/device.info .tflops )+" |" );
95
86
96
87
print (" | Benchmarking ... |" );
88
+ Kernel kernel_int (device, N, " kernel_int" , buffer);
97
89
for (uint i=0u ; i<N_kernel; i++) {
98
90
clock.start ();
99
91
kernel_int.run ();
@@ -103,6 +95,7 @@ void benchmark_device(const Device_Info& device_info) {
103
95
println (" \r | INT32 compute " +alignr (45u , to_string (flops_int, 3u ))+" TIOPs/s " +fraction (100 .0f *flops_int/device.info .tflops )+" |" );
104
96
105
97
print (" | Benchmarking ... |" );
98
+ Kernel kernel_short (device, N, " kernel_short" , buffer);
106
99
for (uint i=0u ; i<N_kernel; i++) {
107
100
clock.start ();
108
101
kernel_short.run ();
@@ -112,6 +105,7 @@ void benchmark_device(const Device_Info& device_info) {
112
105
println (" \r | INT16 compute " +alignr (45u , to_string (flops_short, 3u ))+" TIOPs/s " +fraction (100 .0f *flops_short/device.info .tflops )+" |" );
113
106
114
107
print (" | Benchmarking ... |" );
108
+ Kernel kernel_char (device, N, " kernel_char" , buffer);
115
109
for (uint i=0u ; i<N_kernel; i++) {
116
110
clock.start ();
117
111
kernel_char.run ();
@@ -121,11 +115,13 @@ void benchmark_device(const Device_Info& device_info) {
121
115
println (" \r | INT8 compute " +alignr (45u , to_string (flops_char, 3u ))+" TIOPs/s " +fraction (100 .0f *flops_char/device.info .tflops )+" |" );
122
116
123
117
print (" | Benchmarking ... |" );
118
+ Kernel kernel_coalesced_write (device, N, " kernel_coalesced_write" , buffer);
124
119
for (uint i=0u ; i<N_kernel; i++) {
125
120
clock.start ();
126
121
kernel_coalesced_write.run ();
127
122
time_cw = fmin (clock.stop (), time_cw);
128
123
}
124
+ Kernel kernel_coalesced_read (device, N, " kernel_coalesced_read" , buffer);
129
125
for (uint i=0u ; i<N_kernel; i++) {
130
126
clock.start ();
131
127
kernel_coalesced_read.run ();
@@ -135,11 +131,13 @@ void benchmark_device(const Device_Info& device_info) {
135
131
println (" \r | Memory Bandwidth ( coalesced write) " +alignr (29u , to_string (4 .0f *(float )N*(float )M/(float ) time_cw *1E-9f , 2u ))+" GB/s |" );
136
132
137
133
print (" | Benchmarking ... |" );
134
+ Kernel kernel_misaligned_write (device, N, " kernel_misaligned_write" , buffer);
138
135
for (uint i=0u ; i<N_kernel; i++) {
139
136
clock.start ();
140
137
kernel_misaligned_write.run ();
141
138
time_mw = fmin (clock.stop (), time_mw);
142
139
}
140
+ Kernel kernel_misaligned_read (device, N, " kernel_misaligned_read" , buffer);
143
141
for (uint i=0u ; i<N_kernel; i++) {
144
142
clock.start ();
145
143
kernel_misaligned_read.run ();
0 commit comments