Skip to content

Commit 2c3173c

Browse files
committed
Updated OpenCL-Wrapper
1 parent 677d52f commit 2c3173c

File tree

2 files changed

+27 -25 lines changed

2 files changed

+27 -25 lines changed

src/main.cpp

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -33,24 +33,12 @@ void benchmark_device(const Device_Info& device_info) {
3333
;
3434
print("| Compiling ... |");
3535
Device device(device_info, defines+get_opencl_c_code());
36-
3736
Memory<float> buffer(device, N, M);
38-
Kernel kernel_double(device, N, "kernel_double", buffer);
39-
Kernel kernel_float(device, N, "kernel_float", buffer);
40-
Kernel kernel_half(device, N, "kernel_half", buffer);
41-
Kernel kernel_long(device, N, "kernel_long", buffer);
42-
Kernel kernel_int(device, N, "kernel_int", buffer);
43-
Kernel kernel_short(device, N, "kernel_short", buffer);
44-
Kernel kernel_char(device, N, "kernel_char", buffer);
45-
Kernel kernel_coalesced_write(device, N, "kernel_coalesced_write" , buffer);
46-
Kernel kernel_coalesced_read(device, N, "kernel_coalesced_read" , buffer);
47-
Kernel kernel_misaligned_write(device, N, "kernel_misaligned_write", buffer);
48-
Kernel kernel_misaligned_read(device, N, "kernel_misaligned_read" , buffer);
49-
5037
//print_info("Device mormory usage: "+to_string(device.info.memory_used)+" MB");
5138

5239
if(device.info.is_fp64_capable) {
5340
print("| Benchmarking ... |");
41+
Kernel kernel_double(device, N, "kernel_double", buffer);
5442
for(uint i=0u; i<N_kernel; i++) {
5543
clock.start();
5644
kernel_double.run();
@@ -63,6 +51,7 @@ void benchmark_device(const Device_Info& device_info) {
6351
}
6452

6553
print("| Benchmarking ... |");
54+
Kernel kernel_float(device, N, "kernel_float", buffer);
6655
for(uint i=0u; i<N_kernel; i++) {
6756
clock.start();
6857
kernel_float.run();
@@ -73,6 +62,7 @@ void benchmark_device(const Device_Info& device_info) {
7362

7463
if(device.info.is_fp16_capable) {
7564
print("| Benchmarking ... |");
65+
Kernel kernel_half(device, N, "kernel_half", buffer);
7666
for(uint i=0u; i<N_kernel; i++) {
7767
clock.start();
7868
kernel_half.run();
@@ -85,6 +75,7 @@ void benchmark_device(const Device_Info& device_info) {
8575
}
8676

8777
print("| Benchmarking ... |");
78+
Kernel kernel_long(device, N, "kernel_long", buffer);
8879
for(uint i=0u; i<N_kernel; i++) {
8980
clock.start();
9081
kernel_long.run();
@@ -94,6 +85,7 @@ void benchmark_device(const Device_Info& device_info) {
9485
println("\r| INT64 compute "+alignr(45u, to_string(flops_long, 3u))+" TIOPs/s "+fraction(100.0f*flops_long/device.info.tflops)+" |");
9586

9687
print("| Benchmarking ... |");
88+
Kernel kernel_int(device, N, "kernel_int", buffer);
9789
for(uint i=0u; i<N_kernel; i++) {
9890
clock.start();
9991
kernel_int.run();
@@ -103,6 +95,7 @@ void benchmark_device(const Device_Info& device_info) {
10395
println("\r| INT32 compute "+alignr(45u, to_string(flops_int, 3u))+" TIOPs/s "+fraction(100.0f*flops_int/device.info.tflops)+" |");
10496

10597
print("| Benchmarking ... |");
98+
Kernel kernel_short(device, N, "kernel_short", buffer);
10699
for(uint i=0u; i<N_kernel; i++) {
107100
clock.start();
108101
kernel_short.run();
@@ -112,6 +105,7 @@ void benchmark_device(const Device_Info& device_info) {
112105
println("\r| INT16 compute "+alignr(45u, to_string(flops_short, 3u))+" TIOPs/s "+fraction(100.0f*flops_short/device.info.tflops)+" |");
113106

114107
print("| Benchmarking ... |");
108+
Kernel kernel_char(device, N, "kernel_char", buffer);
115109
for(uint i=0u; i<N_kernel; i++) {
116110
clock.start();
117111
kernel_char.run();
@@ -121,11 +115,13 @@ void benchmark_device(const Device_Info& device_info) {
121115
println("\r| INT8 compute "+alignr(45u, to_string(flops_char, 3u))+" TIOPs/s "+fraction(100.0f*flops_char/device.info.tflops)+" |");
122116

123117
print("| Benchmarking ... |");
118+
Kernel kernel_coalesced_write(device, N, "kernel_coalesced_write" , buffer);
124119
for(uint i=0u; i<N_kernel; i++) {
125120
clock.start();
126121
kernel_coalesced_write.run();
127122
time_cw = fmin(clock.stop(), time_cw);
128123
}
124+
Kernel kernel_coalesced_read(device, N, "kernel_coalesced_read" , buffer);
129125
for(uint i=0u; i<N_kernel; i++) {
130126
clock.start();
131127
kernel_coalesced_read.run();
@@ -135,11 +131,13 @@ void benchmark_device(const Device_Info& device_info) {
135131
println("\r| Memory Bandwidth ( coalesced write) "+alignr(29u, to_string(4.0f*(float)N*(float)M/(float) time_cw *1E-9f, 2u))+" GB/s |");
136132

137133
print("| Benchmarking ... |");
134+
Kernel kernel_misaligned_write(device, N, "kernel_misaligned_write", buffer);
138135
for(uint i=0u; i<N_kernel; i++) {
139136
clock.start();
140137
kernel_misaligned_write.run();
141138
time_mw = fmin(clock.stop(), time_mw);
142139
}
140+
Kernel kernel_misaligned_read(device, N, "kernel_misaligned_read" , buffer);
143141
for(uint i=0u; i<N_kernel; i++) {
144142
clock.start();
145143
kernel_misaligned_read.run();

src/opencl.hpp

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -339,14 +339,10 @@ template<typename T> class Memory {
339339
delete_host_buffer();
340340
}
341341
inline void reset(const T value=(T)0) {
342-
if(host_buffer_exists) {
343-
std::fill(host_buffer, host_buffer+range(), value); // faster than "for(ulong i=0ull; i<range(); i++) host_buffer[i] = value;"
344-
}
342+
//if(device_buffer_exists) cl_queue.enqueueFillBuffer(device_buffer, value, 0ull, capacity()); // faster than "write_to_device();"
343+
if(host_buffer_exists) std::fill(host_buffer, host_buffer+range(), value); // faster than "for(ulong i=0ull; i<range(); i++) host_buffer[i] = value;"
345344
write_to_device(); // enqueueFillBuffer is broken for large buffers on Nvidia GPUs!
346-
//if(device_buffer_exists) {
347-
// cl_queue.enqueueFillBuffer(device_buffer, value, 0ull, capacity()); // faster than "write_to_device();"
348-
// cl_queue.finish();
349-
//}
345+
//if(device_buffer_exists) cl_queue.finish();
350346
}
351347
inline const ulong length() const { return N; }
352348
inline const uint dimensions() const { return d; }
@@ -466,14 +462,21 @@ class Kernel {
466462
private:
467463
ulong N = 0ull; // kernel range
468464
uint number_of_parameters = 0u;
465+
string name = "";
469466
cl::Kernel cl_kernel;
470467
cl::NDRange cl_range_global, cl_range_local;
471468
cl::CommandQueue cl_queue;
469+
inline void check_for_errors(const int error) {
470+
if(error==-48) print_error("There is no OpenCL kernel with name \""+name+"(...)\" in the OpenCL C code! Check spelling!");
471+
if(error<-48&&error>-53) print_error("Parameters for OpenCL kernel \""+name+"(...)\" don't match between C++ and OpenCL C!");
472+
if(error==-54) print_error("Workgrop size "+to_string(WORKGROUP_SIZE)+" for OpenCL kernel \""+name+"(...)\" is invalid!");
473+
if(error!=0) print_error("OpenCL kernel \""+name+"(...)\" failed with error code "+to_string(error)+"!");
474+
}
472475
template<typename T> inline void link_parameter(const uint position, const Memory<T>& memory) {
473-
cl_kernel.setArg(position, memory.get_cl_buffer());
476+
check_for_errors(cl_kernel.setArg(position, memory.get_cl_buffer()));
474477
}
475478
template<typename T> inline void link_parameter(const uint position, const T& constant) {
476-
cl_kernel.setArg(position, sizeof(T), (void*)&constant);
479+
check_for_errors(cl_kernel.setArg(position, sizeof(T), (void*)&constant));
477480
}
478481
inline void link_parameters(const uint starting_position) {
479482
number_of_parameters = max(number_of_parameters, starting_position);
@@ -484,14 +487,15 @@ class Kernel {
484487
}
485488
public:
486489
template<class... T> inline Kernel(const Device& device, const ulong N, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
487-
if(!device.is_initialized()) print_error("No Device selected. Call Device constructor.");
490+
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
491+
this->name = name;
488492
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
489493
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
490494
set_ranges(N);
491495
cl_queue = device.get_cl_queue();
492496
}
493497
template<class... T> inline Kernel(const Device& device, const ulong N, const uint workgroup_size, const string& name, const T&... parameters) { // accepts Memory<T> objects and fundamental data type constants
494-
if(!device.is_initialized()) print_error("No Device selected. Call Device constructor.");
498+
if(!device.is_initialized()) print_error("No OpenCL Device selected. Call Device constructor.");
495499
cl_kernel = cl::Kernel(device.get_cl_program(), name.c_str());
496500
link_parameters(number_of_parameters, parameters...); // expand variadic template to link kernel parameters
497501
set_ranges(N, (ulong)workgroup_size);
@@ -516,7 +520,7 @@ class Kernel {
516520
}
517521
inline Kernel& enqueue_run(const uint t=1u, const vector<Event>* event_waitlist=nullptr, Event* event_returned=nullptr) {
518522
for(uint i=0u; i<t; i++) {
519-
cl_queue.enqueueNDRangeKernel(cl_kernel, cl::NullRange, cl_range_global, cl_range_local, event_waitlist, event_returned);
523+
check_for_errors(cl_queue.enqueueNDRangeKernel(cl_kernel, cl::NullRange, cl_range_global, cl_range_local, event_waitlist, event_returned));
520524
}
521525
return *this;
522526
}

0 commit comments

Comments
 (0)