Skip to content

Commit 6d6eccd

Browse files
committed
Faster enqueueReadBuffer on modern CPUs with 64-Byte-aligned host_buffer (thanks to @pioto1225)
1 parent c89be69 commit 6d6eccd

File tree

1 file changed

+7
-13
lines changed

1 file changed

+7
-13
lines changed

src/opencl.hpp

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -313,12 +313,10 @@ template<typename T> class Memory {
313313
}
314314
inline void allocate_host_buffer(Device& device, const bool allocate_host, const bool allow_zero_copy) {
315315
if(allocate_host) {
316-
if(allow_zero_copy&&device.info.uses_ram) {
317-
host_buffer_unaligned = new T[N*(ulong)d+4160ull/sizeof(T)]; // over-allocate by (4096+64) Bytes
318-
host_buffer = (T*)((((ulong)host_buffer_unaligned+4095ull)/4096ull)*4096ull); // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR
319-
} else {
320-
host_buffer = new T[N*(ulong)d];
321-
}
316+
const ulong alignment = allow_zero_copy&&device.info.uses_ram ? 4096ull : 64ull; // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR, and to 64 Bytes for optimal enqueueReadBuffer performance on modern CPUs
317+
const ulong padding = allow_zero_copy&&device.info.uses_ram ? 64ull : 0ull; // for CL_MEM_USE_HOST_PTR, 64 Bytes padding is required because device_buffer capacity in this case must be a multiple of 64 Bytes
318+
host_buffer_unaligned = new T[N*(ulong)d+(alignment+padding)/sizeof(T)]; // over-allocate host_buffer_unaligned by (alignment+padding) Bytes
319+
host_buffer = (T*)((((ulong)host_buffer_unaligned+alignment-1ull)/alignment)*alignment); // align host_buffer by fine-tuning pointer to be a multiple of alignment
322320
initialize_auxiliary_pointers();
323321
host_buffer_exists = true;
324322
}
@@ -334,7 +332,7 @@ template<typename T> class Memory {
334332
device_buffer = cl::Buffer( // if(is_zero_copy) { don't allocate extra memory on CPUs/iGPUs } else { allocate VRAM on GPUs }
335333
device.get_cl_context(),
336334
CL_MEM_READ_WRITE|((int)is_zero_copy*CL_MEM_USE_HOST_PTR)|((int)device.info.patch_intel_gpu_above_4gb<<23), // for Intel GPUs set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
337-
is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), // buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR
335+
is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), // device_buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR
338336
is_zero_copy ? (void*)host_buffer : nullptr,
339337
&error
340338
);
@@ -423,12 +421,8 @@ template<typename T> class Memory {
423421
inline void delete_host_buffer() {
424422
host_buffer_exists = false;
425423
if(!external_host_buffer) {
426-
if(host_buffer_unaligned!=nullptr) {
427-
host_buffer = nullptr;
428-
delete[] host_buffer_unaligned;
429-
} else {
430-
delete[] host_buffer;
431-
}
424+
host_buffer = nullptr;
425+
delete[] host_buffer_unaligned;
432426
}
433427
if(!device_buffer_exists) {
434428
N = 0ull;

0 commit comments

Comments
 (0)