You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
host_buffer_unaligned = new T[N*(ulong)d+4160ull/sizeof(T)]; // over-allocate by (4096+64) Bytes
318
-
host_buffer = (T*)((((ulong)host_buffer_unaligned+4095ull)/4096ull)*4096ull); // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR
319
-
} else {
320
-
host_buffer = new T[N*(ulong)d];
321
-
}
316
+
const ulong alignment = allow_zero_copy&&device.info.uses_ram ? 4096ull : 64ull; // host_buffer must be aligned to 4096 Bytes for CL_MEM_USE_HOST_PTR, and to 64 Bytes for optimal enqueueReadBuffer performance on modern CPUs
317
+
const ulong padding = allow_zero_copy&&device.info.uses_ram ? 64ull : 0ull; // for CL_MEM_USE_HOST_PTR, 64 Bytes padding is required because device_buffer capacity in this case must be a multiple of 64 Bytes
318
+
host_buffer_unaligned = new T[N*(ulong)d+(alignment+padding)/sizeof(T)]; // over-allocate host_buffer_unaligned by (alignment+padding) Bytes
319
+
host_buffer = (T*)((((ulong)host_buffer_unaligned+alignment-1ull)/alignment)*alignment); // align host_buffer by fine-tuning pointer to be a multiple of alignment
322
320
initialize_auxiliary_pointers();
323
321
host_buffer_exists = true;
324
322
}
@@ -334,7 +332,7 @@ template<typename T> class Memory {
334
332
device_buffer = cl::Buffer( // if(is_zero_copy) { don't allocate extra memory on CPUs/iGPUs } else { allocate VRAM on GPUs }
335
333
device.get_cl_context(),
336
334
CL_MEM_READ_WRITE|((int)is_zero_copy*CL_MEM_USE_HOST_PTR)|((int)device.info.patch_intel_gpu_above_4gb<<23), // for Intel GPUs set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
337
-
is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), //buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR
335
+
is_zero_copy ? ((capacity()+63ull)/64ull)*64ull : capacity(), //device_buffer capacity must be a multiple of 64 Bytes for CL_MEM_USE_HOST_PTR
338
336
is_zero_copy ? (void*)host_buffer : nullptr,
339
337
&error
340
338
);
@@ -423,12 +421,8 @@ template<typename T> class Memory {
0 commit comments