Skip to content

Commit b91f51a

Browse files
committed
Minor cosmetics in OpenCL-Wrapper
1 parent 8ce0fdc commit b91f51a

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

src/opencl.hpp

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ struct Device_Info {
2525
uint clock_frequency=0u; // in MHz
2626
bool is_cpu=false, is_gpu=false;
2727
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
28-
bool arm_fma_patch = false; // ARM GPUs have terrible fma performance, so replace with a*b+c
28+
bool legacy_gpu_fma_patch = false; // some old GPUs have terrible fma performance, so replace with a*b+c
2929
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
3030
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
3131
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,8 +77,8 @@ struct Device_Info {
7777
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // fix wrong (98% on Windows) memory reporting on Intel Arc
7878
}
7979
}
80-
intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81-
arm_fma_patch = contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
80+
intel_gpu_above_4gb_patch = intel_gpu_above_4gb_patch||((intel==8.0f)&&(memory>4096)); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81+
legacy_gpu_fma_patch = legacy_gpu_fma_patch||contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
8282
}
8383
inline Device_Info() {}; // default constructor
8484
};
@@ -176,7 +176,7 @@ class Device {
176176
"\n #ifdef cl_khr_int64_base_atomics"
177177
"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
178178
"\n #endif"
179-
+(info.arm_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // ARM GPUs have terrible fma performance, so replace with a*b+c
179+
+(info.legacy_gpu_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // some old GPUs have terrible fma performance, so replace with a*b+c
180180
;}
181181
public:
182182
Device_Info info;

0 commit comments

Comments
 (0)