@@ -25,7 +25,7 @@ struct Device_Info {
25
25
uint clock_frequency=0u ; // in MHz
26
26
bool is_cpu=false , is_gpu=false ;
27
27
bool intel_gpu_above_4gb_patch = false ; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
28
- bool arm_fma_patch = false ; // ARM GPUs have terrible fma performance, so replace with a*b+c
28
+ bool legacy_gpu_fma_patch = false ; // some old GPUs have terrible fma performance, so replace with a*b+c
29
29
uint is_fp64_capable=0u , is_fp32_capable=0u , is_fp16_capable=0u , is_int64_capable=0u , is_int32_capable=0u , is_int16_capable=0u , is_int8_capable=0u ;
30
30
uint cores=0u ; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
31
31
float tflops=0 .0f ; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,8 +77,8 @@ struct Device_Info {
77
77
memory = (uint)((cl_device.getInfo <CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull /49ull )/1048576ull ); // fix wrong (98% on Windows) memory reporting on Intel Arc
78
78
}
79
79
}
80
- intel_gpu_above_4gb_patch = ( intel==8 .0f )&&(memory>4096 ); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81
- arm_fma_patch = contains (to_lower (vendor), " arm" ); // enable for all ARM GPUs
80
+ intel_gpu_above_4gb_patch = intel_gpu_above_4gb_patch||(( intel==8 .0f )&&(memory>4096 ) ); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81
+ legacy_gpu_fma_patch = legacy_gpu_fma_patch|| contains (to_lower (vendor), " arm" ); // enable for all ARM GPUs
82
82
}
83
83
// Default constructor: empty body and no initializer list, so each member keeps its in-class default initializer.
inline Device_Info () {}; // default constructor
84
84
};
@@ -176,7 +176,7 @@ class Device {
176
176
" \n #ifdef cl_khr_int64_base_atomics"
177
177
" \n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
178
178
" \n #endif"
179
- +(info.arm_fma_patch ? " \n #define fma(a, b, c) ((a)*(b)+(c))" : " " ) // ARM GPUs have terrible fma performance, so replace with a*b+c
179
+ +(info.legacy_gpu_fma_patch ? " \n #define fma(a, b, c) ((a)*(b)+(c))" : " " ) // some old GPUs have terrible fma performance, so replace with a*b+c
180
180
;}
181
181
public:
182
182
Device_Info info;
0 commit comments