Skip to content

Commit 9ce2cae

Browse files
committed
Fixed terrible performance on ARM GPUs by macro-replacing fused-multiply-add (fma) with a*b+c
1 parent 8ccdf11 commit 9ce2cae

File tree

2 files changed

+5
-3
lines changed

2 files changed

+5
-3
lines changed

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
153153
- fixed that voxelization failed in Intel OpenCL CPU Runtime due to array out-of-bounds access
154154
- fixed that voxelization did not always produce binary identical results in multi-GPU compared to single-GPU
155155
- fixed that velocity voxelization failed for free surface simulations
156+
- fixed terrible performance on ARM GPUs by macro-replacing fused-multiply-add (`fma`) with `a*b+c`
156157
- fixed that <kbd>Y</kbd>/<kbd>Z</kbd> keys were incorrect for `QWERTY` keyboard layout in Linux
157158
- fixed that free camera movement speed in help overlay was not updated in stationary image when scrolling
158159
- fixed that cursor would sometimes flicker when scrolling on trackpads with Linux-X11 interactive graphics
@@ -552,9 +553,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
552553
| 🔵&nbsp;UHD&nbsp;Graphics&nbsp;P630 | 0.46 | 51 | 42 | 177 (65%) | 288 (53%) | 137 (25%) |
553554
| 🔵&nbsp;HD&nbsp;Graphics&nbsp;5500 | 0.35 | 3 | 26 | 75 (45%) | 192 (58%) | 108 (32%) |
554555
| 🔵&nbsp;HD&nbsp;Graphics&nbsp;4600 | 0.38 | 2 | 26 | 105 (63%) | 115 (35%) | 34 (10%) |
555-
| 🟡&nbsp;Mali-G610&nbsp;MP4 (Orange&nbsp;Pi&nbsp;5&nbsp;Plus) | 0.06 | 16 | 34 | 43 (19%) | 59 (13%) | 19 ( 4%) |
556-
| 🟡&nbsp;Mali-G72&nbsp;MP18 (Samsung&nbsp;S9+) | 0.24 | 4 | 29 | 14 ( 7%) | 17 ( 5%) | 12 ( 3%) |
557-
| 🟡&nbsp;Qualcomm&nbsp;Adreno&nbsp;530 (LG&nbsp;G6) | 0.33 | 2 | 30 | 1 ( 1%) | 1 ( 0%) | 1 ( 0%) |
556+
| 🟡&nbsp;Mali-G72&nbsp;MP18 (Samsung&nbsp;S9+) | 0.24 | 4 | 29 | 110 (59%) | 230 (62%) | 21 ( 6%) |
558557
| | | | | | | |
559558
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654 | 29.49 | 1536 | 922 | 1381 (23%) | 1814 (15%) | 1801 (15%) |
560559
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;CPU&nbsp;Max&nbsp;9480 | 13.62 | 256 | 614 | 2037 (51%) | 1520 (19%) | 1464 (18%) |

src/opencl.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ struct Device_Info {
2525
uint clock_frequency=0u; // in MHz
2626
bool is_cpu=false, is_gpu=false;
2727
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
28+
bool arm_fma_patch = false; // ARM GPUs have terrible fma performance, so replace with a*b+c
2829
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
2930
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
3031
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,6 +78,7 @@ struct Device_Info {
7778
}
7879
}
7980
intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81+
arm_fma_patch = contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
8082
}
8183
inline Device_Info() {}; // default constructor
8284
};
@@ -174,6 +176,7 @@ class Device {
174176
"\n #ifdef cl_khr_int64_base_atomics"
175177
"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
176178
"\n #endif"
179+
+(info.arm_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // ARM GPUs have terrible fma performance, so replace with a*b+c
177180
;}
178181
public:
179182
Device_Info info;

0 commit comments

Comments
 (0)