Skip to content

Commit 677d52f

Browse files
committed
Fixed terrible performance on ARM GPUs by macro-replacing fused-multiply-add (fma) with a*b+c, added automatic OS detection in make.sh
1 parent 8137aea commit 677d52f

File tree

4 files changed

+15
-21
lines changed

4 files changed

+15
-21
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
bin/
2+
.vs/
3+
OpenCL-Benchmark.vcxproj.user

README.md

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Works with any GPU in Windows, Linux, macOS and Android.
2727
OpenCL-Benchmark.exe
2828
```
2929

30-
### Linux
30+
### Linux / macOS / Android
3131
- Download, compile and run:
3232
```
3333
git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git
@@ -40,21 +40,6 @@ Works with any GPU in Windows, Linux, macOS and Android.
4040
bin/OpenCL-Benchmark
4141
```
4242

43-
### macOS
44-
- Download, compile and run:
45-
```
46-
git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git
47-
cd OpenCL-Benchmark
48-
vim make.sh
49-
# press i, comment out the "compile on Linux" line with a "#", uncomment the "compile on macOS" line by removing the "#" in front of it, then press Esc : w q Enter
50-
chmod +x make.sh
51-
./make.sh
52-
```
53-
- Run
54-
```
55-
bin/OpenCL-Benchmark
56-
```
57-
5843
### Run only for a specified list of devices
5944
- call `OpenCL-Benchmark.exe 0 2 5` (Windows) or `bin/OpenCL-Benchmark 0 2 5` (Linux/macOS) with the number(s) being the device IDs to be benchmarked
6045

make.sh

100644100755
Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
#!/usr/bin/env bash
12
# command line argument(s): device ID(s); if empty, it will benchmark all available devices
23

34
mkdir -p bin # create directory for executable
4-
rm -f ./bin/OpenCL-Benchmark # prevent execution of old version if compiling fails
5+
rm -f bin/OpenCL-Benchmark # prevent execution of old version if compiling fails
56

6-
g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL # compile on Linux
7-
#g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -framework OpenCL # compile on macOS
8-
#g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL # compile on Android
7+
case "$(uname -a)" in # automatically detect operating system (macOS, Android, or Linux as the fallback)
8+
Darwin*) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -framework OpenCL ;; # compile on macOS
9+
*Android) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL ;; # compile on Android
10+
* ) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL ;; # compile on Linux
11+
esac
912

10-
./bin/OpenCL-Benchmark "$@" # run OpenCL-Benchmark
13+
if [[ $? == 0 ]]; then bin/OpenCL-Benchmark "$@"; fi # run OpenCL-Benchmark only if last compilation was successful

src/opencl.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ struct Device_Info {
2525
uint clock_frequency=0u; // in MHz
2626
bool is_cpu=false, is_gpu=false;
2727
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
28+
bool arm_fma_patch = false; // ARM GPUs have terrible fma performance, so replace with a*b+c
2829
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
2930
uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
3031
float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,6 +78,7 @@ struct Device_Info {
7778
}
7879
}
7980
intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
81+
arm_fma_patch = contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
8082
}
8183
inline Device_Info() {}; // default constructor
8284
};
@@ -174,6 +176,7 @@ class Device {
174176
"\n #ifdef cl_khr_int64_base_atomics"
175177
"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
176178
"\n #endif"
179+
+(info.arm_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // ARM GPUs have terrible fma performance, so replace with a*b+c
177180
;}
178181
public:
179182
Device_Info info;

0 commit comments

Comments
 (0)