Fixed terrible performance on ARM GPUs by macro-replacing fused-multiply-add (fma) with a*b+c, added automatic OS detection in make.sh

ProjectPhysX · ProjectPhysX · commit 677d52f89db8 · 2024-05-02T22:00:51.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+bin/
+.vs/
+OpenCL-Benchmark.vcxproj.user
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ Works with any GPU in Windows, Linux, macOS and Android.
   OpenCL-Benchmark.exe
   ```
 
-### Linux
+### Linux / macOS / Android
 - Download, compile and run:
   ```
   git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git
@@ -40,21 +40,6 @@ Works with any GPU in Windows, Linux, macOS and Android.
   bin/OpenCL-Benchmark
   ```
 
-### macOS
-- Download, compile and run:
-  ```
-  git clone https://github.com/ProjectPhysX/OpenCL-Benchmark.git
-  cd OpenCL-Benchmark
-  vim make.sh
-  # press i, comment the line "compile on Linux" line with an "#", uncomment "compile on macOS" line by removing the "#" in front of the line, press Esc : w q Enter
-  chmod +x make.sh
-  ./make.sh
-  ```
-- Run
-  ```
-  bin/OpenCL-Benchmark
-  ```
-
 ### Run only for a specified list of devices
 - call `OpenCL-Benchmark.exe 0 2 5` (Windows) or `bin/OpenCL-Benchmark 0 2 5` (Linux/macOS) with the number(s) being the device IDs to be benchmarked
 
diff --git a/make.sh b/make.sh
@@ -1,10 +1,13 @@
+#!/usr/bin/env bash
 # command line argument(s): device ID(s); if empty, it will benchmark all available devices
 
 mkdir -p bin # create directory for executable
-rm -f ./bin/OpenCL-Benchmark # prevent execution of old version if compiling fails
+rm -f bin/OpenCL-Benchmark # prevent execution of old version if compiling fails
 
-g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL # compile on Linux
-#g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -framework OpenCL # compile on macOS
-#g++ ./src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL # compile on Android
+case "$(uname -a)" in # automatically detect the operating system: macOS (Darwin), Android, or Linux as the fallback
+	 Darwin*) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -framework OpenCL               ;; # compile on macOS
+	*Android) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L/system/vendor/lib64 -lOpenCL ;; # compile on Android
+	*       ) g++ src/*.cpp -o ./bin/OpenCL-Benchmark -std=c++17 -pthread -O -Wno-comment -I./src/OpenCL/include -L./src/OpenCL/lib -lOpenCL     ;; # compile on Linux
+esac
 
-./bin/OpenCL-Benchmark "$@" # run OpenCL-Benchmark
+if [[ $? == 0 ]]; then bin/OpenCL-Benchmark "$@"; fi # run OpenCL-Benchmark only if last compilation was successful
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -25,6 +25,7 @@ struct Device_Info {
 	uint clock_frequency=0u; // in MHz
 	bool is_cpu=false, is_gpu=false;
 	bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
+	bool arm_fma_patch = false;  // ARM GPUs have terrible fma performance, so replace with a*b+c
 	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
 	uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 	float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,6 +78,7 @@ struct Device_Info {
 			}
 		}
 		intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
+		arm_fma_patch = contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
 	}
 	inline Device_Info() {}; // default constructor
 };
@@ -174,6 +176,7 @@ class Device {
 		"\n	#ifdef cl_khr_int64_base_atomics"
 		"\n	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
 		"\n	#endif"
+		+(info.arm_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // ARM GPUs have terrible fma performance, so replace with a*b+c
 	;}
 public:
 	Device_Info info;

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1,3 @@` |
| | `1` | `+bin/` |
| | `2` | `+.vs/` |
| | `3` | `+OpenCL-Benchmark.vcxproj.user` |