Enabled basic FP16 vector arithmetic support on Nvidia Pascal and newer GPUs with driver 520 or newer

ProjectPhysX · ProjectPhysX · commit 6bfc415a6c1b · 2024-08-18T10:18:15.000+02:00
diff --git a/src/kernel.cpp b/src/kernel.cpp
@@ -30,19 +30,19 @@ kernel void kernel_half(global float* data) {
 	half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0));
 	half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0));
 	for(uint i=0u; i<512u; i++) {
-		x = fma(y, x, y);
-		y = fma(x, y, x);
+		x = y*x+y;
+		y = x*y+x;
 	}
-	data[get_global_id(0)] = as_float(y);
+	data[get_global_id(0)] = (float)y.x+(float)y.y;
 }
 )+"#endif"+R( // cl_khr_fp16
 
 kernel void kernel_long(global float* data) {
 	long x = (long)get_global_id(0);
 	long y = (long)get_local_id(0);
 	for(uint i=0u; i<8u; i++) {
-		x = (y*x)+y;
-		y = (x*y)+x;
+		x = y*x+y;
+		y = x*y+x;
 	}
 	data[get_global_id(0)] = as_float((int)y);
 }
@@ -51,8 +51,8 @@ kernel void kernel_int(global float* data) {
 	int x = get_global_id(0);
 	int y = get_local_id(0);
 	for(uint i=0u; i<512u; i++) {
-		x = (y*x)+y;
-		y = (x*y)+x;
+		x = y*x+y;
+		y = x*y+x;
 	}
 	data[get_global_id(0)] = as_float(y);
 }
@@ -61,8 +61,8 @@ kernel void kernel_short(global float* data) {
 	short2 x = as_short2((int)get_global_id(0));
 	short2 y = as_short2((int)get_local_id(0));
 	for(uint i=0u; i<128u; i++) {
-		x = (y*x)+y;
-		y = (x*y)+x;
+		x = y*x+y;
+		y = x*y+x;
 	}
 	data[get_global_id(0)] = as_float(y);
 }
@@ -71,8 +71,8 @@ kernel void kernel_char(global float* data) {
 	char4 x = as_char4((int)get_global_id(0));
 	char4 y = as_char4((int)get_local_id(0));
 	for(uint i=0u; i<64u; i++) {
-		x = (y*x)+y;
-		y = (x*y)+x;
+		x = y*x+y;
+		y = x*y+x;
 	}
 	data[get_global_id(0)] = as_float(y);
 }
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -58,19 +58,19 @@ export OCLV="2024.18.6.0.02_rel"
 export TBBV="2021.13.0"
 sudo apt update && sudo apt upgrade -y
 sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
-sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
-sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
-sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
-sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
-sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
+sudo mkdir -p ~/cpurt /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
+sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
+sudo wget -P ~/cpurt https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
+sudo tar -zxvf ~/cpurt/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
+sudo tar -zxvf ~/cpurt/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
 echo /opt/intel/oclcpuexp_${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
 echo /opt/intel/oclcpuexp_${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
 sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLV}/x64
 sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLV}/x64
 sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLV}/x64
 sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLV}/x64
 sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
-sudo rm -r ~/cpuruntime
+sudo rm -r ~/cpurt
 
 )"+string("\033[33m")+R"(.-----------------------------------------------------------------------------.
 | CPU Option 2: PoCL                                                          |
@@ -94,8 +94,9 @@ struct Device_Info {
 	uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
 	uint clock_frequency = 0u; // in MHz
 	bool is_cpu=false, is_gpu=false;
-	bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
-	bool legacy_gpu_fma_patch = false; // some old GPUs have terrible fma performance, so replace with a*b+c
+	bool patch_nvidia_fp16 = false; // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
+	bool patch_intel_gpu_above_4gb = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
+	bool patch_legacy_gpu_fma = false; // some old GPUs have terrible fma performance, so replace with a*b+c
 	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
 	uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 	float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -147,8 +148,10 @@ struct Device_Info {
 				memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // fix wrong (98% on Windows) memory reporting on Intel Arc
 			}
 		}
-		intel_gpu_above_4gb_patch = intel_gpu_above_4gb_patch||((intel==8.0f)&&(memory>4096)); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
-		legacy_gpu_fma_patch = legacy_gpu_fma_patch||contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
+		patch_nvidia_fp16 = patch_nvidia_fp16||(nvidia>0.0f&&atof(driver_version.substr(0, 6).c_str())>=520.00&&!nvidia_192_cores_per_cu&&!contains_any(to_lower(name), {"gtx 8", "gtx 9", "quadro m", "tesla m", "gtx titan"})); // enable for all Nvidia GPUs with driver>=520.00 except Kepler and Maxwell
+		patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||((intel==8.0f)&&(memory>4096)); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
+		patch_legacy_gpu_fma = patch_legacy_gpu_fma||arm>0.0f; // enable for all ARM GPUs
+		if(patch_nvidia_fp16) is_fp16_capable = 2u;
 	}
 	inline Device_Info() {}; // default constructor
 };
@@ -176,13 +179,8 @@ inline void print_device_info(const Device_Info& d) { // print OpenCL device inf
 	println("|----------------'------------------------------------------------------------|");
 }
 inline vector<Device_Info> get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices
-#if defined(_WIN32)
-	(void)_putenv((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
-	(void)_putenv((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
-#elif defined(__linux__)
-	(void) putenv((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
-	(void) putenv((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
-#endif // Linux
+	set_environment_variable((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
+	set_environment_variable((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
 	vector<Device_Info> devices; // get all devices of all platforms
 	vector<cl::Platform> cl_platforms; // get all platforms (drivers)
 	cl::Platform::get(&cl_platforms);
@@ -248,17 +246,18 @@ class Device {
 	cl::CommandQueue cl_queue;
 	bool exists = false;
 	inline string enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available
-		"\n	#define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
-		"\n	#ifdef cl_khr_fp64"
-		"\n	#pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
-		"\n	#endif"
-		"\n	#ifdef cl_khr_fp16"
-		"\n	#pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
-		"\n	#endif"
-		"\n	#ifdef cl_khr_int64_base_atomics"
-		"\n	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
-		"\n	#endif"
-		+(info.legacy_gpu_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // some old GPUs have terrible fma performance, so replace with a*b+c
+		string(info.patch_nvidia_fp16    ? "\n #define cl_khr_fp16"                : "")+ // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
+		string(info.patch_legacy_gpu_fma ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "")+ // some old GPUs have terrible fma performance, so replace with a*b+c
+		"\n #define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
+		"\n #ifdef cl_khr_fp64"
+		"\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
+		"\n #endif"
+		"\n #ifdef cl_khr_fp16"
+		"\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
+		"\n #endif"
+		"\n #ifdef cl_khr_int64_base_atomics"
+		"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
+		"\n #endif"
 	;}
 public:
 	Device_Info info;
@@ -270,7 +269,7 @@ class Device {
 		const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code;
 		cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
 		this->cl_program = cl::Program(info.cl_context, cl_source);
-		const string build_options = string("-cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable")+(info.intel_gpu_above_4gb_patch ? " -cl-intel-greater-than-4GB-buffer-required" : "");
+		const string build_options = string("-cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable")+(info.patch_intel_gpu_above_4gb ? " -cl-intel-greater-than-4GB-buffer-required" : "");
 #ifndef LOG
 		int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
 		if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
@@ -320,7 +319,7 @@ template<typename T> class Memory {
 			device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
 			if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
 			int error = 0;
-			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.intel_gpu_above_4gb_patch<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
+			device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.patch_intel_gpu_above_4gb<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
 			if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
 			else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
 			device_buffer_exists = true;
diff --git a/src/utilities.hpp b/src/utilities.hpp
@@ -587,7 +587,7 @@ inline string replace_regex(const string& s, const string& from, const string& t
 inline bool is_number(const string& s) {
 	return equals_regex(s, "\\d+(u|l|ul|ll|ull)?")||equals_regex(s, "0x(\\d|[a-fA-F])+(u|l|ul|ll|ull)?")||equals_regex(s, "0b[01]+(u|l|ul|ll|ull)?")||equals_regex(s, "(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)");
 }
-inline void print_message(const string& message, const string& keyword="", const int colons=true) { // print formatted message
+inline void print_message(const string& message, const string& keyword="", const int keyword_color=-1, const int colons=true) { // print formatted message
 	const uint k=length(keyword)+2u, w=CONSOLE_WIDTH-4u-k;
 	string p=colons?": ":"  ", f="";
 	for(uint j=0u; j<k; j++) f += " ";
@@ -714,6 +714,14 @@ inline void print_info(const string& s) { // print info message
 }
 #endif // UTILITIES_REGEX
 
+inline void set_environment_variable(char* s) { // set a process environment variable from a single "VARIABLE=VALUE" string; usage: set_environment_variable((char*)"VARIABLE=VALUE");
+#if defined(_WIN32)
+	(void)_putenv(s); // Windows branch; the return value is deliberately discarded, so a failure goes unnoticed
+#elif defined(__linux__)
+	(void) putenv(s); // Linux branch; the return value is deliberately discarded; NOTE(review): POSIX putenv() presumably keeps the passed pointer instead of copying, so s should stay valid afterwards (a string literal as in the usage example is fine) - confirm
+#endif // _WIN32 / __linux__; on any other platform neither branch is compiled and this function does nothing
+}
+
 #ifdef UTILITIES_FILE
 #include <fstream> // read/write files
 #ifndef UTILITIES_NO_CPP17

Original file line number	Diff line number	Diff line change
`@@ -30,19 +30,19 @@ kernel void kernel_half(global float* data) {`
`30`	`30`	`half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0));`
`31`	`31`	`half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0));`
`32`	`32`	`for(uint i=0u; i<512u; i++) {`
`33`		`- x = fma(y, x, y);`
`34`		`- y = fma(x, y, x);`
	`33`	`+ x = y*x+y;`
	`34`	`+ y = x*y+x;`
`35`	`35`	`}`
`36`		`- data[get_global_id(0)] = as_float(y);`
	`36`	`+ data[get_global_id(0)] = (float)y.x+(float)y.y;`
`37`	`37`	`}`
`38`	`38`	`)+"#endif"+R( // cl_khr_fp16`
`39`	`39`
`40`	`40`	`kernel void kernel_long(global float* data) {`
`41`	`41`	`long x = (long)get_global_id(0);`
`42`	`42`	`long y = (long)get_local_id(0);`
`43`	`43`	`for(uint i=0u; i<8u; i++) {`
`44`		`- x = (y*x)+y;`
`45`		`- y = (x*y)+x;`
	`44`	`+ x = y*x+y;`
	`45`	`+ y = x*y+x;`
`46`	`46`	`}`
`47`	`47`	`data[get_global_id(0)] = as_float((int)y);`
`48`	`48`	`}`
`@@ -51,8 +51,8 @@ kernel void kernel_int(global float* data) {`
`51`	`51`	`int x = get_global_id(0);`
`52`	`52`	`int y = get_local_id(0);`
`53`	`53`	`for(uint i=0u; i<512u; i++) {`
`54`		`- x = (y*x)+y;`
`55`		`- y = (x*y)+x;`
	`54`	`+ x = y*x+y;`
	`55`	`+ y = x*y+x;`
`56`	`56`	`}`
`57`	`57`	`data[get_global_id(0)] = as_float(y);`
`58`	`58`	`}`
`@@ -61,8 +61,8 @@ kernel void kernel_short(global float* data) {`
`61`	`61`	`short2 x = as_short2((int)get_global_id(0));`
`62`	`62`	`short2 y = as_short2((int)get_local_id(0));`
`63`	`63`	`for(uint i=0u; i<128u; i++) {`
`64`		`- x = (y*x)+y;`
`65`		`- y = (x*y)+x;`
	`64`	`+ x = y*x+y;`
	`65`	`+ y = x*y+x;`
`66`	`66`	`}`
`67`	`67`	`data[get_global_id(0)] = as_float(y);`
`68`	`68`	`}`
`@@ -71,8 +71,8 @@ kernel void kernel_char(global float* data) {`
`71`	`71`	`char4 x = as_char4((int)get_global_id(0));`
`72`	`72`	`char4 y = as_char4((int)get_local_id(0));`
`73`	`73`	`for(uint i=0u; i<64u; i++) {`
`74`		`- x = (y*x)+y;`
`75`		`- y = (x*y)+x;`
	`74`	`+ x = y*x+y;`
	`75`	`+ y = x*y+x;`
`76`	`76`	`}`
`77`	`77`	`data[get_global_id(0)] = as_float(y);`
`78`	`78`	`}`