Skip to content

Commit 6bfc415

Browse files
committed
Enabled basic FP16 vector arithmetic support on Nvidia Pascal and newer GPUs with driver 520 or newer
1 parent 5df9952 commit 6bfc415

File tree

3 files changed

+49
-42
lines changed

3 files changed

+49
-42
lines changed

src/kernel.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,19 @@ kernel void kernel_half(global float* data) {
3030
half2 x = (half2)((float)get_global_id(0), (float)get_local_id(0));
3131
half2 y = (half2)((float)get_local_id(0), (float)get_global_id(0));
3232
for(uint i=0u; i<512u; i++) {
33-
x = fma(y, x, y);
34-
y = fma(x, y, x);
33+
x = y*x+y;
34+
y = x*y+x;
3535
}
36-
data[get_global_id(0)] = as_float(y);
36+
data[get_global_id(0)] = (float)y.x+(float)y.y;
3737
}
3838
)+"#endif"+R( // cl_khr_fp16
3939

4040
kernel void kernel_long(global float* data) {
4141
long x = (long)get_global_id(0);
4242
long y = (long)get_local_id(0);
4343
for(uint i=0u; i<8u; i++) {
44-
x = (y*x)+y;
45-
y = (x*y)+x;
44+
x = y*x+y;
45+
y = x*y+x;
4646
}
4747
data[get_global_id(0)] = as_float((int)y);
4848
}
@@ -51,8 +51,8 @@ kernel void kernel_int(global float* data) {
5151
int x = get_global_id(0);
5252
int y = get_local_id(0);
5353
for(uint i=0u; i<512u; i++) {
54-
x = (y*x)+y;
55-
y = (x*y)+x;
54+
x = y*x+y;
55+
y = x*y+x;
5656
}
5757
data[get_global_id(0)] = as_float(y);
5858
}
@@ -61,8 +61,8 @@ kernel void kernel_short(global float* data) {
6161
short2 x = as_short2((int)get_global_id(0));
6262
short2 y = as_short2((int)get_local_id(0));
6363
for(uint i=0u; i<128u; i++) {
64-
x = (y*x)+y;
65-
y = (x*y)+x;
64+
x = y*x+y;
65+
y = x*y+x;
6666
}
6767
data[get_global_id(0)] = as_float(y);
6868
}
@@ -71,8 +71,8 @@ kernel void kernel_char(global float* data) {
7171
char4 x = as_char4((int)get_global_id(0));
7272
char4 y = as_char4((int)get_local_id(0));
7373
for(uint i=0u; i<64u; i++) {
74-
x = (y*x)+y;
75-
y = (x*y)+x;
74+
x = y*x+y;
75+
y = x*y+x;
7676
}
7777
data[get_global_id(0)] = as_float(y);
7878
}

src/opencl.hpp

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -58,19 +58,19 @@ export OCLV="2024.18.6.0.02_rel"
5858
export TBBV="2021.13.0"
5959
sudo apt update && sudo apt upgrade -y
6060
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
61-
sudo mkdir -p ~/cpuruntime /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
62-
sudo wget -P ~/cpuruntime https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
63-
sudo wget -P ~/cpuruntime https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
64-
sudo tar -zxvf ~/cpuruntime/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
65-
sudo tar -zxvf ~/cpuruntime/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
61+
sudo mkdir -p ~/cpurt /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
62+
sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
63+
sudo wget -P ~/cpurt https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
64+
sudo tar -zxvf ~/cpurt/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
65+
sudo tar -zxvf ~/cpurt/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel
6666
echo /opt/intel/oclcpuexp_${OCLV}/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd
6767
echo /opt/intel/oclcpuexp_${OCLV}/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf
6868
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so /opt/intel/oclcpuexp_${OCLV}/x64
6969
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so /opt/intel/oclcpuexp_${OCLV}/x64
7070
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbb.so.12 /opt/intel/oclcpuexp_${OCLV}/x64
7171
sudo ln -sf /opt/intel/oneapi-tbb-${TBBV}/lib/intel64/gcc4.8/libtbbmalloc.so.2 /opt/intel/oclcpuexp_${OCLV}/x64
7272
sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
73-
sudo rm -r ~/cpuruntime
73+
sudo rm -r ~/cpurt
7474

7575
)"+string("\033[33m")+R"(.-----------------------------------------------------------------------------.
7676
| CPU Option 2: PoCL |
@@ -94,8 +94,9 @@ struct Device_Info {
9494
uint compute_units = 0u; // compute units (CUs) can contain multiple cores depending on the microarchitecture
9595
uint clock_frequency = 0u; // in MHz
9696
bool is_cpu=false, is_gpu=false;
97-
bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
98-
bool legacy_gpu_fma_patch = false; // some old GPUs have terrible fma performance, so replace with a*b+c
97+
bool patch_nvidia_fp16 = false; // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
98+
bool patch_intel_gpu_above_4gb = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
99+
bool patch_legacy_gpu_fma = false; // some old GPUs have terrible fma performance, so replace with a*b+c
99100
uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
100101
uint cores = 0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
101102
float tflops = 0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -147,8 +148,10 @@ struct Device_Info {
147148
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // fix wrong (98% on Windows) memory reporting on Intel Arc
148149
}
149150
}
150-
intel_gpu_above_4gb_patch = intel_gpu_above_4gb_patch||((intel==8.0f)&&(memory>4096)); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
151-
legacy_gpu_fma_patch = legacy_gpu_fma_patch||contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
151+
patch_nvidia_fp16 = patch_nvidia_fp16||(nvidia>0.0f&&atof(driver_version.substr(0, 6).c_str())>=520.00&&!nvidia_192_cores_per_cu&&!contains_any(to_lower(name), {"gtx 8", "gtx 9", "quadro m", "tesla m", "gtx titan"})); // enable for all Nvidia GPUs with driver>=520.00 except Kepler and Maxwell
152+
patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||((intel==8.0f)&&(memory>4096)); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
153+
patch_legacy_gpu_fma = patch_legacy_gpu_fma||arm>0.0f; // enable for all ARM GPUs
154+
if(patch_nvidia_fp16) is_fp16_capable = 2u;
152155
}
153156
inline Device_Info() {}; // default constructor
154157
};
@@ -176,13 +179,8 @@ inline void print_device_info(const Device_Info& d) { // print OpenCL device inf
176179
println("|----------------'------------------------------------------------------------|");
177180
}
178181
inline vector<Device_Info> get_devices(const bool print_info=true) { // returns a vector of all available OpenCL devices
179-
#if defined(_WIN32)
180-
(void)_putenv((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
181-
(void)_putenv((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
182-
#elif defined(__linux__)
183-
(void) putenv((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
184-
(void) putenv((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
185-
#endif // Linux
182+
set_environment_variable((char*)"GPU_SINGLE_ALLOC_PERCENT=100"); // fix maximum buffer allocation size limit for AMD GPUs
183+
set_environment_variable((char*)"CL_CONFIG_CPU_FORCE_MAX_MEM_ALLOC_SIZE=17179869183GB"); // fix maximum buffer allocation size limit in Intel CPU Runtime for OpenCL, 2^34-1 is max non-overflowing value
186184
vector<Device_Info> devices; // get all devices of all platforms
187185
vector<cl::Platform> cl_platforms; // get all platforms (drivers)
188186
cl::Platform::get(&cl_platforms);
@@ -248,17 +246,18 @@ class Device {
248246
cl::CommandQueue cl_queue;
249247
bool exists = false;
250248
inline string enable_device_capabilities() const { return // enable FP64/FP16 capabilities if available
251-
"\n #define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
252-
"\n #ifdef cl_khr_fp64"
253-
"\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
254-
"\n #endif"
255-
"\n #ifdef cl_khr_fp16"
256-
"\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
257-
"\n #endif"
258-
"\n #ifdef cl_khr_int64_base_atomics"
259-
"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
260-
"\n #endif"
261-
+(info.legacy_gpu_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // some old GPUs have terrible fma performance, so replace with a*b+c
249+
string(info.patch_nvidia_fp16 ? "\n #define cl_khr_fp16" : "")+ // Nvidia Pascal and newer GPUs with driver>=520.00 don't report cl_khr_fp16, but do support basic FP16 arithmetic
250+
string(info.patch_legacy_gpu_fma ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "")+ // some old GPUs have terrible fma performance, so replace with a*b+c
251+
"\n #define def_workgroup_size "+to_string(WORKGROUP_SIZE)+"u"
252+
"\n #ifdef cl_khr_fp64"
253+
"\n #pragma OPENCL EXTENSION cl_khr_fp64 : enable" // make sure cl_khr_fp64 extension is enabled
254+
"\n #endif"
255+
"\n #ifdef cl_khr_fp16"
256+
"\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable" // make sure cl_khr_fp16 extension is enabled
257+
"\n #endif"
258+
"\n #ifdef cl_khr_int64_base_atomics"
259+
"\n #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
260+
"\n #endif"
262261
;}
263262
public:
264263
Device_Info info;
@@ -270,7 +269,7 @@ class Device {
270269
const string kernel_code = enable_device_capabilities()+"\n"+opencl_c_code;
271270
cl_source.push_back({ kernel_code.c_str(), kernel_code.length() });
272271
this->cl_program = cl::Program(info.cl_context, cl_source);
273-
const string build_options = string("-cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable")+(info.intel_gpu_above_4gb_patch ? " -cl-intel-greater-than-4GB-buffer-required" : "");
272+
const string build_options = string("-cl-finite-math-only -cl-no-signed-zeros -cl-mad-enable")+(info.patch_intel_gpu_above_4gb ? " -cl-intel-greater-than-4GB-buffer-required" : "");
274273
#ifndef LOG
275274
int error = cl_program.build({ info.cl_device }, (build_options+" -w").c_str()); // compile OpenCL C code, disable warnings
276275
if(error) print_warning(cl_program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(info.cl_device)); // print build log
@@ -320,7 +319,7 @@ template<typename T> class Memory {
320319
device.info.memory_used += (uint)(capacity()/1048576ull); // track device memory usage
321320
if(device.info.memory_used>device.info.memory) print_error("Device \""+device.info.name+"\" does not have enough memory. Allocating another "+to_string((uint)(capacity()/1048576ull))+" MB would use a total of "+to_string(device.info.memory_used)+" MB / "+to_string(device.info.memory)+" MB.");
322321
int error = 0;
323-
device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.intel_gpu_above_4gb_patch<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
322+
device_buffer = cl::Buffer(device.get_cl_context(), CL_MEM_READ_WRITE|((int)device.info.patch_intel_gpu_above_4gb<<23), capacity(), nullptr, &error); // for Intel GPUs, set flag CL_MEM_ALLOW_UNRESTRICTED_SIZE_INTEL = (1<<23)
324323
if(error==-61) print_error("Memory size is too large at "+to_string((uint)(capacity()/1048576ull))+" MB. Device \""+device.info.name+"\" accepts a maximum buffer size of "+to_string(device.info.max_global_buffer)+" MB.");
325324
else if(error) print_error("Device buffer allocation failed with error code "+to_string(error)+".");
326325
device_buffer_exists = true;

src/utilities.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,7 @@ inline string replace_regex(const string& s, const string& from, const string& t
587587
inline bool is_number(const string& s) {
588588
return equals_regex(s, "\\d+(u|l|ul|ll|ull)?")||equals_regex(s, "0x(\\d|[a-fA-F])+(u|l|ul|ll|ull)?")||equals_regex(s, "0b[01]+(u|l|ul|ll|ull)?")||equals_regex(s, "(((\\d+\\.?\\d*|\\.\\d+)([eE][+-]?\\d+[fF]?)?)|(\\d+\\.\\d*|\\.\\d+)[fF]?)");
589589
}
590-
inline void print_message(const string& message, const string& keyword="", const int colons=true) { // print formatted message
590+
inline void print_message(const string& message, const string& keyword="", const int keyword_color=-1, const int colons=true) { // print formatted message
591591
const uint k=length(keyword)+2u, w=CONSOLE_WIDTH-4u-k;
592592
string p=colons?": ":" ", f="";
593593
for(uint j=0u; j<k; j++) f += " ";
@@ -714,6 +714,14 @@ inline void print_info(const string& s) { // print info message
714714
}
715715
#endif // UTILITIES_REGEX
716716

717+
inline void set_environment_variable(char* s) { // usage: set_environment_variable((char*)"VARIABLE=VALUE");
718+
#if defined(_WIN32)
719+
(void)_putenv(s);
720+
#elif defined(__linux__)
721+
(void) putenv(s);
722+
#endif // Linux
723+
}
724+
717725
#ifdef UTILITIES_FILE
718726
#include <fstream> // read/write files
719727
#ifndef UTILITIES_NO_CPP17

0 commit comments

Comments
 (0)