|
3 | 3 | #define WORKGROUP_SIZE 64 // needs to be 64 to fully use AMD GPUs
|
4 | 4 | //#define PTX
|
5 | 5 | //#define LOG
|
6 |
| -//#define USE_OPENCL_1_1 |
7 | 6 |
|
8 |
| -#ifdef USE_OPENCL_1_1 |
9 |
| -#define CL_USE_DEPRECATED_OPENCL_1_1_APIS |
10 |
| -#endif // USE_OPENCL_1_1 |
11 | 7 | #ifndef _WIN32
|
12 | 8 | #pragma GCC diagnostic ignored "-Wignored-attributes" // ignore compiler warnings for CL/cl.hpp with g++
|
13 | 9 | #endif // _WIN32
|
@@ -54,7 +50,7 @@ struct Device_Info {
|
54 | 50 | const bool nvidia_192_cores_per_cu = contains_any(to_lower(name), {" 6", " 7", "ro k", "la k"}) || (clock_frequency<1000u&&contains(to_lower(name), "titan")); // identify Kepler GPUs
|
55 | 51 | const bool nvidia_64_cores_per_cu = contains_any(to_lower(name), {"p100", "v100", "a100", "a30", " 16", " 20", "titan v", "titan rtx", "ro t", "la t", "ro rtx"}) && !contains(to_lower(name), "rtx a"); // identify P100, Volta, Turing, A100, A30
|
56 | 52 | const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
|
57 |
| - const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100) |
| 53 | + const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere, Hopper, Ada) or 64 cores/CU (P100, Volta, Turing, A100, A30) |
58 | 54 | const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
|
59 | 55 | const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
|
60 | 56 | const float apple = (float)(contains(to_lower(vendor), "apple"))*(128.0f); // Apple ARM GPUs usually have 128 cores/CU
|
@@ -170,7 +166,7 @@ class Device {
|
170 | 166 | write_file("bin/kernel.log", log); // save build log
|
171 | 167 | if((uint)log.length()>2u) print_warning(log); // print build log
|
172 | 168 | #endif // LOG
|
173 |
| - if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp (\"#define LOG\" might help). If your GPU is old, try uncommenting \"#define USE_OPENCL_1_1\"."); |
| 169 | + if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp."); |
174 | 170 | else print_info("OpenCL C code successfully compiled.");
|
175 | 171 | #ifdef PTX // generate assembly (ptx) file for OpenCL code
|
176 | 172 | write_file("bin/kernel.ptx", cl_program.getInfo<CL_PROGRAM_BINARIES>()[0]); // save binary (ptx file)
|
|
0 commit comments