OpenCL driver bug workaround for old AMD GPUs (binary number literals don't work there, so the flag defines now use hexadecimal literals); FORCE_FIELD and VOLUME_FORCE can now be used independently; updated OpenCL-Wrapper

ProjectPhysX · ProjectPhysX · commit 9a3f593a5902 · 2022-11-08T18:37:11.000+01:00
diff --git a/src/defines.hpp b/src/defines.hpp
@@ -16,7 +16,7 @@
 #define BENCHMARK // disable all extensions and setups and run benchmark setup instead
 
 //#define VOLUME_FORCE // enables global force per volume in one direction, specified in the LBM class constructor; the force can be changed on-the-fly between time steps at no performance cost
-//#define FORCE_FIELD // enables a force per volume for each lattice point independently; allocates an extra 12 Bytes/node; enables computing the forces from the fluid on solid boundaries with lbm.calculate_force_on_boundaries();
+//#define FORCE_FIELD // enables computing the forces on solid boundaries with lbm.calculate_force_on_boundaries(); to also set the force for each lattice point independently, enable VOLUME_FORCE as well; allocates an extra 12 Bytes/node
 //#define MOVING_BOUNDARIES // enables moving solids: set solid nodes to TYPE_S and set their velocity u unequal to zero
 //#define EQUILIBRIUM_BOUNDARIES // enables fixing the velocity/density by marking nodes with TYPE_E; can be used for inflow/outflow; does not reflect shock waves
 //#define SURFACE // enables free surface LBM: mark fluid nodes with TYPE_F; at initialization the TYPE_I interface and TYPE_G gas domains will automatically be completed; allocates an extra 12 Bytes/node
@@ -69,10 +69,6 @@
 #undef GRAPHICS
 #endif // BENCHMARK
 
-#ifdef FORCE_FIELD
-#define VOLUME_FORCE
-#endif // FORCE_FIELD
-
 #ifdef SURFACE // (rho, u) need to be updated exactly every LBM step
 #define UPDATE_FIELDS // update (rho, u, T) in every LBM step
 #endif // SURFACE
diff --git a/src/lbm.cpp b/src/lbm.cpp
@@ -490,21 +490,21 @@ string LBM::device_defines() const { return
 	"\n	#define TRT"
 #endif // TRT
 
-	"\n	#define TYPE_S 0b00000001" // (stationary or moving) solid boundary
-	"\n	#define TYPE_E 0b00000010" // equilibrium boundary (inflow/outflow)
-	"\n	#define TYPE_T 0b00000100" // temperature boundary
-	"\n	#define TYPE_F 0b00001000" // fluid
-	"\n	#define TYPE_I 0b00010000" // interface
-	"\n	#define TYPE_G 0b00100000" // gas
-	"\n	#define TYPE_X 0b01000000" // reserved type X
-	"\n	#define TYPE_Y 0b10000000" // reserved type Y
-
-	"\n	#define TYPE_MS 0b00000011" // node next to moving solid boundary
-	"\n	#define TYPE_BO 0b00000011" // any flag bit used for boundaries (temperature excluded)
-	"\n	#define TYPE_IF 0b00011000" // change from interface to fluid
-	"\n	#define TYPE_IG 0b00110000" // change from interface to gas
-	"\n	#define TYPE_GI 0b00111000" // change from gas to interface
-	"\n	#define TYPE_SU 0b00111000" // any flag bit used for SURFACE
+	"\n	#define TYPE_S 0x01" // 0b00000001 // (stationary or moving) solid boundary
+	"\n	#define TYPE_E 0x02" // 0b00000010 // equilibrium boundary (inflow/outflow)
+	"\n	#define TYPE_T 0x04" // 0b00000100 // temperature boundary
+	"\n	#define TYPE_F 0x08" // 0b00001000 // fluid
+	"\n	#define TYPE_I 0x10" // 0b00010000 // interface
+	"\n	#define TYPE_G 0x20" // 0b00100000 // gas
+	"\n	#define TYPE_X 0x40" // 0b01000000 // reserved type X
+	"\n	#define TYPE_Y 0x80" // 0b10000000 // reserved type Y
+
+	"\n	#define TYPE_MS 0x03" // 0b00000011 // node next to moving solid boundary
+	"\n	#define TYPE_BO 0x03" // 0b00000011 // any flag bit used for boundaries (temperature excluded)
+	"\n	#define TYPE_IF 0x18" // 0b00011000 // change from interface to fluid
+	"\n	#define TYPE_IG 0x30" // 0b00110000 // change from interface to gas
+	"\n	#define TYPE_GI 0x38" // 0b00111000 // change from gas to interface
+	"\n	#define TYPE_SU 0x38" // 0b00111000 // any flag bit used for SURFACE
 
 #if defined(FP16S)
 	"\n	#define fpxx half" // switchable data type (scaled IEEE-754 16-bit floating-point format: 1-5-10, exp-30, +-1.99902344, +-1.86446416E-9, +-1.81898936E-12, 3.311 digits)
diff --git a/src/opencl.hpp b/src/opencl.hpp
@@ -3,11 +3,7 @@
 #define WORKGROUP_SIZE 64 // needs to be 64 to fully use AMD GPUs
 //#define PTX
 //#define LOG
-//#define USE_OPENCL_1_1
 
-#ifdef USE_OPENCL_1_1
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#endif // USE_OPENCL_1_1
 #ifndef _WIN32
 #pragma GCC diagnostic ignored "-Wignored-attributes" // ignore compiler warnings for CL/cl.hpp with g++
 #endif // _WIN32
@@ -54,7 +50,7 @@ struct Device_Info {
 		const bool nvidia_192_cores_per_cu = contains_any(to_lower(name), {" 6", " 7", "ro k", "la k"}) || (clock_frequency<1000u&&contains(to_lower(name), "titan")); // identify Kepler GPUs
 		const bool nvidia_64_cores_per_cu = contains_any(to_lower(name), {"p100", "v100", "a100", "a30", " 16", " 20", "titan v", "titan rtx", "ro t", "la t", "ro rtx"}) && !contains(to_lower(name), "rtx a"); // identify P100, Volta, Turing, A100, A30
 		const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
-		const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100)
+		const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere, Hopper, Ada) or 64 cores/CU (P100, Volta, Turing, A100, A30)
 		const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
 		const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
 		const float apple = (float)(contains(to_lower(vendor), "apple"))*(128.0f); // Apple ARM GPUs usually have 128 cores/CU
@@ -170,7 +166,7 @@ class Device {
 		write_file("bin/kernel.log", log); // save build log
 		if((uint)log.length()>2u) print_warning(log); // print build log
 #endif // LOG
-		if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp (\"#define LOG\" might help). If your GPU is old, try uncommenting \"#define USE_OPENCL_1_1\".");
+		if(error) print_error("OpenCL C code compilation failed with error code "+to_string(error)+". Make sure there are no errors in kernel.cpp.");
 		else print_info("OpenCL C code successfully compiled.");
 #ifdef PTX // generate assembly (ptx) file for OpenCL code
 		write_file("bin/kernel.ptx", cl_program.getInfo<CL_PROGRAM_BINARIES>()[0]); // save binary (ptx file)
diff --git a/src/setup.cpp b/src/setup.cpp
@@ -204,10 +204,10 @@
 
 /*void main_setup() { // lid-driven cavity
 	// ######################################################### define simulation box size, viscosity and volume force ############################################################################
-	const uint L = 96u;
+	const uint L = 128u;
 	const float Re = 1000.0f;
-	const float u = 0.4f;
-	LBM lbm(L, L, L, units.nu_from_Re(Re, (float)L, u));
+	const float u = 0.1f;
+	LBM lbm(L, L, L, units.nu_from_Re(Re, (float)(L-2u), u));
 	// #############################################################################################################################################################################################
 	const uint N=lbm.get_N(), Nx=lbm.get_Nx(), Ny=lbm.get_Ny(), Nz=lbm.get_Nz(); for(uint n=0u, x=0u, y=0u, z=0u; n<N; n++, lbm.coordinates(n, x, y, z)) {
 		// ########################################################################### define geometry #############################################################################################