ProjectPhysX
diff --git a/‎DOCUMENTATION.md
Lines changed: 10 additions & 17 deletions b/‎DOCUMENTATION.md
Lines changed: 10 additions & 17 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 0 deletions b/‎README.md
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/defines.hpp
Lines changed: 3 additions & 3 deletions b/‎src/defines.hpp
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/info.cpp
Lines changed: 1 addition & 1 deletion b/‎src/info.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/kernel.cpp
Lines changed: 73 additions & 31 deletions b/‎src/kernel.cpp
Lines changed: 73 additions & 31 deletions
@@ -386,29 +386,22 @@
 
 ### Lift/Drag Forces
 - Enable (uncomment) the [`FORCE_FIELD`](src/defines.hpp) extension. This extension allows computing boundary forces on every solid cell (`TYPE_S`) individually, as well as placing an individual volume force on every fluid cell (not used here).
-- In the [`main_setup()`](src/setup.cpp) function's main simulation loop, alternatingly call:
+- In the [`main_setup()`](src/setup.cpp) function, voxelize the mesh with a unique flag combination, such as `(TYPE_S|TYPE_X)` or `(TYPE_S|TYPE_Y)` or `(TYPE_S|TYPE_X|TYPE_Y)`, to distinguish it from all other `(TYPE_S)` cells that might be needed to define other geometry, and compute its center of mass:
   ```c
-  lbm.run(lbm_dt); // run lbm_dt LBM time steps
-  lbm.calculate_force_on_boundaries(); // compute boundary forces on GPU on all solid cells (TYPE_S)
+  lbm.voxelize_mesh_on_device(mesh, TYPE_S|TYPE_X); // voxelize mesh with unique flag combination
+  const float3 lbm_com = lbm.object_center_of_mass(TYPE_S|TYPE_X); // object center of mass in LBM unit coordinates
   ```
-  The latter computes the boundary forces on the GPU into the `lbm.F` field in VRAM.
-- To copy `lbm.F` from GPU VRAM to CPU RAM, call:
+- To sum over all the individual boundary cells that belong to the object, in the [`main_setup()`](src/setup.cpp) function's main simulation loop call:
   ```c
-  lbm.F.read_from_device();
+  const float3 lbm_force = lbm.object_force(TYPE_S|TYPE_X); // force on object
+  const float3 lbm_torque = lbm.object_torque(lbm_com, TYPE_S|TYPE_X); // torque on object around lbm_com rotation point
   ```
-  You can then access the boundary forces at each individual cell with:
+  These functions sum over all cells marked `(TYPE_S|TYPE_X)` that belong to the object. The summation is GPU-accelerated and runs entirely in VRAM; only the result is copied to CPU RAM.
+- You may also access the force field on individual grid cells. Note that copying the entire `lbm.F` force field from GPU VRAM to CPU RAM is slow:
   ```c
-  float lbm_force_x_n = lbm.F.x[lbm.index(x, y, z)];
+  lbm.F.read_from_device(); // copy entire force field from GPU VRAM to CPU RAM (slow)
+  const float lbm_force_x_n = lbm.F.x[lbm.index(x, y, z)]; // access force at one particular grid cell with integer coordinates x, y, z
   ```
-- To sum over all the individual boundary cells that belong to the body, to get the total force on the body, first voxelize the body with
-  ```c
-  lbm.voxelize_mesh_on_device(mesh, TYPE_S|TYPE_X);
-  ```
-  with the additional `TYPE_X` flagging, and then call
-  ```c
-  const float3 lbm_force = lbm.calculate_force_on_object(TYPE_S|TYPE_X);
-  ```
-  to sum over all cells marked `TYPE_S|TYPE_X` that belong to the body. You can also use `TYPE_Y` for this.
 - Finally, [convert from LBM to SI units](#unit-conversion) with
   ```c
   const float si_force_x = units.si_F(lbm_force.x);
 
@@ -213,6 +213,12 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - added workaround for compiler bug in Intel CPU Runtime for OpenCL that causes Q-criterion isosurface rendering corruption
   - fixed TFlops estimate for Intel Battlemage GPUs
   - fixed wrong device name reporting for AMD GPUs
+- [v3.2](https://github.com/ProjectPhysX/FluidX3D/releases/tag/v3.2) (09.03.2025) [changes](https://github.com/ProjectPhysX/FluidX3D/compare/v3.1...v3.2) (fast force/torque summation)
+  - implemented GPU-accelerated force/torque summation (~20x faster than CPU-multithreaded implementation before)
+  - simplified calculating object force/torque in setups
+  - improved coloring in `VIS_FIELD`/`ray_grid_traverse_sum()`
+  - updated OpenCL-Wrapper, which now compiles OpenCL C code with `-cl-std=CL3.0` if available
+  - fixed compiling on macOS with new OpenCL headers
 
 </details>
 
 
@@ -16,7 +16,7 @@
 #define BENCHMARK // disable all extensions and setups and run benchmark setup instead
 
 //#define VOLUME_FORCE // enables global force per volume in one direction (equivalent to a pressure gradient); specified in the LBM class constructor; the force can be changed on-the-fly between time steps at no performance cost
-//#define FORCE_FIELD // enables computing the forces on solid boundaries with lbm.calculate_force_on_boundaries(); and enables setting the force for each lattice point independently (enable VOLUME_FORCE too); allocates an extra 12 Bytes/cell
+//#define FORCE_FIELD // enables computing the forces on solid boundaries with lbm.update_force_field(); and enables setting the force for each lattice point independently (enable VOLUME_FORCE too); allocates an extra 12 Bytes/cell
 //#define EQUILIBRIUM_BOUNDARIES // enables fixing the velocity/density by marking cells with TYPE_E; can be used for inflow/outflow; does not reflect shock waves
 //#define MOVING_BOUNDARIES // enables moving solids: set solid cells to TYPE_S and set their velocity u unequal to zero
 //#define SURFACE // enables free surface LBM: mark fluid cells with TYPE_F; at initialization the TYPE_I interface and TYPE_G gas domains will automatically be completed; allocates an extra 12 Bytes/cell
@@ -32,9 +32,9 @@
 #define GRAPHICS_FRAME_HEIGHT 1080 // set frame height if only GRAPHICS is enabled
 #define GRAPHICS_BACKGROUND_COLOR 0x000000 // set background color; black background (default) = 0x000000, white background = 0xFFFFFF
 #define GRAPHICS_U_MAX 0.18f // maximum velocity for velocity coloring in units of LBM lattice speed of sound (c=1/sqrt(3)) (default: 0.18f)
-#define GRAPHICS_RHO_DELTA 0.01f // coloring range for density rho will be [1.0f-GRAPHICS_RHO_DELTA, 1.0f+GRAPHICS_RHO_DELTA] (default: 0.01f)
+#define GRAPHICS_RHO_DELTA 0.001f // coloring range for density rho will be [1.0f-GRAPHICS_RHO_DELTA, 1.0f+GRAPHICS_RHO_DELTA] (default: 0.001f)
 #define GRAPHICS_T_DELTA 1.0f // coloring range for temperature T will be [1.0f-GRAPHICS_T_DELTA, 1.0f+GRAPHICS_T_DELTA] (default: 1.0f)
-#define GRAPHICS_F_MAX 0.001f // maximum force in LBM units for visualization of forces on solid boundaries if VOLUME_FORCE is enabled and lbm.calculate_force_on_boundaries(); is called (default: 0.001f)
+#define GRAPHICS_F_MAX 0.001f // maximum force in LBM units for visualization of forces on solid boundaries if FORCE_FIELD is enabled and lbm.update_force_field(); is called (default: 0.001f)
 #define GRAPHICS_Q_CRITERION 0.0001f // Q-criterion value for Q-criterion isosurface visualization (default: 0.0001f)
 #define GRAPHICS_STREAMLINE_SPARSE 8 // set how many streamlines there are every x lattice points
 #define GRAPHICS_STREAMLINE_LENGTH 128 // set maximum length of streamlines
 
@@ -42,7 +42,7 @@ void Info::print_logo() const {
 	print("|                                  ");                 print("\\  \\ /  /", c);                print("                                  |\n");
 	print("|                                   ");                 print("\\  '  /", c);                 print("                                   |\n");
 	print("|                                    ");                 print("\\   /", c);                 print("                                    |\n");
-	print("|                                     ");                 print("\\ /", c);                 print("                FluidX3D Version 3.1 |\n");
+	print("|                                     ");                 print("\\ /", c);                 print("                FluidX3D Version 3.2 |\n");
 	print("|                                      ");                 print( "'", c);                 print("     Copyright (c) Dr. Moritz Lehmann |\n");
 	print("|-----------------------------------------------------------------------------|\n");
 }
 
@@ -14,13 +14,13 @@ string opencl_c_container() { return R( // ########################## begin of O
 )+R(float angle(const float3 v1, const float3 v2) {
 	return acos(dot(v1, v2)/(length(v1)*length(v2)));
 }
-)+R(float fast_rsqrt(const float x) { // slightly fastwer approximation
+)+R(float fast_rsqrt(const float x) { // slightly faster approximation
 	return as_float(0x5F37642F-(as_int(x)>>1));
 }
-)+R(float fast_asin(const float x) { // slightly fastwer approximation
+)+R(float fast_asin(const float x) { // slightly faster approximation
 	return x*fma(0.5702f, sq(sq(sq(x))), 1.0f); // 0.5707964f = (pi-2)/2
 }
-)+R(float fast_acos(const float x) { // slightly fastwer approximation
+)+R(float fast_acos(const float x) { // slightly faster approximation
 	return fma(fma(-0.5702f, sq(sq(sq(x))), -1.0f), x, 1.5712963f); // 0.5707964f = (pi-2)/2
 }
 )+R(void swap(float* x, float* y) {
@@ -52,22 +52,6 @@ string opencl_c_container() { return R( // ########################## begin of O
 	const float x1=p.x, y1=p.y, z1=p.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
 	return (x0*y0*z0)*v[0]+(x1*y0*z0)*v[1]+(x1*y0*z1)*v[2]+(x0*y0*z1)*v[3]+(x0*y1*z0)*v[4]+(x1*y1*z0)*v[5]+(x1*y1*z1)*v[6]+(x0*y1*z1)*v[7]; // perform trilinear interpolation
 }
-//bool workgroup_any(const bool condition) { // returns true if any thread within the workgroup enters true
-//	volatile local uint workgroup_condition; // does not work on AMD GPUs (error: non-kernel function variable cannot be declared in local address space)
-//	workgroup_condition = 0u;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	atomic_or(&workgroup_condition, (uint)condition);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	return (bool)workgroup_condition;
-//}
-//bool workgroup_all(const bool condition) { // returns true if all threads within the workgroup enter true
-//	volatile local uint workgroup_condition; // does not work on AMD GPUs (error: non-kernel function variable cannot be declared in local address space)
-//	workgroup_condition = 1u;
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	atomic_and(&workgroup_condition, (uint)condition);
-//	barrier(CLK_LOCAL_MEM_FENCE);
-//	return (bool)workgroup_condition;
-//}
 
 
 
@@ -1916,9 +1900,9 @@ string opencl_c_container() { return R( // ########################## begin of O
 } // update_fields()
 
 )+"#ifdef FORCE_FIELD"+R(
-)+R(kernel void calculate_force_on_boundaries(const global fpxx* fi, const global uchar* flags, const ulong t, global float* F) { // calculate force from the fluid on solid boundaries from fi directly
+)+R(kernel void update_force_field(const global fpxx* fi, const global uchar* flags, const ulong t, global float* F) { // calculate force from the fluid on solid boundaries from fi directly
 	const uxx n = get_global_id(0); // n = x+(y+z*Ny)*Nx
-	if(n>=(uxx)def_N||is_halo(n)) return; // don't execute calculate_force_on_boundaries() on halo
+	if(n>=(uxx)def_N||is_halo(n)) return; // don't execute update_force_field() on halo
 	if((flags[n]&TYPE_BO)!=TYPE_S) return; // only continue for solid boundary cells
 	uxx j[def_velocity_set]; // neighbor indices
 	neighbors(n, j); // calculate neighbor indices
@@ -1929,19 +1913,15 @@ string opencl_c_container() { return R( // ########################## begin of O
 	F[                 n] = 2.0f*fx*Fb; // 2 times because fi are reflected on solid boundary cells (bounced-back)
 	F[    def_N+(ulong)n] = 2.0f*fy*Fb;
 	F[2ul*def_N+(ulong)n] = 2.0f*fz*Fb;
-} // calculate_force_on_boundaries()
+} // update_force_field()
 )+R(kernel void reset_force_field(global float* F) { // reset force field
 	const uxx n = get_global_id(0); // n = x+(y+z*Ny)*Nx
 	if(n>=(uxx)def_N) return; // execute reset_force_field() also on halo
 	F[                 n] = 0.0f;
 	F[    def_N+(ulong)n] = 0.0f;
 	F[2ul*def_N+(ulong)n] = 0.0f;
 } // reset_force_field()
-)+"#endif"+R( // FORCE_FIELD
-
-)+"#ifdef PARTICLES"+R(
-)+"#ifdef FORCE_FIELD"+R(
-void atomic_add_f(volatile global float* addr, const float val) {
+)+R(void atomic_add_f(volatile global float* addr, const float val) {
 )+"#if cl_nv_compute_capability>=20"+R( // use hardware-supported atomic addition on Nvidia GPUs with inline PTX assembly
 	float ret;)+"asm volatile(\"atom.global.add.f32\t%0,[%1],%2;\":\"=f\"(ret):\"l\"(addr),\"f\"(val):\"memory\");"+R(
 )+"#elif defined(__opencl_c_ext_fp32_global_atomic_add)"+R( // use hardware-supported atomic addition on some Intel GPUs
@@ -1952,6 +1932,68 @@ void atomic_add_f(volatile global float* addr, const float val) {
 	float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
 )+"#endif"+R(
 }
+)+R(kernel void object_center_of_mass(const global uchar* flags, const uchar flag_marker, volatile global float* object_sum) { // accumulate position sum into object_sum[0..2] and cell count (as uint) into object_sum[3] for all cells with flags[n]==flag_marker
+	const uxx n = get_global_id(0); // n = x+(y+z*Ny)*Nx
+	const uint lid = get_local_id(0); // local memory reduction of cl_workgroup_size:1
+	local float3 cache[cl_workgroup_size]; // per-work-item position contribution
+	local uint cells[cl_workgroup_size]; // per-work-item cell count contribution (0 or 1)
+	cache[lid] = n<(uxx)def_N&&flags[n]==flag_marker ? position(coordinates(n)) : (float3)(0.0f, 0.0f, 0.0f); // work-items outside the domain or not matching flag_marker contribute zero
+	cells[lid] = (uint)(n<(uxx)def_N&&flags[n]==flag_marker);
+	barrier(CLK_LOCAL_MEM_FENCE); // cache[] and cells[] are in local memory, so a local fence is required before other work-items read them
+	for(uint s=1u; s<cl_workgroup_size; s*=2u) { // pairwise tree reduction; requires cl_workgroup_size to be a power of 2, otherwise lid+s runs out of bounds
+		if(lid%(2u*s)==0u) {
+			cache[lid] += cache[lid+s];
+			cells[lid] += cells[lid+s];
+		}
+		barrier(CLK_LOCAL_MEM_FENCE); // reached by all work-items, outside of the divergent branch
+	}
+	const uint local_cells = cells[0]; // number of matching cells in this workgroup
+	if(lid==0u&&local_cells>0u) { // global memory reduction with atomic addition of local_sum, skipped for workgroups without matching cells
+		const float3 local_sum = cache[0];
+		atomic_add_f(&object_sum[0], local_sum.x);
+		atomic_add_f(&object_sum[1], local_sum.y);
+		atomic_add_f(&object_sum[2], local_sum.z);
+		atomic_add((volatile global uint*)&object_sum[3], local_cells); // 4th element is reinterpreted as an integer counter
+	}
+} // object_center_of_mass()
+)+R(kernel void object_force(const global float* F, const global uchar* flags, const uchar flag_marker, volatile global float* object_sum) { // accumulate the force vectors of all cells with flags[n]==flag_marker into object_sum[0..2]
+	const uxx n = get_global_id(0); // n = x+(y+z*Ny)*Nx
+	const uint lid = get_local_id(0); // local memory reduction of cl_workgroup_size:1
+	local float3 cache[cl_workgroup_size]; // per-work-item force contribution
+	cache[lid] = n<(uxx)def_N&&flags[n]==flag_marker ? load3(n, F) : (float3)(0.0f, 0.0f, 0.0f); // work-items outside the domain or not matching flag_marker contribute zero
+	barrier(CLK_LOCAL_MEM_FENCE); // cache[] is in local memory, so a local fence is required before other work-items read it
+	for(uint s=1u; s<cl_workgroup_size; s*=2u) { // pairwise tree reduction; requires cl_workgroup_size to be a power of 2, otherwise lid+s runs out of bounds
+		if(lid%(2u*s)==0u) cache[lid] += cache[lid+s];
+		barrier(CLK_LOCAL_MEM_FENCE); // reached by all work-items, outside of the divergent branch
+	}
+	if(lid==0u) { // global memory reduction with atomic addition of local_sum
+		const float3 local_sum = cache[0];
+		if(local_sum.x!=0.0f) atomic_add_f(&object_sum[0], local_sum.x); // skip atomics for components that are exactly zero
+		if(local_sum.y!=0.0f) atomic_add_f(&object_sum[1], local_sum.y);
+		if(local_sum.z!=0.0f) atomic_add_f(&object_sum[2], local_sum.z);
+	}
+} // object_force()
+)+R(kernel void object_torque(const global float* F, const global uchar* flags, const uchar flag_marker, const float cx, const float cy, const float cz, volatile global float* object_sum) { // accumulate cross(r-c, F) with c=(cx, cy, cz) of all cells with flags[n]==flag_marker into object_sum[0..2]
+	const uxx n = get_global_id(0); // n = x+(y+z*Ny)*Nx
+	const uint lid = get_local_id(0); // local memory reduction of cl_workgroup_size:1
+	local float3 cache[cl_workgroup_size]; // per-work-item torque contribution
+	cache[lid] = n<(uxx)def_N&&flags[n]==flag_marker ? cross(position(coordinates(n))-(float3)(cx, cy, cz), load3(n, F)) : (float3)(0.0f, 0.0f, 0.0f); // work-items outside the domain or not matching flag_marker contribute zero
+	barrier(CLK_LOCAL_MEM_FENCE); // cache[] is in local memory, so a local fence is required before other work-items read it
+	for(uint s=1u; s<cl_workgroup_size; s*=2u) { // pairwise tree reduction; requires cl_workgroup_size to be a power of 2, otherwise lid+s runs out of bounds
+		if(lid%(2u*s)==0u) cache[lid] += cache[lid+s];
+		barrier(CLK_LOCAL_MEM_FENCE); // reached by all work-items, outside of the divergent branch
+	}
+	if(lid==0u) { // global memory reduction with atomic addition of local_sum
+		const float3 local_sum = cache[0];
+		if(local_sum.x!=0.0f) atomic_add_f(&object_sum[0], local_sum.x); // skip atomics for components that are exactly zero
+		if(local_sum.y!=0.0f) atomic_add_f(&object_sum[1], local_sum.y);
+		if(local_sum.z!=0.0f) atomic_add_f(&object_sum[2], local_sum.z);
+	}
+} // object_torque()
+)+"#endif"+R( // FORCE_FIELD
+
+)+"#ifdef PARTICLES"+R(
+)+"#ifdef FORCE_FIELD"+R(
 )+R(void spread_force(volatile global float* F, const float3 p, const float3 Fn) {
 	const float xa=p.x-0.5f+1.5f*(float)def_Nx, ya=p.y-0.5f+1.5f*(float)def_Ny, za=p.z-0.5f+1.5f*(float)def_Nz; // subtract lattice offsets
 	const uint xb=(uint)xa, yb=(uint)ya, zb=(uint)za; // integer casting to find bottom left corner
@@ -2545,14 +2587,14 @@ void atomic_add_f(volatile global float* addr, const float val) {
 				const uxx n = index((uint3)((uint)clamp(xyz.x, 0, (int)Nx-1), (uint)clamp(xyz.y, 0, (int)Ny-1), (uint)clamp(xyz.z, 0, (int)Nz-1)));
 				if(!(flags[n]&(TYPE_S|TYPE_E|TYPE_G))) {
 					const float un = length(load3(n, u));
-					const float weight = sq(un)+sq(un-0.5f/def_scale_u);
+					const float weight = fmin(un, fabs(un-0.5f/def_scale_u));
 					sum = fma(weight, un, sum);
 					traversed_cells_weighted += weight;
 				}
 				traversed_cells++;
 			}
 			color = colorscale_rainbow(def_scale_u*sum/traversed_cells_weighted);
-			traversed_cells_weighted *= sq(def_scale_u);
+			traversed_cells_weighted *= 2.0f*def_scale_u;
 			break;
 		case 1: // coloring by density
 			while(traversed_cells<Nx+Ny+Nz) { // limit number of traversed cells to space diagonal
@@ -2562,14 +2604,14 @@ void atomic_add_f(volatile global float* addr, const float val) {
 				const uxx n = index((uint3)((uint)clamp(xyz.x, 0, (int)Nx-1), (uint)clamp(xyz.y, 0, (int)Ny-1), (uint)clamp(xyz.z, 0, (int)Nz-1)));
 				if(!(flags[n]&(TYPE_S|TYPE_E|TYPE_G))) {
 					const float rhon = rho[n];
-					const float weight = sq(rhon-1.0f);
+					const float weight = fabs(rhon-1.0f);
 					sum = fma(weight, rhon, sum);
 					traversed_cells_weighted += weight;
 				}
 				traversed_cells++;
 			}
 			color = colorscale_twocolor(0.5f+def_scale_rho*(sum/traversed_cells_weighted-1.0f));
-			traversed_cells_weighted *= sq(def_scale_rho);
+			traversed_cells_weighted *= def_scale_rho;
 			break;
 )+"#ifdef TEMPERATURE"+R(
 		case 2: // coloring by temperature
Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ void Info::print_logo() const {`
`42`	`42`	`print("\| "); print("\\ \\ / /", c); print(" \|\n");`
`43`	`43`	`print("\| "); print("\\ ' /", c); print(" \|\n");`
`44`	`44`	`print("\| "); print("\\ /", c); print(" \|\n");`
`45`		`- print("\| "); print("\\ /", c); print(" FluidX3D Version 3.1 \|\n");`
	`45`	`+ print("\| "); print("\\ /", c); print(" FluidX3D Version 3.2 \|\n");`
`46`	`46`	`print("\| "); print( "'", c); print(" Copyright (c) Dr. Moritz Lehmann \|\n");`
`47`	`47`	`print("\|-----------------------------------------------------------------------------\|\n");`
`48`	`48`	`}`