Skip to content

Commit 81d8047

Browse files
committed
FluidX3D v3.0 upgrade
1 parent 497331f commit 81d8047

File tree

7 files changed

+118
-53
lines changed

7 files changed

+118
-53
lines changed

DOCUMENTATION.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
sudo apt update && sudo apt upgrade -y
2626
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
2727
mkdir -p ~/amdgpu
28-
wget -P ~/amdgpu https://repo.radeon.com/amdgpu-install/6.1.3/ubuntu/jammy/amdgpu-install_6.1.60103-1_all.deb
28+
wget -P ~/amdgpu https://repo.radeon.com/amdgpu-install/6.2.3/ubuntu/noble/amdgpu-install_6.2.60203-1_all.deb
2929
sudo apt install -y ~/amdgpu/amdgpu-install*.deb
3030
sudo amdgpu-install -y --usecase=graphics,rocm,opencl --opencl=rocr
3131
sudo usermod -a -G render,video $(whoami)
@@ -60,12 +60,12 @@
6060
6161
- Option 1: Download and install the [oneAPI DPC++ Compiler](https://github.com/intel/llvm/releases?q=oneAPI+DPC%2B%2B+Compiler) and [oneTBB](https://github.com/oneapi-src/oneTBB/releases) with:
6262
```bash
63-
export OCLV="2024.18.6.0.02_rel"
64-
export TBBV="2021.13.0"
63+
export OCLV="2024.18.10.0.08_rel"
64+
export TBBV="2022.0.0"
6565
sudo apt update && sudo apt upgrade -y
6666
sudo apt install -y g++ git make ocl-icd-libopencl1 ocl-icd-opencl-dev
6767
sudo mkdir -p ~/cpurt /opt/intel/oclcpuexp_${OCLV} /etc/OpenCL/vendors /etc/ld.so.conf.d
68-
sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2024-WW25/oclcpuexp-${OCLV}.tar.gz
68+
sudo wget -P ~/cpurt https://github.com/intel/llvm/releases/download/2024-WW43/oclcpuexp-${OCLV}.tar.gz
6969
sudo wget -P ~/cpurt https://github.com/oneapi-src/oneTBB/releases/download/v${TBBV}/oneapi-tbb-${TBBV}-lin.tgz
7070
sudo tar -zxvf ~/cpurt/oclcpuexp-${OCLV}.tar.gz -C /opt/intel/oclcpuexp_${OCLV}
7171
sudo tar -zxvf ~/cpurt/oneapi-tbb-${TBBV}-lin.tgz -C /opt/intel

README.md

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# FluidX3D
22

3-
The fastest and most memory efficient lattice Boltzmann CFD software, running on all GPUs via [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper"). Free for non-commercial use.
3+
The fastest and most memory efficient lattice Boltzmann CFD software, running on all GPUs and CPUs via [OpenCL](https://github.com/ProjectPhysX/OpenCL-Wrapper "OpenCL-Wrapper"). Free for non-commercial use.
44

55
<a href="https://youtu.be/-MkRBeQkLk8"><img src="https://img.youtube.com/vi/o3TPN142HxM/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/oC6U1M0Fsug"><img src="https://img.youtube.com/vi/oC6U1M0Fsug/maxresdefault.jpg" width="50%"></img></a><br>
6-
<a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/clAqgNtySow"><img src="https://img.youtube.com/vi/clAqgNtySow/maxresdefault.jpg" width="50%"></img></a>
6+
<a href="https://youtu.be/XOfXHgP4jnQ"><img src="https://img.youtube.com/vi/XOfXHgP4jnQ/maxresdefault.jpg" width="50%"></img></a><a href="https://youtu.be/K5eKxzklXDA"><img src="https://img.youtube.com/vi/K5eKxzklXDA/maxresdefault.jpg" width="50%"></img></a>
77
(click on images to show videos on YouTube)
88

99
<details><summary>Update History</summary>
@@ -193,6 +193,13 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
193193
- fixed maximum buffer allocation size limit for AMD GPUs and in Intel CPU Runtime for OpenCL
194194
- fixed wrong `Re<Re_max` info printout for 2D simulations
195195
- minor fix in `bandwidth_bytes_per_cell_device()`
196+
- [v3.0](https://github.com/ProjectPhysX/FluidX3D/releases/tag/v3.0) (16.11.2024) [changes](https://github.com/ProjectPhysX/FluidX3D/compare/v2.19...v3.0) (larger CPU/iGPU simulations)
197+
- reduced memory footprint on CPUs and iGPUs from 72 to 55 Bytes/cell (fused OpenCL host+device buffers for `rho`/`u`/`flags`), allowing 31% higher resolution in the same RAM capacity
198+
- faster atomic floating-point addition for `PARTICLES` extension, both hardware-supported and in fallback emulation
199+
- hardened `calculate_f_eq()` against bad user input for `D2Q9`
200+
- fixed velocity voxelization for overlapping geometry with different velocity
201+
- fixed Remaining Time printout during paused simulation
202+
- fixed CPU/GPU memory printout for CPU/iGPU simulations
196203

197204
</details>
198205

@@ -759,6 +766,8 @@ section Orange Pi 5 Mali-G610 MP4
759766
232 :active, 0, 232
760767
section Samsung Mali-G72 MP18 (S9+)
761768
230 :active, 0, 230
769+
section 2x EPYC 9754
770+
5179 :crit, 0, 5179
762771
section 2x EPYC 9654
763772
1814 :crit, 0, 1814
764773
section 2x EPYC 7352
@@ -767,6 +776,12 @@ section 2x EPYC 7313
767776
498 :crit, 0, 498
768777
section 2x EPYC 7302
769778
784 :crit, 0, 784
779+
section 2x 6980P
780+
7875 :done, 0, 7875
781+
section 2x 6979P
782+
8135 :done, 0, 8135
783+
section 2x Platinum 8592+
784+
3135 :done, 0, 3135
770785
section 2x CPU Max 9480
771786
2037 :done, 0, 2037
772787
section 2x Platinum 8480+
@@ -993,10 +1008,14 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
9931008
| 🟡&nbsp;Mali-G610&nbsp;MP4 (Orange&nbsp;Pi&nbsp;5) | 0.06 | 16 | 34 | 130 (58%) | 232 (52%) | 93 (21%) |
9941009
| 🟡&nbsp;Mali-G72&nbsp;MP18 (Samsung&nbsp;S9+) | 0.24 | 4 | 29 | 110 (59%) | 230 (62%) | 21 ( 6%) |
9951010
| | | | | | | |
1011+
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9754 | 50.79 | 3072 | 922 | 3276 (54%) | 5077 (42%) | 5179 (43%) |
9961012
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654 | 43.62 | 1536 | 922 | 1381 (23%) | 1814 (15%) | 1801 (15%) |
9971013
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;7352 | 3.53 | 512 | 410 | 739 (28%) | 106 ( 2%) | 412 ( 8%) |
9981014
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;7313 | 3.07 | 128 | 410 | 498 (19%) | 367 ( 7%) | 418 ( 8%) |
9991015
| 🔴&nbsp;2x&nbsp;EPYC&nbsp;7302 | 3.07 | 128 | 410 | 784 (29%) | 336 ( 6%) | 411 ( 8%) |
1016+
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;6980P | 98.30 | 6144 | 1690 | 7875 (71%) | 5112 (23%) | 5610 (26%) |
1017+
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;6979P | 92.16 | 3072 | 1690 | 8135 (74%) | 4175 (19%) | 4622 (21%) |
1018+
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;Platinum&nbsp;8592+ | 31.13 | 1024 | 717 | 3135 (67%) | 2359 (25%) | 2466 (26%) |
10001019
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;CPU&nbsp;Max&nbsp;9480 | 27.24 | 256 | 614 | 2037 (51%) | 1520 (19%) | 1464 (18%) |
10011020
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;Platinum&nbsp;8480+ | 28.67 | 512 | 614 | 2162 (54%) | 1845 (23%) | 1884 (24%) |
10021021
| 🔵&nbsp;2x&nbsp;Xeon&nbsp;Platinum&nbsp;8380 | 23.55 | 2048 | 410 | 1410 (53%) | 1159 (22%) | 1298 (24%) |

src/info.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void Info::print_logo() const {
4242
print("| "); print("\\ \\ / /", c); print(" |\n");
4343
print("| "); print("\\ ' /", c); print(" |\n");
4444
print("| "); print("\\ /", c); print(" |\n");
45-
print("| "); print("\\ /", c); print(" FluidX3D Version 2.19 |\n");
45+
print("| "); print("\\ /", c); print(" FluidX3D Version 3.0 |\n");
4646
print("| "); print( "'", c); print(" Copyright (c) Dr. Moritz Lehmann |\n");
4747
print("|-----------------------------------------------------------------------------|\n");
4848
}
@@ -61,8 +61,17 @@ void Info::print_initialize(LBM* lbm) {
6161
#else // FP32
6262
collision += " (FP32/FP32)";
6363
#endif // FP32
64-
cpu_mem_required = (uint)(lbm->get_N()*(ulong)bytes_per_cell_host()/1048576ull); // reset to get valid values for consecutive simulations
65-
gpu_mem_required = lbm->lbm_domain[0]->get_device().info.memory_used;
64+
bool all_domains_use_ram = true; // reset cpu/gpu_mem_required to get valid values for consecutive simulations
65+
for(uint d=0u; d<lbm->get_D(); d++) {
66+
all_domains_use_ram = all_domains_use_ram&&lbm->lbm_domain[d]->get_device().info.uses_ram;
67+
}
68+
if(all_domains_use_ram) {
69+
cpu_mem_required = lbm->get_D()*lbm->lbm_domain[0]->get_device().info.memory_used;
70+
gpu_mem_required = 0u;
71+
} else {
72+
cpu_mem_required = (uint)(lbm->get_N()*(ulong)bytes_per_cell_host()/1048576ull);
73+
gpu_mem_required = lbm->lbm_domain[0]->get_device().info.memory_used;
74+
}
6675
const float Re = lbm->get_Re_max();
6776
println("|-----------------.-----------------------------------------------------------|");
6877
println("| Grid Resolution | "+alignr(57u, to_string(lbm->get_Nx())+" x "+to_string(lbm->get_Ny())+" x "+to_string(lbm->get_Nz())+" = "+to_string(lbm->get_N()))+" |");

src/lbm.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1259,8 +1259,8 @@ void LBM_Domain::allocate_transfer(Device& device) { // allocate all memory for
12591259
if(Dy>1u) Amax = max(Amax, (ulong)Nz*(ulong)Nx); // Ay
12601260
if(Dz>1u) Amax = max(Amax, (ulong)Nx*(ulong)Ny); // Az
12611261

1262-
transfer_buffer_p = Memory<char>(device, Amax, max(transfers*(uint)sizeof(fpxx), 17u)); // only allocate one set of transfer buffers in plus/minus directions, for all x/y/z transfers
1263-
transfer_buffer_m = Memory<char>(device, Amax, max(transfers*(uint)sizeof(fpxx), 17u));
1262+
transfer_buffer_p = Memory<char>(device, Amax, max(transfers*(uint)sizeof(fpxx), 17u), true, true, 0, false); // only allocate one set of transfer buffers in plus/minus directions, for all x/y/z transfers
1263+
transfer_buffer_m = Memory<char>(device, Amax, max(transfers*(uint)sizeof(fpxx), 17u), true, true, 0, false); // these transfer buffers must not be zero-copy!
12641264

12651265
kernel_transfer[enum_transfer_field::fi ][0] = Kernel(device, 0u, "transfer_extract_fi" , 0u, t, transfer_buffer_p, transfer_buffer_m, fi);
12661266
kernel_transfer[enum_transfer_field::fi ][1] = Kernel(device, 0u, "transfer__insert_fi" , 0u, t, transfer_buffer_p, transfer_buffer_m, fi);

src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ void main_label(const double frametime) {
7777
draw_label(ox, oy+i, "Steps " +alignr(31u, /************************************/ alignr(10u, info.lbm->get_t())+" ("+alignr(5, to_uint(1.0/info.runtime_lbm_timestep_smooth))+" Steps/s)"), c); i+=FONT_HEIGHT;
7878
draw_label(ox, oy+i, "FPS " +alignr(33u, /************************************************************/ alignr(4u, to_uint(1.0/frametime))+" ("+alignr(5u, camera.fps_limit)+" fps max)"), c);
7979
}
80-
draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v2.19 Copyright (c) Dr. Moritz Lehmann", c);
80+
draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.0 Copyright (c) Dr. Moritz Lehmann", c);
8181
if(!key_H) {
8282
draw_label(camera.width-16*(FONT_WIDTH)-1, 2, "Press H for Help", c);
8383
} else {

0 commit comments

Comments
 (0)