Skip to content

Commit b935846

Browse files
committed
.vtk export now converts and writes data in chunks, to reduce memory footprint and time for large memory allocation
1 parent f1a36ab commit b935846

File tree

8 files changed

+45
-19
lines changed

8 files changed

+45
-19
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,17 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
219219
- improved coloring in `VIS_FIELD`/`ray_grid_traverse_sum()`
220220
- updated OpenCL-Wrapper now compiles OpenCL C code with `-cl-std=CL3.0` if available
221221
- fixed compiling on macOS with new OpenCL headers
222+
- [v3.3](https://github.com/ProjectPhysX/FluidX3D/releases/tag/v3.3) (17.05.2025) [changes](https://github.com/ProjectPhysX/FluidX3D/compare/v3.2...v3.3) (faster .vtk export)
223+
- `.vtk` export now converts and writes data in chunks, to reduce the memory footprint and the time spent on large memory allocations
224+
- `.vtk` files now contain the original file name as metadata in the title
225+
- `INTERACTIVE_GRAPHICS_ASCII` now renders in 2x vertical resolution but with fewer colors
226+
- updated OpenCL-Wrapper: more robust dp4a detection, fixed core count reporting for RDNA4 GPUs
227+
- fixed `update_moving_boundaries()` kernel not being called with flags other than `TYPE_S`
228+
- fixed corrupted first frame until resizing with `INTERACTIVE_GRAPHICS_ASCII`
229+
- fixed `resolution()` function for D2Q9
230+
- fixed missing `<chrono>` header on some compilers
231+
- fixed bug in `split_regex()`
232+
- fixed compiler warning with `min_int`
222233

223234
</details>
224235

@@ -748,6 +759,8 @@ section RTX 3050M Ti
748759
2341 : 0, 2341
749760
section RTX 3050M
750761
2339 : 0, 2339
762+
section RTX 3050 6GB
763+
1898 : 0, 1898
751764
section Titan RTX
752765
7554 : 0, 7554
753766
section RTX 6000
@@ -816,6 +829,8 @@ section M60 (1 GPU)
816829
1571 : 0, 1571
817830
section GTX 960M
818831
872 : 0, 872
832+
section GTX 780 Ti
833+
2776 : 0, 2776
819834
section GTX 770
820835
1215 : 0, 1215
821836
section GTX 680 4GB
@@ -1110,6 +1125,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
11101125
| 🟢&nbsp;A2 | 4.53 | 15 | 200 | 1031 (79%) | 2051 (79%) | 1199 (46%) |
11111126
| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M&nbsp;Ti | 7.60 | 4 | 192 | 1181 (94%) | 2341 (94%) | 2253 (90%) |
11121127
| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050M | 7.13 | 4 | 192 | 1180 (94%) | 2339 (94%) | 2016 (81%) |
1128+
| 🟢&nbsp;GeForce&nbsp;RTX&nbsp;3050&nbsp;6GB | 6.77 | 6 | 168 | 993 (90%) | 1898 (87%) | 1879 (86%) |
11131129
| 🟢&nbsp;Titan&nbsp;RTX | 16.31 | 24 | 672 | 3471 (79%) | 7456 (85%) | 7554 (87%) |
11141130
| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;6000 | 16.31 | 24 | 672 | 3307 (75%) | 6836 (78%) | 6879 (79%) |
11151131
| 🟢&nbsp;Quadro&nbsp;RTX&nbsp;8000&nbsp;Passive | 14.93 | 48 | 624 | 2591 (64%) | 5408 (67%) | 5607 (69%) |
@@ -1144,6 +1160,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
11441160
| 🟢&nbsp;Quadro&nbsp;M4000 | 2.57 | 8 | 192 | 899 (72%) | 1519 (61%) | 1050 (42%) |
11451161
| 🟢&nbsp;Tesla&nbsp;M60&nbsp;(1&nbsp;GPU) | 4.82 | 8 | 160 | 853 (82%) | 1571 (76%) | 1557 (75%) |
11461162
| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;960M | 1.51 | 4 | 80 | 442 (84%) | 872 (84%) | 627 (60%) |
1163+
| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;780&nbsp;Ti | 5.35 | 3 | 336 | 1710 (78%) | 2776 (64%) | 1302 (30%) |
11471164
| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;770 | 3.33 | 2 | 224 | 800 (55%) | 1215 (42%) | 876 (30%) |
11481165
| 🟢&nbsp;GeForce&nbsp;GTX&nbsp;680&nbsp;4GB | 3.33 | 4 | 192 | 783 (62%) | 1274 (51%) | 814 (33%) |
11491166
| 🟢&nbsp;Quadro&nbsp;K2000 | 0.73 | 2 | 64 | 312 (75%) | 444 (53%) | 171 (21%) |

src/info.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ void Info::print_logo() const {
4242
print("| "); print("\\ \\ / /", c); print(" |\n");
4343
print("| "); print("\\ ' /", c); print(" |\n");
4444
print("| "); print("\\ /", c); print(" |\n");
45-
print("| "); print("\\ /", c); print(" FluidX3D Version 3.2 |\n");
45+
print("| "); print("\\ /", c); print(" FluidX3D Version 3.3 |\n");
4646
print("| "); print( "'", c); print(" Copyright (c) Dr. Moritz Lehmann |\n");
4747
print("|-----------------------------------------------------------------------------|\n");
4848
}

src/lbm.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,8 @@ void LBM::unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // rem
10531053
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
10541054
}
10551055
void LBM::write_mesh_to_vtk(const Mesh* mesh, const string& path, const bool convert_to_si_units) const { // write mesh to binary .vtk file
1056-
const string header_1 = "# vtk DataFile Version 3.0\nData\nBINARY\nDATASET POLYDATA\nPOINTS "+to_string(3u*mesh->triangle_number)+" float\n";
1056+
const string filename = default_filename(path, "mesh", ".vtk", get_t());
1057+
const string header_1 = "# vtk DataFile Version 3.0\nFluidX3D "+filename.substr(filename.rfind('/')+1)+"\nBINARY\nDATASET POLYDATA\nPOINTS "+to_string(3u*mesh->triangle_number)+" float\n";
10571058
const string header_2 = "POLYGONS "+to_string(mesh->triangle_number)+" "+to_string(4u*mesh->triangle_number)+"\n";
10581059
float* points = new float[9u*mesh->triangle_number];
10591060
int* triangles = new int[4u*mesh->triangle_number];
@@ -1074,7 +1075,6 @@ void LBM::write_mesh_to_vtk(const Mesh* mesh, const string& path, const bool con
10741075
triangles[4u*i+2u] = reverse_bytes(3*(int)i+1); // vertex 1
10751076
triangles[4u*i+3u] = reverse_bytes(3*(int)i+2); // vertex 2
10761077
});
1077-
const string filename = default_filename(path, "mesh", ".vtk", get_t());
10781078
create_folder(filename);
10791079
std::ofstream file(filename, std::ios::out|std::ios::binary);
10801080
file.write(header_1.c_str(), header_1.length()); // write non-binary file header

src/lbm.hpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -299,25 +299,33 @@ class LBM {
299299
if(name=="F" ) unit_conversion_factor = (T)units.si_F (1.0f);
300300
if(name=="T" ) unit_conversion_factor = (T)units.si_T (1.0f);
301301
}
302+
const string filename = create_file_extension(path, ".vtk");
302303
const float3 origin = spacing*float3(0.5f-0.5f*(float)Nx, 0.5f-0.5f*(float)Ny, 0.5f-0.5f*(float)Nz);
303304
const string header =
304-
"# vtk DataFile Version 3.0\nData\nBINARY\nDATASET STRUCTURED_POINTS\n"
305+
"# vtk DataFile Version 3.0\nFluidX3D "+filename.substr(filename.rfind('/')+1)+"\nBINARY\nDATASET STRUCTURED_POINTS\n"
305306
"DIMENSIONS "+to_string(Nx)+" "+to_string(Ny)+" "+to_string(Nz)+"\n"
306307
"ORIGIN "+to_string(origin.x)+" "+to_string(origin.y)+" "+to_string(origin.z)+"\n"
307308
"SPACING "+to_string(spacing)+" "+to_string(spacing)+" "+to_string(spacing)+"\n"
308-
"POINT_DATA "+to_string((ulong)Nx*(ulong)Ny*(ulong)Nz)+"\nSCALARS data "+vtk_type()+" "+to_string(dimensions())+"\nLOOKUP_TABLE default\n"
309+
"POINT_DATA "+to_string((ulong)Nx*(ulong)Ny*(ulong)Nz)+"\n"
310+
"SCALARS data "+vtk_type()+" "+to_string(dimensions())+"\nLOOKUP_TABLE default\n"
309311
;
310-
T* data = new T[range()];
311-
parallel_for(length(), [&](ulong i) {
312-
for(uint d=0u; d<dimensions(); d++) {
313-
data[i*(ulong)dimensions()+(ulong)d] = reverse_bytes((T)(unit_conversion_factor*reference(i, d))); // SoA <- AoS
314-
}
315-
});
316-
const string filename = create_file_extension(path, ".vtk");
312+
const uint chunk_size_MB = 4u*thread::hardware_concurrency(); // in MB; convert and write data in chunks, to reduce memory footprint and time for large memory allocation
313+
const ulong chunk_elements = (1048576ull*(ulong)chunk_size_MB)/((ulong)dimensions()*sizeof(T));
314+
const ulong chunks=length()/chunk_elements, chunk_remainder=length()%chunk_elements;
315+
T* data = new T[chunk_elements*(ulong)dimensions()];
317316
create_folder(filename);
318317
std::ofstream file(filename, std::ios::out|std::ios::binary);
319318
file.write(header.c_str(), header.length()); // write non-binary file header
320-
file.write((char*)data, capacity()); // write binary data
319+
for(ulong c=0u; c<chunks+1ull; c++) { // iterate over all full chunks + last chunk_remainder chunk
320+
const ulong N = c<chunks ? chunk_elements : chunk_remainder;
321+
if(N==0ull) break; // chunk_remainder may be 0, then skip last iteration
322+
parallel_for(N, [&](ulong i) {
323+
for(uint d=0u; d<dimensions(); d++) { // LBM to SI units, LittleEndian to BigEndian, AoS to SoA
324+
data[i*(ulong)dimensions()+(ulong)d] = reverse_bytes((T)(unit_conversion_factor*reference(c*chunk_elements+i, d)));
325+
}
326+
});
327+
file.write((char*)data, N*(ulong)dimensions()*sizeof(T)); // write binary data
328+
}
321329
file.close();
322330
delete[] data;
323331
info.allow_printing.lock();

src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ void main_label(const double frametime) {
7777
draw_label(ox, oy+i, "Steps " +alignr(31u, /************************************/ alignr(10u, info.lbm->get_t())+" ("+alignr(5, to_uint(1.0/info.runtime_lbm_timestep_smooth))+" Steps/s)"), c); i+=FONT_HEIGHT;
7878
draw_label(ox, oy+i, "FPS " +alignr(33u, /************************************************************/ alignr(4u, to_uint(1.0/frametime))+" ("+alignr(5u, camera.fps_limit)+" fps max)"), c);
7979
}
80-
draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.2 Copyright (c) Dr. Moritz Lehmann", c);
80+
draw_label(2, camera.height-1*(FONT_HEIGHT)-1, "FluidX3D v3.3 Copyright (c) Dr. Moritz Lehmann", c);
8181
if(!key_H) {
8282
draw_label(camera.width-16*(FONT_WIDTH)-1, 2, "Press H for Help", c);
8383
} else {

src/opencl.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,13 @@ struct Device_Info {
154154
cores_per_cu = is_gpu ? (intel_16_cores_per_cu ? 16.0f : 8.0f) : 0.5f; // Intel GPUs have 16 cores/CU (PVC) or 8 cores/CU (integrated/Arc), Intel CPUs (with HT) have 1/2 core/CU
155155
if(is_gpu&&!uses_ram) { // fix wrong global memory capacity reporting for Intel dGPUs
156156
#if defined(_WIN32)
157-
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L964
157+
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*50ull/49ull)/1048576ull); // 98% on Windows https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/windows/wddm_memory_manager.cpp#L958
158158
#elif defined(__linux__)
159-
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // 95% on Linux https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1424
159+
memory = (uint)((cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()*20ull/19ull)/1048576ull); // 95% on Linux https://github.com/intel/compute-runtime/blob/master/shared/source/os_interface/linux/drm_memory_manager.cpp#L1521
160160
#endif // Linux
161161
}
162162
patch_intel_gpu_above_4gb = patch_intel_gpu_above_4gb||(is_gpu&&memory>4096u); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
163+
if(is_cpu) is_dp4a_capable = 0u; // native dp4a in Intel CPU Runtime for OpenCL is slower than emulated dp4a
163164
} else if(vendor_id==0x10DE||vendor_id==0x13B5) { // Nvidia GPU/CPU
164165
nvidia_compute_capability = 10u*(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>()+(uint)cl_device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>();
165166
const bool nvidia__32_cores_per_cu = (nvidia_compute_capability <30); // identify Fermi GPUs

src/resource.rc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ BEGIN
2424
VALUE "LegalCopyright", "(c) Dr. Moritz Lehmann"
2525
VALUE "OriginalFilename", "FluidX3D.exe"
2626
VALUE "ProductName", "FluidX3D"
27-
VALUE "ProductVersion", "v3.2"
27+
VALUE "ProductVersion", "v3.3"
2828
END
2929
END
3030
BLOCK "VarFileInfo"

src/utilities.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ typedef uint64_t ulong;
4040
#define min_short ((short)-32768)
4141
#define max_short ((short)32767)
4242
#define max_ushort ((ushort)65535)
43-
#define min_int -2147483648
43+
#define min_int ((int)-2147483648)
4444
#define max_int 2147483647
4545
#define max_uint 4294967295u
46-
#define min_slong -9223372036854775808ll
46+
#define min_slong ((slong)-9223372036854775808ll)
4747
#define max_slong 9223372036854775807ll
4848
#define max_ulong 18446744073709551615ull
4949
#define min_float 1.401298464E-45f

0 commit comments

Comments
 (0)