diff --git a/pysplashsurf/pysplashsurf/docs/requirements.txt b/pysplashsurf/pysplashsurf/docs/requirements.txt index 9126624..d6dacc2 100644 --- a/pysplashsurf/pysplashsurf/docs/requirements.txt +++ b/pysplashsurf/pysplashsurf/docs/requirements.txt @@ -1,4 +1,4 @@ -pip==25.0.1 +pip==25.2 sphinx==8.2.3 numpy==2.2.3 meshio==5.3.5 diff --git a/pysplashsurf/pysplashsurf/pysplashsurf.pyi b/pysplashsurf/pysplashsurf/pysplashsurf.pyi index ed3344f..6acb672 100644 --- a/pysplashsurf/pysplashsurf/pysplashsurf.pyi +++ b/pysplashsurf/pysplashsurf/pysplashsurf.pyi @@ -185,11 +185,11 @@ class NeighborhoodLists: class SphInterpolator: r""" - Interpolator of per-particle quantities to arbitrary points using SPH interpolation (with cubic kernel) + Interpolator of per-particle quantities to arbitrary points using SPH interpolation """ - def __new__(cls, particle_positions:numpy.typing.NDArray[typing.Any], particle_densities:numpy.typing.NDArray[typing.Any], particle_rest_mass:builtins.float, compact_support_radius:builtins.float) -> SphInterpolator: + def __new__(cls, particle_positions:numpy.typing.NDArray[typing.Any], particle_densities:numpy.typing.NDArray[typing.Any], particle_rest_mass:builtins.float, compact_support_radius:builtins.float, kernel_type:KernelType) -> SphInterpolator: r""" - Constructs an SPH interpolator (with cubic kernels) for the given particles + Constructs an SPH interpolator for the given particles Parameters ---------- @@ -200,7 +200,9 @@ class SphInterpolator: particle_rest_mass The rest mass of each particle (assumed to be the same for all particles). compact_support_radius - The compact support radius of the cubic spline kernel used for interpolation. + The compact support radius of the kernel used for interpolation. + kernel_type + The kernel function used for interpolation. """ def interpolate_quantity(self, particle_quantity:numpy.typing.NDArray[typing.Any], interpolation_points:numpy.typing.NDArray[typing.Any], *, first_order_correction:builtins.bool=False) -> numpy.typing.NDArray[typing.Any]: r""" @@ -315,6 +317,15 @@ class VertexVertexConnectivity: Returns the wrapped connectivity data by moving it out of this object (zero copy) """ +class KernelType(Enum): + r""" + Enum for specifying the kernel function used for the reconstruction + """ + CubicSpline = ... + Poly6 = ... + Spiky = ... + WendlandQuinticC2 = ... + class MeshType(Enum): r""" Enum specifying the type of mesh wrapped by a ``MeshWithData`` @@ -497,7 +508,7 @@ def neighborhood_search_spatial_hashing_parallel(particle_positions:numpy.typing The radius per particle where other particles are considered neighbors.
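Taken together, the stub changes above imply that constructing an interpolator now looks like the following; a minimal sketch based on the new signature and on interpolator_test in test_basic.py further down (the scalar quantity and the support radius value are illustrative assumptions, not taken from this diff):

import numpy as np
import pysplashsurf

particles = np.random.rand(1000, 3).astype(np.float32)
# Per-particle densities typically come from a prior reconstruction, as in interpolator_test
reconstruction = pysplashsurf.reconstruct_surface(
    particles, particle_radius=0.025, smoothing_length=2.0, cube_size=0.5
)
interpolator = pysplashsurf.SphInterpolator(
    particles,
    reconstruction.particle_densities,
    1000.0 * 0.025**3,  # rest mass per particle, as in the test
    4.0 * 0.025,        # compact support radius (illustrative value)
    pysplashsurf.KernelType.CubicSpline,  # the new, now required kernel argument
)
temperatures = np.ones(len(particles), dtype=np.float32)  # hypothetical per-particle quantity
values = interpolator.interpolate_quantity(temperatures, particles)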
""" -def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, global_neighborhood_list:builtins.bool=False, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64) -> SurfaceReconstruction: +def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, simd:builtins.bool=True, kernel_type:KernelType=..., global_neighborhood_list:builtins.bool=False, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64) -> SurfaceReconstruction: r""" Performs a surface reconstruction from the given particles without additional post-processing @@ -523,6 +534,8 @@ def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_ Upper corner of the AABB of particles to consider in the reconstruction. multi_threading Flag to enable multi-threading for the reconstruction and post-processing steps. + simd + Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. subdomain_grid Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. subdomain_grid_auto_disable @@ -531,7 +544,7 @@ def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_ Number of marching cubes voxels along each coordinate axis in each subdomain if the subdomain grid is enabled. 
""" -def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attributes_to_interpolate:typing.Optional[dict]=None, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64, check_mesh_closed:builtins.bool=False, check_mesh_manifold:builtins.bool=False, check_mesh_orientation:builtins.bool=False, check_mesh_debug:builtins.bool=False, mesh_cleanup:builtins.bool=False, mesh_cleanup_snap_dist:typing.Optional[builtins.float]=None, decimate_barnacles:builtins.bool=False, keep_vertices:builtins.bool=False, compute_normals:builtins.bool=False, sph_normals:builtins.bool=False, normals_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_weights:builtins.bool=True, mesh_smoothing_weights_normalization:builtins.float=13.0, generate_quads:builtins.bool=False, quad_max_edge_diag_ratio:builtins.float=1.75, quad_max_normal_angle:builtins.float=10.0, quad_max_interior_angle:builtins.float=135.0, output_mesh_smoothing_weights:builtins.bool=False, output_raw_normals:builtins.bool=False, output_raw_mesh:builtins.bool=False, mesh_aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_clamp_vertices:builtins.bool=True) -> tuple[MeshWithData, SurfaceReconstruction]: +def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attributes_to_interpolate:typing.Optional[dict]=None, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, simd:builtins.bool=True, kernel_type:KernelType=..., subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64, check_mesh_closed:builtins.bool=False, check_mesh_manifold:builtins.bool=False, check_mesh_orientation:builtins.bool=False, check_mesh_debug:builtins.bool=False, mesh_cleanup:builtins.bool=False, mesh_cleanup_snap_dist:typing.Optional[builtins.float]=None, decimate_barnacles:builtins.bool=False, keep_vertices:builtins.bool=False, compute_normals:builtins.bool=False, sph_normals:builtins.bool=False, normals_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_weights:builtins.bool=True, mesh_smoothing_weights_normalization:builtins.float=13.0, generate_quads:builtins.bool=False, quad_max_edge_diag_ratio:builtins.float=1.75, quad_max_normal_angle:builtins.float=10.0, quad_max_interior_angle:builtins.float=135.0, output_mesh_smoothing_weights:builtins.bool=False, output_raw_normals:builtins.bool=False, output_raw_mesh:builtins.bool=False, mesh_aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_clamp_vertices:builtins.bool=True) -> tuple[MeshWithData, SurfaceReconstruction]: r""" Runs the 
surface reconstruction pipeline for the given particle positions with optional post-processing @@ -561,6 +574,10 @@ def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attri Upper corner [x,y,z] of the AABB of particles to consider in the reconstruction. multi_threading Flag to enable multi-threading for the reconstruction and post-processing steps. + simd + Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. + kernel_type + Kernel function to use for the reconstruction. subdomain_grid Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. subdomain_grid_auto_disable diff --git a/pysplashsurf/src/lib.rs b/pysplashsurf/src/lib.rs index 02183d1..6924fa0 100644 --- a/pysplashsurf/src/lib.rs +++ b/pysplashsurf/src/lib.rs @@ -39,6 +39,7 @@ fn pysplashsurf(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::<KernelType>()?; use wrap_pyfunction as wrap; diff --git a/pysplashsurf/src/pipeline.rs b/pysplashsurf/src/pipeline.rs index 549f089..082f102 100644 --- a/pysplashsurf/src/pipeline.rs +++ b/pysplashsurf/src/pipeline.rs @@ -18,7 +18,7 @@ use std::borrow::Cow; use crate::mesh::PyMeshWithData; use crate::reconstruction::PySurfaceReconstruction; -use crate::utils::{IndexT, pyerr_unsupported_scalar}; +use crate::utils::{IndexT, KernelType, pyerr_unsupported_scalar}; /// Runs the surface reconstruction pipeline for the given particle positions with optional post-processing /// @@ -49,7 +49,9 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar}; /// multi_threading /// Flag to enable multi-threading for the reconstruction and post-processing steps. /// simd -/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// kernel_type +/// Kernel function to use for the reconstruction. /// subdomain_grid /// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading.
/// subdomain_grid_auto_disable @@ -110,7 +112,7 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar}; #[pyo3(name = "reconstruction_pipeline")] #[pyo3(signature = (particles, *, attributes_to_interpolate = None, particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6, - aabb_min = None, aabb_max = None, multi_threading = true, simd = true, + aabb_min = None, aabb_max = None, multi_threading = true, simd = true, kernel_type = KernelType::CubicSpline, subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64, check_mesh_closed = false, check_mesh_manifold = false, check_mesh_orientation = false, check_mesh_debug = false, mesh_cleanup = false, mesh_cleanup_snap_dist = None, decimate_barnacles = false, keep_vertices = false, compute_normals = false, sph_normals = false, @@ -131,6 +133,7 @@ pub fn reconstruction_pipeline<'py>( aabb_max: Option<[f64; 3]>, multi_threading: bool, simd: bool, + kernel_type: KernelType, subdomain_grid: bool, subdomain_grid_auto_disable: bool, subdomain_num_cubes_per_dim: u32, @@ -198,6 +201,7 @@ pub fn reconstruction_pipeline<'py>( enable_simd: simd, spatial_decomposition, global_neighborhood_list: false, + kernel_type: kernel_type.into_lib_enum(), }; let postprocessing_args = splashsurf::reconstruct::ReconstructionPostprocessingParameters { diff --git a/pysplashsurf/src/reconstruction.rs b/pysplashsurf/src/reconstruction.rs index 1a6ec9f..fd88c07 100644 --- a/pysplashsurf/src/reconstruction.rs +++ b/pysplashsurf/src/reconstruction.rs @@ -1,7 +1,7 @@ use crate::mesh::PyTriMesh3d; use crate::neighborhood_search::PyNeighborhoodLists; use crate::uniform_grid::PyUniformGrid; -use crate::utils; +use crate::utils::{self, KernelType}; use anyhow::anyhow; use ndarray::ArrayView1; use numpy as np; @@ -124,6 +124,8 @@ impl PySurfaceReconstruction { /// Flag to enable multi-threading for the reconstruction and post-processing steps. /// simd /// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// kernel_type +/// Kernel function to use for the reconstruction. /// subdomain_grid /// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. 
/// subdomain_grid_auto_disable @@ -135,8 +137,8 @@ impl PySurfaceReconstruction { #[pyo3(name = "reconstruct_surface")] #[pyo3(signature = (particles, *, particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6, - aabb_min = None, aabb_max = None, - multi_threading = true, simd = true, global_neighborhood_list = false, + aabb_min = None, aabb_max = None, multi_threading = true, simd = true, + kernel_type = KernelType::CubicSpline, global_neighborhood_list = false, subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64 ))] pub fn reconstruct_surface<'py>( @@ -150,6 +152,7 @@ pub fn reconstruct_surface<'py>( aabb_max: Option<[f64; 3]>, multi_threading: bool, simd: bool, + kernel_type: KernelType, global_neighborhood_list: bool, subdomain_grid: bool, subdomain_grid_auto_disable: bool, @@ -181,6 +184,7 @@ pub fn reconstruct_surface<'py>( enable_simd: simd, spatial_decomposition, global_neighborhood_list, + kernel_type: kernel_type.into_lib_enum(), }; let element_type = particles.dtype(); diff --git a/pysplashsurf/src/sph_interpolation.rs b/pysplashsurf/src/sph_interpolation.rs index b386a00..6bd3ced 100644 --- a/pysplashsurf/src/sph_interpolation.rs +++ b/pysplashsurf/src/sph_interpolation.rs @@ -1,3 +1,4 @@ +use crate::utils::{KernelType, *}; use numpy as np; use numpy::prelude::*; use numpy::{Element, PyArray1, PyArray2, PyUntypedArray}; @@ -12,14 +13,12 @@ use splashsurf_lib::{ sph_interpolation::SphInterpolator, }; -use crate::utils::*; - enum PySphInterpolatorWrapper { F32(SphInterpolator<f32>), F64(SphInterpolator<f64>), } -/// Interpolator of per-particle quantities to arbitrary points using SPH interpolation (with cubic kernel) +/// Interpolator of per-particle quantities to arbitrary points using SPH interpolation #[gen_stub_pyclass] #[pyclass] #[pyo3(name = "SphInterpolator")] pub struct PySphInterpolator { @@ -36,6 +35,7 @@ impl PySphInterpolator { particle_densities: &Bound<'py, PyUntypedArray>, particle_rest_mass: f64, compact_support_radius: f64, + kernel_type: KernelType, ) -> PyResult<Self> where PySphInterpolator: From<SphInterpolator<R>>, @@ -55,6 +55,7 @@ impl PySphInterpolator { densities, R::from_float(particle_rest_mass), R::from_float(compact_support_radius), + kernel_type.into_lib_enum(), ))) } else { Err(pyerr_scalar_type_mismatch()) } } @@ -159,7 +160,7 @@ impl PySphInterpolator { #[gen_stub_pymethods] #[pymethods] impl PySphInterpolator { - /// Constructs an SPH interpolator (with cubic kernels) for the given particles + /// Constructs an SPH interpolator for the given particles /// /// Parameters /// ---------- /// @@ -170,13 +171,16 @@ impl PySphInterpolator { /// particle_rest_mass /// The rest mass of each particle (assumed to be the same for all particles). /// compact_support_radius - /// The compact support radius of the cubic spline kernel used for interpolation. + /// The compact support radius of the kernel used for interpolation.
+ /// kernel_type + /// The kernel function used for interpolation. #[new] fn py_new<'py>( particle_positions: &Bound<'py, PyUntypedArray>, particle_densities: &Bound<'py, PyUntypedArray>, particle_rest_mass: f64, compact_support_radius: f64, + kernel_type: KernelType, ) -> PyResult<Self> { let py = particle_positions.py(); let element_type = particle_positions.dtype(); @@ -187,6 +191,7 @@ impl PySphInterpolator { particle_densities, particle_rest_mass, compact_support_radius, + kernel_type, ) } else if element_type.is_equiv_to(&np::dtype::<f64>(py)) { Self::new_generic::<f64>( @@ -194,6 +199,7 @@ impl PySphInterpolator { particle_densities, particle_rest_mass, compact_support_radius, + kernel_type, ) } else { Err(pyerr_unsupported_scalar()) } diff --git a/pysplashsurf/src/utils.rs b/pysplashsurf/src/utils.rs index 095c359..ff7f66c 100644 --- a/pysplashsurf/src/utils.rs +++ b/pysplashsurf/src/utils.rs @@ -3,9 +3,32 @@ use numpy::{Element, PyArray, PyUntypedArray}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; use pyo3::{Bound, PyAny, PyErr, PyResult}; +use pyo3_stub_gen::derive::gen_stub_pyclass_enum; use splashsurf_lib::Real; use splashsurf_lib::nalgebra::SVector; +/// Enum for specifying the kernel function used for the reconstruction +#[gen_stub_pyclass_enum] +#[pyclass] +#[derive(Clone)] +pub enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + +impl KernelType { + pub fn into_lib_enum(&self) -> splashsurf_lib::kernel::KernelType { + match self { + KernelType::CubicSpline => splashsurf_lib::kernel::KernelType::CubicSpline, + KernelType::Poly6 => splashsurf_lib::kernel::KernelType::Poly6, + KernelType::Spiky => splashsurf_lib::kernel::KernelType::Spiky, + KernelType::WendlandQuinticC2 => splashsurf_lib::kernel::KernelType::WendlandQuinticC2, + } + } +} + /// The index type used for all grids and reconstructions in this crate pub(crate) type IndexT = i64; diff --git a/pysplashsurf/tests/test_basic.py b/pysplashsurf/tests/test_basic.py index 083cc47..2d38871 100644 --- a/pysplashsurf/tests/test_basic.py +++ b/pysplashsurf/tests/test_basic.py @@ -301,7 +301,7 @@ def interpolator_test(dtype): rest_mass = 1000.0 * 0.025**3 interpolator = pysplashsurf.SphInterpolator( - particles, reconstruction.particle_densities, rest_mass, compact_support + particles, reconstruction.particle_densities, rest_mass, compact_support, pysplashsurf.KernelType.CubicSpline ) assert type(interpolator) is pysplashsurf.SphInterpolator diff --git a/pysplashsurf/tests/test_calling.py b/pysplashsurf/tests/test_calling.py index c64f1ad..2c89f53 100644 --- a/pysplashsurf/tests/test_calling.py +++ b/pysplashsurf/tests/test_calling.py @@ -59,6 +59,7 @@ def reconstruction_pipeline( smoothing_length=2.0, cube_size=0.5, iso_surface_threshold=0.6, + kernel_type=pysplashsurf.KernelType.CubicSpline, mesh_smoothing_weights=False, output_mesh_smoothing_weights=False, sph_normals=False, @@ -109,6 +110,7 @@ def reconstruction_pipeline( rest_density=rest_density, smoothing_length=smoothing_length, cube_size=cube_size, + kernel_type=kernel_type, iso_surface_threshold=iso_surface_threshold, mesh_smoothing_weights=mesh_smoothing_weights, sph_normals=sph_normals, @@ -276,3 +278,56 @@ def test_with_post_processing_f32(): def test_with_post_processing_f64(): with_post_processing_test(np.float64) + + +def poly6_kernel_test(dtype): + start = now_s() + subprocess.run( + [BINARY_PATH] + + f"reconstruct {VTK_PATH} -o {DIR.joinpath('test_bin.vtk')} -r=0.025 -l=2.0 -c=0.5 -t=0.6 {'-d=on' if dtype == np.float64 else ''}
--kernel=poly6 --subdomain-grid=on --mesh-cleanup=off --mesh-smoothing-weights=off --mesh-smoothing-iters=0 --normals=off --normals-smoothing-iters=0".split(), + check=True, + ) + print("Binary done in", now_s() - start) + + start = now_s() + reconstruction_pipeline( + VTK_PATH, + DIR.joinpath("test.vtk"), + particle_radius=0.025, + smoothing_length=2.0, + cube_size=0.5, + iso_surface_threshold=0.6, + kernel_type=pysplashsurf.KernelType.Poly6, + mesh_smoothing_weights=False, + mesh_smoothing_iters=0, + normals_smoothing_iters=0, + mesh_cleanup=False, + compute_normals=False, + subdomain_grid=True, + dtype=dtype + ) + print("Python done in", now_s() - start) + + binary_mesh = meshio.read(DIR.joinpath("test_bin.vtk")) + python_mesh = meshio.read(DIR.joinpath("test.vtk")) + + binary_verts = np.array(binary_mesh.points, dtype=dtype) + python_verts = np.array(python_mesh.points, dtype=dtype) + + print("# of vertices binary:", len(binary_verts)) + print("# of vertices python:", len(python_verts)) + + assert len(binary_verts) == len(python_verts) + + binary_verts.sort(axis=0) + python_verts.sort(axis=0) + + assert np.allclose(binary_verts, python_verts) + + +def test_poly6_kernel_f32(): + poly6_kernel_test(np.float32) + + +def test_poly6_kernel_f64(): + poly6_kernel_test(np.float64) diff --git a/splashsurf/src/cli.rs b/splashsurf/src/cli.rs index 060ebf3..275f6be 100644 --- a/splashsurf/src/cli.rs +++ b/splashsurf/src/cli.rs @@ -80,6 +80,26 @@ impl Switch { } } +/// An enum for specifying the wanted kernel type in the CLI +#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)] +pub(crate) enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + +impl KernelType { + pub(crate) fn into_lib_enum(self) -> splashsurf_lib::kernel::KernelType { + match self { + KernelType::CubicSpline => splashsurf_lib::kernel::KernelType::CubicSpline, + KernelType::Poly6 => splashsurf_lib::kernel::KernelType::Poly6, + KernelType::Spiky => splashsurf_lib::kernel::KernelType::Spiky, + KernelType::WendlandQuinticC2 => splashsurf_lib::kernel::KernelType::WendlandQuinticC2, + } + } +} + /// Runs the splashsurf CLI with the provided command line arguments. /// /// This function behaves like the binary `splashsurf` command line tool including output to stdout diff --git a/splashsurf/src/reconstruct.rs b/splashsurf/src/reconstruct.rs index 0cc8428..b5119ac 100644 --- a/splashsurf/src/reconstruct.rs +++ b/splashsurf/src/reconstruct.rs @@ -1,6 +1,6 @@ //! Implementation of the `reconstruct` subcommand of the splashsurf CLI. -use crate::cli::Switch; +use crate::cli::{KernelType, Switch}; use crate::reconstruct::arguments::*; use crate::{io, logging}; use anyhow::{Context, anyhow}; @@ -135,7 +135,16 @@ pub(crate) struct ReconstructSubcommandArgs { require_equals = true )] pub simd: Switch, - + /// Kernel that is used for the reconstruction and interpolation. 
+ #[arg( + help_heading = ARGS_ADV, + long, + default_value = "cubic-spline", + value_name = "cubic-spline|poly6|spiky|wendland-quintic-c2", + ignore_case = true, + require_equals = true + )] + pub kernel: KernelType, /// Enable automatic spatial decomposition using a regular grid-based approach (for efficient multithreading) if the domain is large enough #[arg( help_heading = ARGS_DECOMPOSITION, @@ -651,6 +660,7 @@ pub(crate) mod arguments { enable_simd: args.simd.into_bool(), spatial_decomposition, global_neighborhood_list: args.mesh_smoothing_weights.into_bool(), + kernel_type: args.kernel.into_lib_enum(), }; // Optionally initialize thread pool @@ -1143,6 +1153,7 @@ pub fn reconstruction_pipeline<'a, I: Index, R: Real>( particle_densities, particle_rest_mass, params.compact_support_radius, + params.kernel_type, )) } else { None diff --git a/splashsurf_lib/benches/benches/bench_full.rs b/splashsurf_lib/benches/benches/bench_full.rs index be2476a..23af42f 100644 --- a/splashsurf_lib/benches/benches/bench_full.rs +++ b/splashsurf_lib/benches/benches/bench_full.rs @@ -105,6 +105,7 @@ pub fn surface_reconstruction_dam_break(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); @@ -165,6 +166,7 @@ pub fn surface_reconstruction_double_dam_break(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); @@ -225,6 +227,7 @@ pub fn surface_reconstruction_double_dam_break_inplace(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); diff --git a/splashsurf_lib/benches/benches/bench_grid_loop.rs b/splashsurf_lib/benches/benches/bench_grid_loop.rs index 4fd5ff2..ee6898f 100644 --- a/splashsurf_lib/benches/benches/bench_grid_loop.rs +++ b/splashsurf_lib/benches/benches/bench_grid_loop.rs @@ -2,7 +2,7 @@ use criterion::{Criterion, criterion_group}; use nalgebra::{Scalar, Vector3}; use serde_derive::{Deserialize, Serialize}; use splashsurf_lib::dense_subdomains; -use splashsurf_lib::kernel::CubicSplineKernel; +use splashsurf_lib::kernel::{CubicSplineKernel, SymmetricKernel3d}; use splashsurf_lib::uniform_grid::UniformCartesianCubeGrid3d; use std::time::Duration; @@ -231,7 +231,6 @@ pub fn grid_loop_avx2(c: &mut Criterion) { #[cfg(all(target_feature = "avx2", target_feature = "fma"))] { let avx_result = unsafe { - let kernel = CubicSplineKernel::new(params.compact_support_radius); let mut params = params.clone(); dense_subdomains::density_grid_loop_avx( params.levelset_grid.as_mut_slice(), @@ -243,7 +242,8 @@ pub fn grid_loop_avx2(c: &mut Criterion) { params.cube_radius, params.squared_support_with_margin, params.particle_rest_mass, - &kernel, + params.compact_support_radius, + &splashsurf_lib::kernel::KernelType::CubicSpline, ); params.levelset_grid }; @@ -268,7 +268,6 @@ pub fn grid_loop_avx2(c: &mut Criterion) { unsafe { group.bench_function("grid_loop_avx2", |b| { b.iter(|| { - let kernel = CubicSplineKernel::new(params.compact_support_radius); let mut params = params.clone(); 
dense_subdomains::density_grid_loop_avx( params.levelset_grid.as_mut_slice(), @@ -280,7 +279,8 @@ pub fn grid_loop_avx2(c: &mut Criterion) { params.cube_radius, params.squared_support_with_margin, params.particle_rest_mass, - &kernel, + params.compact_support_radius, + &splashsurf_lib::kernel::KernelType::CubicSpline, ); }) }); diff --git a/splashsurf_lib/benches/benches/bench_mesh.rs b/splashsurf_lib/benches/benches/bench_mesh.rs index 4d49d00..f8caa3d 100644 --- a/splashsurf_lib/benches/benches/bench_mesh.rs +++ b/splashsurf_lib/benches/benches/bench_mesh.rs @@ -29,6 +29,7 @@ fn reconstruct_particles>(particle_file: P) -> SurfaceReconstruct auto_disable: false, }), global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; reconstruct_surface::(particle_positions.as_slice(), ¶meters).unwrap() diff --git a/splashsurf_lib/benches/benches/bench_subdomain_grid.rs b/splashsurf_lib/benches/benches/bench_subdomain_grid.rs index 592ca81..b91316c 100644 --- a/splashsurf_lib/benches/benches/bench_subdomain_grid.rs +++ b/splashsurf_lib/benches/benches/bench_subdomain_grid.rs @@ -28,6 +28,7 @@ fn parameters_canyon() -> Parameters { auto_disable: false, }), global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, } } diff --git a/splashsurf_lib/src/dense_subdomains.rs b/splashsurf_lib/src/dense_subdomains.rs index ed19c6d..71e8803 100644 --- a/splashsurf_lib/src/dense_subdomains.rs +++ b/splashsurf_lib/src/dense_subdomains.rs @@ -16,7 +16,10 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use thread_local::ThreadLocal; use crate::density_map::sequential_compute_particle_densities_filtered; -use crate::kernel::{CubicSplineKernel, SymmetricKernel3d}; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, SymmetricKernel3d, + WendlandQuinticC2Kernel, +}; use crate::marching_cubes::marching_cubes_lut::marching_cubes_triangulation_iter; use crate::mesh::{HexMesh3d, TriMesh3d}; use crate::neighborhood_search::{ @@ -59,6 +62,8 @@ pub(crate) struct ParametersSubdomainGrid { pub chunk_size: usize, /// Whether to return the global particle neighborhood list instead of only using per-domain lists internally pub global_neighborhood_list: bool, + /// The SPH kernel that should be used + pub kernel_type: KernelType, } impl ParametersSubdomainGrid { @@ -240,6 +245,7 @@ pub(crate) fn initialize_parameters( subdomain_grid, chunk_size, global_neighborhood_list: parameters.global_neighborhood_list, + kernel_type: parameters.kernel_type.clone(), }) } @@ -493,7 +499,7 @@ pub(crate) fn decomposition< }) } -pub(crate) fn compute_global_densities_and_neighbors( +pub(crate) fn compute_global_densities_and_neighbors>( parameters: &ParametersSubdomainGrid, global_particles: &[Vector3], subdomains: &Subdomains, @@ -583,7 +589,7 @@ pub(crate) fn compute_global_densities_and_neighbors( |i| is_inside[i], ); - sequential_compute_particle_densities_filtered::( + sequential_compute_particle_densities_filtered::( subdomain_particles, neighborhood_lists, parameters.compact_support_radius, @@ -712,7 +718,7 @@ fn local_to_global_point_ijk( } /// Auto-dispatching density grid loop for f32/i64: chooses AVX2+FMA on x86, NEON on aarch64, otherwise scalar fallback -pub fn density_grid_loop_auto>( +pub fn density_grid_loop_auto( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -722,7 +728,8 @@ pub fn density_grid_loop_auto>( cube_radius: i64, squared_support_with_margin: f32, 
particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { @@ -739,7 +746,8 @@ pub fn density_grid_loop_auto>( cube_radius, squared_support_with_margin, particle_rest_mass, - kernel, + kernel_support, + kernel_type, ); } } @@ -760,25 +768,72 @@ pub fn density_grid_loop_auto>( cube_radius, squared_support_with_margin, particle_rest_mass, - kernel, + kernel_support, + kernel_type, ); } } } // Fallback: scalar generic implementation - density_grid_loop_scalar::( - levelset_grid, - subdomain_particles, - subdomain_particle_densities, - subdomain_mc_grid, - subdomain_ijk, - global_mc_grid, - cube_radius, - squared_support_with_margin, - particle_rest_mass, - kernel, - ); + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &CubicSplineKernel::new(kernel_support), + ); + } + KernelType::Poly6 => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &Poly6Kernel::new(kernel_support), + ); + } + KernelType::Spiky => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &SpikyKernel::new(kernel_support), + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &WendlandQuinticC2Kernel::new(kernel_support), + ); + } + } } pub fn density_grid_loop_scalar>( @@ -848,7 +903,7 @@ pub fn density_grid_loop_scalar>( #[cfg(all(target_arch = "aarch64"))] #[target_feature(enable = "neon")] -pub fn density_grid_loop_neon>( +pub fn density_grid_loop_neon( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -858,8 +913,10 @@ pub fn density_grid_loop_neon>( cube_radius: i64, _squared_support_with_margin: f32, particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { + use crate::kernel::NeonKernel; use core::arch::aarch64::*; const LANES: usize = 4; @@ -875,120 +932,193 @@ pub fn density_grid_loop_neon>( u32::MAX ); - let kernel_neon = kernel::CubicSplineKernelNeonF32::new(kernel.compact_support_radius()); - - profile!("density grid loop (neon)"); - let mc_grid = subdomain_mc_grid; - let mc_points = mc_grid.points_per_dim(); - let dim_y = mc_points[1] as usize; - let dim_z = mc_points[2] as usize; - - // Preload constants used inside the hot loops - const K_OFFSETS_ARR: [u32; 4] = [0, 1, 2, 3]; - let k_offsets = unsafe { vld1q_u32(K_OFFSETS_ARR.as_ptr()) }; - let global_min = global_mc_grid.aabb().min(); - let min_z_v = vdupq_n_f32(global_min[2]); - let cube_size = global_mc_grid.cell_size(); - let support = kernel.compact_support_radius(); - let support_sq_v = vdupq_n_f32(support * support); - - for (p_i, rho_i) in subdomain_particles - .iter() - .copied() - .zip(subdomain_particle_densities.iter().copied()) - { - let v_i = particle_rest_mass / rho_i; 
- - let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); - let lower = lower.map(|i| i as usize); - let upper = upper.map(|i| i as usize); - - let remainder = (upper[2] - lower[2]) % LANES; - let upper_k_aligned = upper[2] - remainder; - - let subdomain_ijk = subdomain_ijk.map(|i| i as u32); - let mc_cells = mc_grid.cells_per_dim().map(|i| i as u32); - - // Broadcast particle coordinates once per particle - let particle_xs = vdupq_n_f32(p_i[0]); - let particle_ys = vdupq_n_f32(p_i[1]); - let particle_zs = vdupq_n_f32(p_i[2]); - - // Function to evaluate kernel contribution of current particle to 8 grid points at once - let evaluate_contribution = - |k: usize, grid_xs: float32x4_t, grid_ys: float32x4_t| -> float32x4_t { - // Compute global k for 4 consecutive points - let global_k_base = vdupq_n_u32(subdomain_ijk[2] * mc_cells[2] + k as u32); - let global_k = vaddq_u32(global_k_base, k_offsets); - let global_k_f32 = vcvtq_f32_u32(global_k); - let grid_zs = vmlaq_n_f32(min_z_v, global_k_f32, cube_size); - // Deltas - let dxs = vsubq_f32(particle_xs, grid_xs); - let dys = vsubq_f32(particle_ys, grid_ys); - let dzs = vsubq_f32(particle_zs, grid_zs); - // Distance squared - let dist_sq = vmlaq_f32(vmulq_f32(dxs, dxs), dys, dys); - let dist_sq = vmlaq_f32(dist_sq, dzs, dzs); - // Cheap mask to skip work if all lanes are outside support - let inside_mask = vcltq_f32(dist_sq, support_sq_v); - let any_inside = vmaxvq_u32(inside_mask); - if any_inside == 0 { - return vdupq_n_f32(0.0); - } - // Compute weights, then mask out lanes outside support - let dist = vsqrtq_f32(dist_sq); - let w = kernel_neon.evaluate(dist); - vbslq_f32(inside_mask, w, vdupq_n_f32(0.0)) - }; - - // Loop over grid points in support region of the particle - for i in lower[0]..upper[0] { - for j in lower[1]..upper[1] { - let flat_index_base = (i * dim_y + j) * dim_z; - - let global_i = subdomain_ijk[0] * mc_cells[0] + i as u32; - let global_j = subdomain_ijk[1] * mc_cells[1] + j as u32; - - let grid_x = global_i as f32 * cube_size + global_min[0]; - let grid_y = global_j as f32 * cube_size + global_min[1]; - let grid_xs = vdupq_n_f32(grid_x); - let grid_ys = vdupq_n_f32(grid_y); - - for k in (lower[2]..upper_k_aligned).step_by(LANES) { - let flat_point_idx = flat_index_base + k; - let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + #[target_feature(enable = "neon")] + fn density_grid_loop( + levelset_grid: &mut [f32], + subdomain_particles: &[Vector3], + subdomain_particle_densities: &[f32], + subdomain_mc_grid: &UniformCartesianCubeGrid3d, + subdomain_ijk: &[i64; 3], + global_mc_grid: &UniformCartesianCubeGrid3d, + cube_radius: i64, + _squared_support_with_margin: f32, + particle_rest_mass: f32, + kernel_support: f32, + ) { + let kernel_neon = K::new(kernel_support); + + profile!("density grid loop (neon)"); + let mc_grid = subdomain_mc_grid; + let mc_points = mc_grid.points_per_dim(); + let dim_y = mc_points[1] as usize; + let dim_z = mc_points[2] as usize; + + // Preload constants used inside the hot loops + const K_OFFSETS_ARR: [u32; 4] = [0, 1, 2, 3]; + let k_offsets = unsafe { vld1q_u32(K_OFFSETS_ARR.as_ptr()) }; + let global_min = global_mc_grid.aabb().min(); + let min_z_v = vdupq_n_f32(global_min[2]); + let cube_size = global_mc_grid.cell_size(); + let support = kernel_support; + let support_sq_v = vdupq_n_f32(support * support); + + for (p_i, rho_i) in subdomain_particles + .iter() + .copied() + .zip(subdomain_particle_densities.iter().copied()) + { + let v_i = particle_rest_mass / rho_i; + + 
let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); + let lower = lower.map(|i| i as usize); + let upper = upper.map(|i| i as usize); + + let remainder = (upper[2] - lower[2]) % LANES; + let upper_k_aligned = upper[2] - remainder; + + let subdomain_ijk = subdomain_ijk.map(|i| i as u32); + let mc_cells = mc_grid.cells_per_dim().map(|i| i as u32); + + // Broadcast particle coordinates once per particle + let particle_xs = vdupq_n_f32(p_i[0]); + let particle_ys = vdupq_n_f32(p_i[1]); + let particle_zs = vdupq_n_f32(p_i[2]); + + // Function to evaluate kernel contribution of current particle to 8 grid points at once + let evaluate_contribution = + |k: usize, grid_xs: float32x4_t, grid_ys: float32x4_t| -> float32x4_t { + // Compute global k for 4 consecutive points + let global_k_base = vdupq_n_u32(subdomain_ijk[2] * mc_cells[2] + k as u32); + let global_k = vaddq_u32(global_k_base, k_offsets); + let global_k_f32 = vcvtq_f32_u32(global_k); + let grid_zs = vmlaq_n_f32(min_z_v, global_k_f32, cube_size); + // Deltas + let dxs = vsubq_f32(particle_xs, grid_xs); + let dys = vsubq_f32(particle_ys, grid_ys); + let dzs = vsubq_f32(particle_zs, grid_zs); + // Distance squared + let dist_sq = vmlaq_f32(vmulq_f32(dxs, dxs), dys, dys); + let dist_sq = vmlaq_f32(dist_sq, dzs, dzs); + // Cheap mask to skip work if all lanes are outside support + let inside_mask = vcltq_f32(dist_sq, support_sq_v); + let any_inside = vmaxvq_u32(inside_mask); + if any_inside == 0 { + return vdupq_n_f32(0.0); + } + // Compute weights, then mask out lanes outside support + let dist = vsqrtq_f32(dist_sq); + let w = unsafe { kernel_neon.evaluate(dist) }; + vbslq_f32(inside_mask, w, vdupq_n_f32(0.0)) + }; - let prev_val = unsafe { - vld1q_f32(levelset_grid.as_ptr().offset(flat_point_idx as isize)) - }; - let val = vmlaq_n_f32(prev_val, w_ij, v_i); - unsafe { - vst1q_f32( - levelset_grid.as_mut_ptr().offset(flat_point_idx as isize), - val, - ) - }; - } + // Loop over grid points in support region of the particle + for i in lower[0]..upper[0] { + for j in lower[1]..upper[1] { + let flat_index_base = (i * dim_y + j) * dim_z; + + let global_i = subdomain_ijk[0] * mc_cells[0] + i as u32; + let global_j = subdomain_ijk[1] * mc_cells[1] + j as u32; + + let grid_x = global_i as f32 * cube_size + global_min[0]; + let grid_y = global_j as f32 * cube_size + global_min[1]; + let grid_xs = vdupq_n_f32(grid_x); + let grid_ys = vdupq_n_f32(grid_y); + + for k in (lower[2]..upper_k_aligned).step_by(LANES) { + let flat_point_idx = flat_index_base + k; + let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + + let prev_val = unsafe { + vld1q_f32(levelset_grid.as_ptr().offset(flat_point_idx as isize)) + }; + let val = vmlaq_n_f32(prev_val, w_ij, v_i); + unsafe { + vst1q_f32( + levelset_grid.as_mut_ptr().offset(flat_point_idx as isize), + val, + ) + }; + } - // Handle remainder - if remainder > 0 { - let flat_point_idx = flat_index_base + upper_k_aligned; - let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); - let w_ij_v = vmulq_n_f32(w_ij, v_i); - let w_ij_v = - unsafe { std::mem::transmute::(w_ij_v) }; - for rr in 0..remainder { - levelset_grid[flat_point_idx + rr] += w_ij_v[rr]; + // Handle remainder + if remainder > 0 { + let flat_point_idx = flat_index_base + upper_k_aligned; + let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); + let w_ij_v = vmulq_n_f32(w_ij, v_i); + let w_ij_v = + unsafe { std::mem::transmute::(w_ij_v) }; + for rr in 0..remainder { + levelset_grid[flat_point_idx + rr] += 
w_ij_v[rr]; + } } } } } } + + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Poly6 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Spiky => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[target_feature(enable = "avx2,fma")] -pub fn density_grid_loop_avx>( +pub fn density_grid_loop_avx( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -998,13 +1128,16 @@ pub fn density_grid_loop_avx>( cube_radius: i64, _squared_support_with_margin: f32, particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; + use crate::kernel::AvxKernel; + const LANES: usize = 8; // Ensure that we can safely compute global MC vertex positions via i32 indices per dimension @@ -1019,117 +1152,191 @@ pub fn density_grid_loop_avx>( i32::MAX ); - let kernel_avx = kernel::CubicSplineKernelAvxF32::new(kernel.compact_support_radius()); + #[target_feature(enable = "avx2,fma")] + fn density_grid_loop( + levelset_grid: &mut [f32], + subdomain_particles: &[Vector3], + subdomain_particle_densities: &[f32], + subdomain_mc_grid: &UniformCartesianCubeGrid3d, + subdomain_ijk: &[i64; 3], + global_mc_grid: &UniformCartesianCubeGrid3d, + cube_radius: i64, + _squared_support_with_margin: f32, + particle_rest_mass: f32, + kernel_support: f32, + ) { + let kernel_avx = K::new(kernel_support); + + profile!("density grid loop (avx)"); + let mc_grid = subdomain_mc_grid; + let mc_points = mc_grid.points_per_dim(); + let dim_y = mc_points[1] as usize; + let dim_z = mc_points[2] as usize; + + // Preload constants used inside the hot loops + let k_offsets_i32 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let global_min = global_mc_grid.aabb().min(); + let min_z_v = _mm256_set1_ps(global_min[2]); + let cube_size = global_mc_grid.cell_size(); + let cube_size_ps = _mm256_set1_ps(cube_size); + let support = kernel_support; + let support_sq_v = _mm256_set1_ps(support * support); + + for (p_i, rho_i) in subdomain_particles + .iter() + .copied() + .zip(subdomain_particle_densities.iter().copied()) + { + let v_i = particle_rest_mass / rho_i; - profile!("density grid loop (avx)"); - let mc_grid = subdomain_mc_grid; - let mc_points = mc_grid.points_per_dim(); - let dim_y = mc_points[1] as usize; - let dim_z = mc_points[2] as usize; - - // Preload constants used inside the hot loops - let k_offsets_i32 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let 
global_min = global_mc_grid.aabb().min(); - let min_z_v = _mm256_set1_ps(global_min[2]); - let cube_size = global_mc_grid.cell_size(); - let cube_size_ps = _mm256_set1_ps(cube_size); - let support = kernel.compact_support_radius(); - let support_sq_v = _mm256_set1_ps(support * support); + let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); + let lower = lower.map(|i| i as usize); + let upper = upper.map(|i| i as usize); - for (p_i, rho_i) in subdomain_particles - .iter() - .copied() - .zip(subdomain_particle_densities.iter().copied()) - { - let v_i = particle_rest_mass / rho_i; + let remainder = (upper[2] - lower[2]) % LANES; + let upper_k_aligned = upper[2] - remainder; - let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); - let lower = lower.map(|i| i as usize); - let upper = upper.map(|i| i as usize); - - let remainder = (upper[2] - lower[2]) % LANES; - let upper_k_aligned = upper[2] - remainder; - - let subdomain_ijk_i32 = subdomain_ijk.map(|i| i as i32); - let mc_cells = mc_grid.cells_per_dim().map(|i| i as i32); - - // Broadcast particle coordinates once per particle - let particle_xs = _mm256_set1_ps(p_i[0]); - let particle_ys = _mm256_set1_ps(p_i[1]); - let particle_zs = _mm256_set1_ps(p_i[2]); - let v_i_ps = _mm256_set1_ps(v_i); - - // Function to evaluate kernel contribution of current particle to 8 grid points at once - let evaluate_contribution = |k: usize, grid_xs: __m256, grid_ys: __m256| -> __m256 { - // Compute global k for 8 consecutive points using i32 lanes and convert to f32 - let base_k = subdomain_ijk_i32[2] * mc_cells[2] + k as i32; - let global_k_i32 = _mm256_add_epi32(_mm256_set1_epi32(base_k), k_offsets_i32); - let global_k_f32 = _mm256_cvtepi32_ps(global_k_i32); - let grid_zs = _mm256_fmadd_ps(global_k_f32, cube_size_ps, min_z_v); - - // Deltas - let dxs = _mm256_sub_ps(particle_xs, grid_xs); - let dys = _mm256_sub_ps(particle_ys, grid_ys); - let dzs = _mm256_sub_ps(particle_zs, grid_zs); - - // Distance squared - let dist_sq = { - let t = _mm256_fmadd_ps(dxs, dxs, _mm256_mul_ps(dys, dys)); - _mm256_fmadd_ps(dzs, dzs, t) - }; + let subdomain_ijk_i32 = subdomain_ijk.map(|i| i as i32); + let mc_cells = mc_grid.cells_per_dim().map(|i| i as i32); - // Cheap mask to skip work if all lanes are outside support - let inside_mask = _mm256_cmp_ps::<_CMP_LT_OQ>(dist_sq, support_sq_v); - if _mm256_movemask_ps(inside_mask) == 0 { - return _mm256_set1_ps(0.0); - } + // Broadcast particle coordinates once per particle + let particle_xs = _mm256_set1_ps(p_i[0]); + let particle_ys = _mm256_set1_ps(p_i[1]); + let particle_zs = _mm256_set1_ps(p_i[2]); + let v_i_ps = _mm256_set1_ps(v_i); - // Compute weights, then mask out lanes outside support - let dist = _mm256_sqrt_ps(dist_sq); - let w = kernel_avx.evaluate(dist); - _mm256_and_ps(w, inside_mask) - }; + // Function to evaluate kernel contribution of current particle to 8 grid points at once + let evaluate_contribution = |k: usize, grid_xs: __m256, grid_ys: __m256| -> __m256 { + // Compute global k for 8 consecutive points using i32 lanes and convert to f32 + let base_k = subdomain_ijk_i32[2] * mc_cells[2] + k as i32; + let global_k_i32 = _mm256_add_epi32(_mm256_set1_epi32(base_k), k_offsets_i32); + let global_k_f32 = _mm256_cvtepi32_ps(global_k_i32); + let grid_zs = _mm256_fmadd_ps(global_k_f32, cube_size_ps, min_z_v); - // Loop over grid points in support region of the particle - for i in lower[0]..upper[0] { - for j in lower[1]..upper[1] { - let flat_index_base = (i * dim_y + j) * 
dim_z; + // Deltas + let dxs = _mm256_sub_ps(particle_xs, grid_xs); + let dys = _mm256_sub_ps(particle_ys, grid_ys); + let dzs = _mm256_sub_ps(particle_zs, grid_zs); - let global_i = subdomain_ijk_i32[0] * mc_cells[0] + i as i32; - let global_j = subdomain_ijk_i32[1] * mc_cells[1] + j as i32; + // Distance squared + let dist_sq = { + let t = _mm256_fmadd_ps(dxs, dxs, _mm256_mul_ps(dys, dys)); + _mm256_fmadd_ps(dzs, dzs, t) + }; - let grid_x = global_i as f32 * cube_size + global_min[0]; - let grid_y = global_j as f32 * cube_size + global_min[1]; - let grid_xs = _mm256_set1_ps(grid_x); - let grid_ys = _mm256_set1_ps(grid_y); + // Cheap mask to skip work if all lanes are outside support + let inside_mask = _mm256_cmp_ps::<_CMP_LT_OQ>(dist_sq, support_sq_v); + if _mm256_movemask_ps(inside_mask) == 0 { + return _mm256_set1_ps(0.0); + } - for k in (lower[2]..upper_k_aligned).step_by(LANES) { - let flat_point_idx = flat_index_base + k; - let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + // Compute weights, then mask out lanes outside support + let dist = _mm256_sqrt_ps(dist_sq); + let w = unsafe { kernel_avx.evaluate(dist) }; + _mm256_and_ps(w, inside_mask) + }; - unsafe { - let prev_val = _mm256_loadu_ps(levelset_grid.as_ptr().add(flat_point_idx)); - let val = _mm256_fmadd_ps(w_ij, v_i_ps, prev_val); - _mm256_storeu_ps(levelset_grid.as_mut_ptr().add(flat_point_idx), val); + // Loop over grid points in support region of the particle + for i in lower[0]..upper[0] { + for j in lower[1]..upper[1] { + let flat_index_base = (i * dim_y + j) * dim_z; + + let global_i = subdomain_ijk_i32[0] * mc_cells[0] + i as i32; + let global_j = subdomain_ijk_i32[1] * mc_cells[1] + j as i32; + + let grid_x = global_i as f32 * cube_size + global_min[0]; + let grid_y = global_j as f32 * cube_size + global_min[1]; + let grid_xs = _mm256_set1_ps(grid_x); + let grid_ys = _mm256_set1_ps(grid_y); + + for k in (lower[2]..upper_k_aligned).step_by(LANES) { + let flat_point_idx = flat_index_base + k; + let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + + unsafe { + let prev_val = + _mm256_loadu_ps(levelset_grid.as_ptr().add(flat_point_idx)); + let val = _mm256_fmadd_ps(w_ij, v_i_ps, prev_val); + _mm256_storeu_ps(levelset_grid.as_mut_ptr().add(flat_point_idx), val); + } } - } - // Handle remainder - if remainder > 0 { - let flat_point_idx = flat_index_base + upper_k_aligned; - let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); - unsafe { - let val = _mm256_mul_ps(w_ij, v_i_ps); - let mut tmp: [f32; LANES] = [0.0; LANES]; - _mm256_storeu_ps(tmp.as_mut_ptr(), val); - for rr in 0..remainder { - levelset_grid[flat_point_idx + rr] += tmp[rr]; + // Handle remainder + if remainder > 0 { + let flat_point_idx = flat_index_base + upper_k_aligned; + let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); + unsafe { + let val = _mm256_mul_ps(w_ij, v_i_ps); + let mut tmp: [f32; LANES] = [0.0; LANES]; + _mm256_storeu_ps(tmp.as_mut_ptr(), val); + for rr in 0..remainder { + levelset_grid[flat_point_idx + rr] += tmp[rr]; + } } } } } } } + + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Poly6 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + 
global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Spiky => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + } } pub fn density_grid_loop_sparse>( @@ -1212,7 +1419,7 @@ pub fn density_grid_loop_sparse>( } } -pub(crate) fn reconstruct_subdomains( +pub(crate) fn reconstruct_subdomains + Sync>( parameters: &ParametersSubdomainGrid, global_particles: &[Vector3], global_particle_densities: &[R], @@ -1228,7 +1435,7 @@ pub(crate) fn reconstruct_subdomains( let cube_radius = I::from((parameters.compact_support_radius / parameters.cube_size).ceil()) .expect("kernel radius in cubes has to fit in index type"); // Kernel - let kernel = CubicSplineKernel::new(parameters.compact_support_radius); + let kernel = K::new(parameters.compact_support_radius); //let kernel = DiscreteSquaredDistanceCubicKernel::new::(1000, parameters.compact_support_radius); let mc_total_cells = parameters.subdomain_cubes.cubed(); @@ -1436,7 +1643,8 @@ pub(crate) fn reconstruct_subdomains( cube_radius.to_i64().unwrap(), squared_support_with_margin.to_f32().unwrap(), parameters.particle_rest_mass.to_f32().unwrap(), - &CubicSplineKernel::new(parameters.compact_support_radius.to_f32().unwrap()), + parameters.compact_support_radius.to_f32().unwrap(), + ¶meters.kernel_type, ); } else { density_grid_loop_scalar( diff --git a/splashsurf_lib/src/density_map.rs b/splashsurf_lib/src/density_map.rs index 0043439..da3c3ec 100644 --- a/splashsurf_lib/src/density_map.rs +++ b/splashsurf_lib/src/density_map.rs @@ -24,7 +24,7 @@ //! indices, even if the density map is only generated for a smaller subdomain. 
use crate::aabb::Aabb3d; -use crate::kernel::{CubicSplineKernel, SymmetricKernel3d}; +use crate::kernel::SymmetricKernel3d; use crate::mesh::{HexMesh3d, MeshWithData, OwnedMeshAttribute}; use crate::neighborhood_search::NeighborhoodList; use crate::uniform_grid::UniformGrid; @@ -62,7 +62,7 @@ pub enum DensityMapError { /// Computes the individual densities of particles using a standard SPH sum #[inline(never)] -pub fn compute_particle_densities( +pub fn compute_particle_densities + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: &[Vec], compact_support_radius: R, @@ -71,7 +71,7 @@ pub fn compute_particle_densities( ) -> Vec { let mut densities = Vec::new(); if enable_multi_threading { - parallel_compute_particle_densities::( + parallel_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -79,7 +79,7 @@ pub fn compute_particle_densities( &mut densities, ) } else { - sequential_compute_particle_densities::( + sequential_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -92,7 +92,7 @@ pub fn compute_particle_densities( /// Computes the individual densities of particles inplace using a standard SPH sum #[inline(never)] -pub fn compute_particle_densities_inplace( +pub fn compute_particle_densities_inplace + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: &[Vec], compact_support_radius: R, @@ -101,7 +101,7 @@ pub fn compute_particle_densities_inplace( densities: &mut Vec, ) { if enable_multi_threading { - parallel_compute_particle_densities::( + parallel_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -109,7 +109,7 @@ pub fn compute_particle_densities_inplace( densities, ) } else { - sequential_compute_particle_densities::( + sequential_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -127,7 +127,12 @@ fn init_density_storage(densities: &mut Vec, new_len: usize) { /// Computes the individual densities of particles using a standard SPH sum, sequential implementation #[inline(never)] -pub fn sequential_compute_particle_densities( +pub fn sequential_compute_particle_densities< + I: Index, + R: Real, + Nl: NeighborhoodList + ?Sized, + K: SymmetricKernel3d, +>( particle_positions: &[Vector3], particle_neighbor_lists: &Nl, compact_support_radius: R, @@ -136,7 +141,7 @@ pub fn sequential_compute_particle_densities( + sequential_compute_particle_densities_filtered::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -151,6 +156,7 @@ pub fn sequential_compute_particle_densities_filtered< I: Index, R: Real, Nl: NeighborhoodList + ?Sized, + K: SymmetricKernel3d, >( particle_positions: &[Vector3], particle_neighbor_lists: &Nl, @@ -164,7 +170,7 @@ pub fn sequential_compute_particle_densities_filtered< init_density_storage(particle_densities, particle_positions.len()); // Pre-compute the kernel which can be queried using squared distances - let kernel = CubicSplineKernel::new(compact_support_radius); + let kernel = K::new(compact_support_radius); for (i, particle_i_position) in particle_positions .iter() @@ -187,7 +193,7 @@ pub fn sequential_compute_particle_densities_filtered< /// Computes the individual densities of particles using a standard SPH sum, multi-threaded implementation #[inline(never)] -pub fn parallel_compute_particle_densities( +pub fn parallel_compute_particle_densities + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: 
&[Vec], compact_support_radius: R, @@ -199,7 +205,7 @@ pub fn parallel_compute_particle_densities( init_density_storage(particle_densities, particle_positions.len()); // Pre-compute the kernel which can be queried using squared distances - let kernel = CubicSplineKernel::new(compact_support_radius); + let kernel = K::new(compact_support_radius); particle_positions .par_iter() @@ -313,7 +319,7 @@ impl<'a, I: Index, R: Real> DensityMap<'a, I, R> { /// Computes a sparse density map for the fluid based on the specified background grid #[inline(never)] -pub fn generate_sparse_density_map( +pub fn generate_sparse_density_map + Sync>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -334,7 +340,7 @@ pub fn generate_sparse_density_map( ); if allow_threading { - *density_map = parallel_generate_sparse_density_map( + *density_map = parallel_generate_sparse_density_map::<_, _, K>( grid, particle_positions, particle_densities, @@ -344,7 +350,7 @@ pub fn generate_sparse_density_map( cube_size, )? } else { - *density_map = sequential_generate_sparse_density_map( + *density_map = sequential_generate_sparse_density_map::<_, _, K>( grid, particle_positions, particle_densities, @@ -365,7 +371,7 @@ pub fn generate_sparse_density_map( /// Computes a sparse density map for the fluid based on the specified background grid, sequential implementation #[inline(never)] -pub fn sequential_generate_sparse_density_map( +pub fn sequential_generate_sparse_density_map>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -378,12 +384,13 @@ pub fn sequential_generate_sparse_density_map( let mut sparse_densities = new_map(); - let density_map_generator = SparseDensityMapGenerator::try_new( - grid, - compact_support_radius, - cube_size, - particle_rest_mass, - )?; + let density_map_generator: SparseDensityMapGenerator = + SparseDensityMapGenerator::try_new( + grid, + compact_support_radius, + cube_size, + particle_rest_mass, + )?; let process_particle = |particle_data: (&Vector3, R)| { let (particle, particle_density) = particle_data; @@ -412,7 +419,7 @@ pub fn sequential_generate_sparse_density_map( /// Computes a sparse density map for the fluid based on the specified background grid, multi-threaded implementation #[inline(never)] -pub fn parallel_generate_sparse_density_map( +pub fn parallel_generate_sparse_density_map + Sync>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -428,12 +435,13 @@ pub fn parallel_generate_sparse_density_map( // Generate thread local density maps { - let density_map_generator = SparseDensityMapGenerator::try_new( - grid, - compact_support_radius, - cube_size, - particle_rest_mass, - )?; + let density_map_generator: SparseDensityMapGenerator = + SparseDensityMapGenerator::try_new( + grid, + compact_support_radius, + cube_size, + particle_rest_mass, + )?; profile!("generate thread local maps"); @@ -530,12 +538,12 @@ pub fn parallel_generate_sparse_density_map( } /// Internal helper type used to evaluate the density contribution for a particle -struct SparseDensityMapGenerator { +struct SparseDensityMapGenerator> { particle_rest_mass: R, half_supported_cells: I, supported_points: I, kernel_evaluation_radius_sq: R, - kernel: CubicSplineKernel, + kernel: K, allowed_domain: Aabb3d, } @@ -580,7 +588,7 @@ pub(crate) fn compute_kernel_evaluation_radius( } // TODO: Maybe remove allowed domain check? And require this is done before, using the active_particles array? 
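For context on the kernel parameter K now threaded through all of these density functions: the "standard SPH sum" named in the doc comments is the usual mass-weighted kernel sum. With the constant rest mass $m$ (the particle_rest_mass argument), compact support radius $h$ (compact_support_radius) and the neighborhood $\mathcal{N}(i)$ taken from the neighbor lists, it reads (conventionally including the particle's own contribution):

$$\rho_i \;=\; m\,W(0;\,h) \;+\; \sum_{j \in \mathcal{N}(i)} m\,W\!\left(\lVert \mathbf{x}_i - \mathbf{x}_j \rVert;\, h\right),$$

where $W$ was previously hard-wired to the cubic spline and is now any implementor of SymmetricKernel3d selected through the new KernelType.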
-impl<I: Index, R: Real> SparseDensityMapGenerator<I, R> {
+impl<I: Index, R: Real, K: SymmetricKernel3d<R>> SparseDensityMapGenerator<I, R, K> {
     fn try_new(
         grid: &UniformGrid<I, R>,
         compact_support_radius: R,
@@ -595,7 +603,7 @@ impl<I: Index, R: Real> SparseDensityMapGenerator<I, R> {
         // Pre-compute the kernel which can be queried using squared distances
         let kernel_evaluation_radius_sq = kernel_evaluation_radius * kernel_evaluation_radius;
-        let kernel = CubicSplineKernel::new(compact_support_radius);
+        let kernel = K::new(compact_support_radius);

         // Shrink the allowed domain for particles by the kernel evaluation radius. This ensures that all cells/points
         // that are affected by a particle are actually part of the domain/grid, so it does not have to be checked in the loops below.
diff --git a/splashsurf_lib/src/kernel.rs b/splashsurf_lib/src/kernel.rs
index 3b3c84a..21aa9dc 100644
--- a/splashsurf_lib/src/kernel.rs
+++ b/splashsurf_lib/src/kernel.rs
@@ -1,8 +1,17 @@
 //! SPH kernel function implementations
 //!
 //! Currently, the following SIMD implementations are provided:
-//! - `CubicSplineKernelAvxF32`: Only available on `x86` and `x86_64` targets, requires AVX2 and FMA support
-//! - `CubicSplineKernelNeonF32`: Only available on `aarch64` targets, requires NEON support
+//! - `CubicSplineKernelAvxF32`
+//! - `CubicSplineKernelNeonF32`
+//! - `Poly6KernelAvxF32`
+//! - `Poly6KernelNeonF32`
+//! - `SpikyKernelAvxF32`
+//! - `SpikyKernelNeonF32`
+//! - `WendlandQuinticC2KernelAvxF32`
+//! - `WendlandQuinticC2KernelNeonF32`
+//!
+//! The AVX implementations are only available on `x86` and `x86_64` targets and require AVX2 and FMA support.
+//! Similarly, the NEON implementations are only available on `aarch64` targets and require NEON support.
 //!
 //! Note that documentation of the SIMD kernels is only available on the respective target architectures.
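Since kernel construction moves onto the traits introduced in the next hunk, the SIMD kernels are now created through `AvxKernel`/`NeonKernel`, and the AVX2/FMA requirement from the module documentation still has to be checked at runtime on generically compiled binaries. A minimal sketch of that guard pattern, assuming the types above are re-exported under `splashsurf_lib::kernel`; the helper `cubic_kernel_batch` itself is hypothetical and not part of this patch:

```rust
// Sketch only: runtime dispatch between the AVX and scalar cubic spline kernels.
#[cfg(target_arch = "x86_64")]
fn cubic_kernel_batch(h: f32, radii: [f32; 8]) -> [f32; 8] {
    use splashsurf_lib::kernel::{
        AvxKernel, CubicSplineKernel, CubicSplineKernelAvxF32, SymmetricKernel3d,
    };

    let mut out = [0.0f32; 8];
    // AVX2 + FMA must be confirmed at runtime unless the binary is built
    // with these target features enabled globally
    if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
        use core::arch::x86_64::{_mm256_loadu_ps, _mm256_storeu_ps};
        let kernel = CubicSplineKernelAvxF32::new(h);
        // Safety: AVX2/FMA support was checked above
        unsafe {
            let w = kernel.evaluate(_mm256_loadu_ps(radii.as_ptr()));
            _mm256_storeu_ps(out.as_mut_ptr(), w);
        }
    } else {
        // Scalar fallback through the extended SymmetricKernel3d trait
        let kernel = CubicSplineKernel::new(h);
        for (w, r) in out.iter_mut().zip(radii) {
            *w = kernel.evaluate(r);
        }
    }
    out
}
```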
@@ -20,6 +29,14 @@ use core::arch::x86_64::__m256; // TODO: Add reference for the kernel function, document formula +#[derive(Clone, Debug)] +pub enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + /// Utility functions for computing the volume of fluid particles pub struct Volume; @@ -37,6 +54,7 @@ impl Volume { /// Trait for symmetric kernel functions in three dimensions pub trait SymmetricKernel3d { + fn new(compact_support_radius: R) -> Self; /// Returns the compact support radius of the kernel fn compact_support_radius(&self) -> R; /// Evaluates the kernel at the radial distance `r` relative to the origin @@ -47,6 +65,22 @@ pub trait SymmetricKernel3d { fn evaluate_gradient_norm(&self, r: R) -> R; } +#[cfg(target_arch = "aarch64")] +pub unsafe trait NeonKernel { + fn new(compact_support_radius: f32) -> Self; + + /// Evaluates the kernel at the radial distance `r` relative to the origin + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t; +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub unsafe trait AvxKernel { + fn new(compact_support_radius: f32) -> Self; + + /// Evaluates the kernel at the radial distance `r` relative to the origin + unsafe fn evaluate(&self, r: __m256) -> __m256; +} + /// The commonly used cubic spline kernel pub struct CubicSplineKernel { /// Compact support radius of the kernel @@ -56,18 +90,6 @@ pub struct CubicSplineKernel { } impl CubicSplineKernel { - /// Initializes a cubic spline kernel with the given compact support radius - #[replace_float_literals(R::from_float(literal))] - pub fn new(compact_support_radius: R) -> Self { - let h = compact_support_radius; - let sigma = 8.0 / (h * h * h); - - Self { - compact_support_radius, - normalization: sigma, - } - } - /// The cubic spline function used by the cubic spline kernel #[replace_float_literals(R::from_float(literal))] fn cubic_function(q: R) -> R { @@ -96,6 +118,18 @@ impl CubicSplineKernel { } impl SymmetricKernel3d for CubicSplineKernel { + /// Initializes a cubic spline kernel with the given compact support radius + #[replace_float_literals(R::from_float(literal))] + fn new(compact_support_radius: R) -> Self { + let h = compact_support_radius; + let sigma = 8.0 / (h * h * h); + + Self { + compact_support_radius, + normalization: sigma, + } + } + fn compact_support_radius(&self) -> R { self.compact_support_radius } @@ -140,45 +174,6 @@ impl SymmetricKernel3d for CubicSplineKernel { } } -#[test] -fn test_cubic_kernel_r_compact_support() { - let hs = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let kernel = CubicSplineKernel::new(h); - assert_eq!(kernel.evaluate(h), 0.0); - assert_eq!(kernel.evaluate(2.0 * h), 0.0); - assert_eq!(kernel.evaluate(10.0 * h), 0.0); - } -} - -#[test] -fn test_cubic_kernel_r_integral() { - let hs = [0.025, 0.1, 2.0]; - let n = 10; - - for &h in hs.iter() { - let kernel = CubicSplineKernel::new(h); - - let dr = h / (n as f64); - let dvol = dr * dr * dr; - - let mut integral = 0.0; - for i in -n..n { - for j in -n..n { - for k in -n..n { - let r_in = Vector3::new(i as f64, j as f64, k as f64) * dr; - let r_out = Vector3::new((i + 1) as f64, (j + 1) as f64, (k + 1) as f64) * dr; - let r = ((r_in + r_out) * 0.5).norm(); - - integral += dvol * kernel.evaluate(r); - } - } - } - - assert!((integral - 1.0).abs() <= 1e-5); - } -} - /// Vectorized implementation of the cubic spline kernel using NEON instructions. Only available on `aarch64` targets. 
#[cfg(target_arch = "aarch64")] pub struct CubicSplineKernelNeonF32 { @@ -187,9 +182,9 @@ pub struct CubicSplineKernelNeonF32 { } #[cfg(target_arch = "aarch64")] -impl CubicSplineKernelNeonF32 { +unsafe impl NeonKernel for CubicSplineKernelNeonF32 { /// Initializes a cubic spline kernel with the given compact support radius - pub fn new(compact_support_radius: f32) -> Self { + fn new(compact_support_radius: f32) -> Self { let r = compact_support_radius; let compact_support_inv = 1.0 / r; let rrr = r * r * r; @@ -202,7 +197,7 @@ impl CubicSplineKernelNeonF32 { /// Evaluates the cubic spline kernel at the specified radial distances #[target_feature(enable = "neon")] - pub fn evaluate(&self, r: float32x4_t) -> float32x4_t { + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { use core::arch::aarch64::*; let one = vdupq_n_f32(1.0); @@ -235,88 +230,6 @@ impl CubicSplineKernelNeonF32 { } } -#[test] -#[cfg_attr( - not(all(target_arch = "aarch64", target_feature = "neon")), - ignore = "Skipped on non-aarch64 targets" -)] -fn test_cubic_spline_kernel_neon() { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - use core::arch::aarch64::*; - - // Test a few representative compact support radii - let hs: [f32; 3] = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let scalar = CubicSplineKernel::new(h); - let neon = CubicSplineKernelNeonF32::new(h); - - // Sample radii from 0 to 2h (beyond support should be 0) - let n: usize = 1024; - let mut r0: f32 = 0.0; - let dr: f32 = (2.0 * h) / (n as f32); - - for _chunk in 0..(n / 4) { - // Prepare 4 lanes of radii - let rs = [r0, r0 + dr, r0 + 2.0 * dr, r0 + 3.0 * dr]; - let r_vec = unsafe { vld1q_f32(rs.as_ptr()) }; - - // Evaluate NEON and store back to array - let w_vec = unsafe { neon.evaluate(r_vec) }; - let mut w_neon = [0.0f32; 4]; - unsafe { vst1q_f32(w_neon.as_mut_ptr(), w_vec) }; - - // Compare against scalar lane-wise - for lane in 0..4 { - let r_lane = rs[lane]; - let w_scalar = scalar.evaluate(r_lane); - let diff = (w_neon[lane] - w_scalar).abs(); - - // Absolute tolerance with mild relative component to be robust across scales - let tol = 5e-6_f32.max(2e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "NEON kernel mismatch (h={}, r={}, lane={}): neon={}, scalar={}, diff={}, tol={}", - h, - r_lane, - lane, - w_neon[lane], - w_scalar, - diff, - tol - ); - } - - r0 += 4.0 * dr; - } - - // Also check a couple of out-of-support points explicitly - for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { - let w_scalar = scalar.evaluate(r); - let w_neon = { - let v = unsafe { vld1q_f32([r, r, r, r].as_ptr()) }; - let w = unsafe { neon.evaluate(v) }; - let mut tmp = [0.0f32; 4]; - unsafe { vst1q_f32(tmp.as_mut_ptr(), w) }; - tmp[0] - }; - let diff = (w_neon - w_scalar).abs(); - let tol = 5e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "NEON kernel mismatch outside support (h={}, r={}): neon={}, scalar={}, diff={}, tol={}", - h, - r, - w_neon, - w_scalar, - diff, - tol - ); - } - } - } -} - /// Vectorized implementation of the cubic spline kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. 
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub struct CubicSplineKernelAvxF32 { @@ -325,9 +238,9 @@ pub struct CubicSplineKernelAvxF32 { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -impl CubicSplineKernelAvxF32 { +unsafe impl AvxKernel for CubicSplineKernelAvxF32 { /// Initializes a cubic spline kernel with the given compact support radius - pub fn new(compact_support_radius: f32) -> Self { + fn new(compact_support_radius: f32) -> Self { let r = compact_support_radius; let compact_support_inv = 1.0 / r; let rrr = r * r * r; @@ -340,7 +253,7 @@ impl CubicSplineKernelAvxF32 { /// Evaluates the cubic spline kernel at the specified radial distances #[target_feature(enable = "avx2,fma")] - pub fn evaluate(&self, r: __m256) -> __m256 { + unsafe fn evaluate(&self, r: __m256) -> __m256 { #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -378,108 +291,6 @@ impl CubicSplineKernelAvxF32 { } } -#[test] -#[cfg_attr( - not(all( - any(target_arch = "x86_64", target_arch = "x86"), - target_feature = "avx2", - target_feature = "fma" - )), - ignore = "Skipped on non-x86 targets" -)] -fn test_cubic_spline_kernel_avx() { - #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - target_feature = "avx2", - target_feature = "fma" - ))] - { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // Test a few representative compact support radii - let hs: [f32; 3] = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let scalar = CubicSplineKernel::new(h); - let avx = CubicSplineKernelAvxF32::new(h); - - // Sample radii from 0 to 2h (beyond support should be 0) - let n: usize = 1024; - let mut r0: f32 = 0.0; - let dr: f32 = (2.0 * h) / (n as f32); - - for _chunk in 0..(n / 8) { - // Prepare 8 lanes of radii - let rs = [ - r0, - r0 + dr, - r0 + 2.0 * dr, - r0 + 3.0 * dr, - r0 + 4.0 * dr, - r0 + 5.0 * dr, - r0 + 6.0 * dr, - r0 + 7.0 * dr, - ]; - - // Evaluate AVX and store back to array - let r_vec = unsafe { _mm256_loadu_ps(rs.as_ptr()) }; - let w_vec = unsafe { avx.evaluate(r_vec) }; - let mut w_avx = [0.0f32; 8]; - unsafe { _mm256_storeu_ps(w_avx.as_mut_ptr(), w_vec) }; - - // Compare against scalar lane-wise - for lane in 0..8 { - let r_lane = rs[lane]; - let w_scalar = scalar.evaluate(r_lane); - let diff = (w_avx[lane] - w_scalar).abs(); - - // Absolute tolerance with mild relative component to be robust across scales - let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "AVX kernel mismatch (h={}, r={}, lane={}): avx={}, scalar={}, diff={}, tol={}", - h, - r_lane, - lane, - w_avx[lane], - w_scalar, - diff, - tol - ); - } - - r0 += 8.0 * dr; - } - - // Also check a couple of out-of-support points explicitly - for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { - let w_scalar = scalar.evaluate(r); - let w_avx = { - let v = unsafe { _mm256_set1_ps(r) }; - let w = unsafe { avx.evaluate(v) }; - let mut tmp = [0.0f32; 8]; - unsafe { _mm256_storeu_ps(tmp.as_mut_ptr(), w) }; - tmp[0] - }; - let diff = (w_avx - w_scalar).abs(); - let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "AVX kernel mismatch outside support (h={}, r={}): avx={}, scalar={}, diff={}, tol={}", - h, - r, - w_avx, - w_scalar, - diff, - tol - ); - } - } - } -} - /// Accelerator for efficient evaluation of a precomputed cubic kernel /// /// This structure is used to pre-compute a discrete representation of the cubic kernel function. 
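For context, the accelerator documented above is exercised by the relocated `discrete_cubic_kernel` test further down in this patch. A standalone usage sketch (not part of the diff), using only the calls visible in that test:

```rust
use splashsurf_lib::kernel::DiscreteSquaredDistanceCubicKernel;

fn main() {
    // Compact support radius of the kernel
    let h = 0.025_f64;
    // Pre-compute 10000 discrete samples of the cubic spline kernel
    let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::<f64>(10000, h);

    // The accelerator is queried with *squared* distances, saving a sqrt in hot loops
    let r = 0.5 * h;
    let w = discrete_kernel.evaluate(r * r);
    println!("W({r}) ~= {w}");
}
```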
@@ -544,36 +355,886 @@ impl DiscreteSquaredDistanceCubicKernel {
     }
 }

-#[test]
-fn test_discrete_kernel() {
-    let n = 10000;
-    let h = 0.025;
+/// Poly6 kernel
+pub struct Poly6Kernel<R: Real> {
+    /// Compact support radius of the kernel
+    compact_support_radius: R,
+    /// Kernel normalization factor (sigma)
+    normalization: R,
+}

-    let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::<f64>(n, h);
-    let kernel = CubicSplineKernel::new(h);
+impl<R: Real> SymmetricKernel3d<R> for Poly6Kernel<R> {
+    /// Initializes a poly6 kernel with the given compact support radius
+    #[replace_float_literals(R::from_float(literal))]
+    fn new(compact_support_radius: R) -> Self {
+        let h = compact_support_radius;
+        let sigma = 315.0 / (64.0 * R::pi() * h.powi(9));

-    // Test the pre-computed values using a linear stepping
-    let dr = h / (n as f64);
-    for i in 0..n {
-        let r = (i as f64) * dr;
-        let rr = r * r;
+        Self {
+            compact_support_radius,
+            normalization: sigma,
+        }
+    }

-        let discrete = discrete_kernel.evaluate(rr);
-        let continuous = kernel.evaluate(r);
+    fn compact_support_radius(&self) -> R {
+        self.compact_support_radius
+    }

-        let diff = (discrete - continuous).abs();
-        let rel_diff = diff / continuous;
-        if rel_diff > 5e-2 && diff > 1e-1 {
-            eprintln!(
-                "at r={}, r/h={}, discrete: {}, continuous: {}, diff: {}, rel_diff: {}",
-                r,
-                r / h,
-                discrete,
-                continuous,
-                diff,
-                rel_diff
-            );
-            assert!(false);
+    /// Evaluates the poly6 kernel at the radial distance `r`
+    fn evaluate(&self, r: R) -> R {
+        let r2 = r * r;
+        let h2 = self.compact_support_radius.powi(2);
+        if r2 <= h2 {
+            self.normalization * (h2 - r2).powi(3)
+        } else {
+            R::zero()
         }
     }
+
+    /// Evaluates the gradient of the poly6 kernel at the position `x`
+    fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R> {
+        // Radial distance is norm of position
+        let r2 = x.norm_squared();
+        let h2 = self.compact_support_radius.powi(2);
+
+        if r2 <= h2 {
+            let r = r2.try_sqrt().unwrap();
+            // normalize x
+            let x = x.unscale(r);
+
+            // dW/dr = -6 * sigma * r * (h^2 - r^2)^2
+            x.scale(R::from_float(-6.0) * self.normalization * r * (h2 - r2).powi(2))
+        } else {
+            x.scale(R::zero())
+        }
+    }
+
+    /// Evaluates the norm of the gradient of the poly6 kernel at the radial distance `r`
+    fn evaluate_gradient_norm(&self, r: R) -> R {
+        let r2 = r * r;
+        let h2 = self.compact_support_radius.powi(2);
+
+        if r2 <= h2 {
+            R::from_float(-6.0) * self.normalization * r * (h2 - r2).powi(2)
+        } else {
+            R::zero()
+        }
+    }
+}
+
+/// Vectorized implementation of the poly6 kernel using NEON instructions. Only available on `aarch64` targets.
+#[cfg(target_arch = "aarch64")] +pub struct Poly6KernelNeonF32 { + squared_compact_support: f32, + sigma: f32, +} + +#[cfg(target_arch = "aarch64")] +unsafe impl NeonKernel for Poly6KernelNeonF32 { + /// Initializes a poly6 kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r9 = r.powi(9); + let sigma = 315.0 / (64.0 * std::f32::consts::PI * r9); + Self { + squared_compact_support: r * r, + sigma, + } + } + + /// Evaluates the poly6 kernel at the specified radial distances + #[target_feature(enable = "neon")] + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { + use core::arch::aarch64::*; + + let zero = vdupq_n_f32(0.0); + + let r2 = vmulq_f32(r, r); + let h2 = vdupq_n_f32(self.squared_compact_support); + + // (h2 - r2)^3 + let relevance = vsubq_f32(h2, r2); + let relevance2 = vmulq_f32(relevance, relevance); + let relevance3 = vmulq_f32(relevance2, relevance); + + // 315.0/(64.0*PI*h^9) + let normalization = vdupq_n_f32(self.sigma); + + let res = vmulq_f32(normalization, relevance3); + + // mask: r² <= h² + let mask = vcleq_f32(r2, h2); + + // blend: if mask true → res, else → zero + vbslq_f32(mask, res, zero) + } +} + +/// Vectorized implementation of the poly6 kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub struct Poly6KernelAvxF32 { + squared_compact_support: f32, + sigma: f32, +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +unsafe impl AvxKernel for Poly6KernelAvxF32 { + /// Initializes a poly6 kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r9 = r.powi(9); + let sigma = 315.0 / (64.0 * std::f32::consts::PI * r9); + Self { + squared_compact_support: r * r, + sigma, + } + } + + /// Evaluates the poly6 kernel at the specified radial distances + #[target_feature(enable = "avx2,fma")] + unsafe fn evaluate(&self, r: __m256) -> __m256 { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let zero = _mm256_set1_ps(0.0); + + let r2 = _mm256_mul_ps(r, r); + let h2 = _mm256_set1_ps(self.squared_compact_support); + + // (h2-r2)^3 + let relevance = _mm256_sub_ps(h2, r2); + let relevance2 = _mm256_mul_ps(relevance, relevance); + let relevance3 = _mm256_mul_ps(relevance2, relevance); + + // 315.0/(64.0*PI*h^9) + let normalization = _mm256_set1_ps(self.sigma); + + let res = _mm256_mul_ps(normalization, relevance3); + + // Only use if in compact support radius + let in_radius = _mm256_cmp_ps::<_CMP_LE_OQ>(r2, h2); + _mm256_blendv_ps(zero, res, in_radius) + } +} + +/// Spiky kernel +pub struct SpikyKernel { + /// Compact support radius of the kernel + compact_support_radius: R, + /// Kernel normalization factor (sigma) + normalization: R, +} + +impl SymmetricKernel3d for SpikyKernel { + /// Initializes a spiky kernel with the given compact support radius + #[replace_float_literals(R::from_float(literal))] + fn new(compact_support_radius: R) -> Self { + let h = compact_support_radius; + let sigma = 15.0 / (R::pi() * h.powi(6)); + + Self { + compact_support_radius, + normalization: sigma, + } + } + + fn compact_support_radius(&self) -> R { + self.compact_support_radius + } + + /// Evaluates the spiky kernel at the radial distance `r` + fn evaluate(&self, r: R) -> R { + let r2 = r * r; + let h2 = self.compact_support_radius.powi(2); + if r2 <= h2 
{ + self.normalization * (self.compact_support_radius - r).powi(3) + } else { + R::zero() + } + } + + /// Evaluates the gradient of the spiky kernel at the position `x` + fn evaluate_gradient(&self, x: Vector3) -> Vector3 { + // Radial distance is norm of position + let r2 = x.norm_squared(); + let r = r2.try_sqrt().unwrap(); + let h2 = self.compact_support_radius.powi(2); + + if r2 <= h2 { + // normalize x + let x = x.unscale(r); + + x.scale( + R::from_float(-3.0) + * self.normalization + * (self.compact_support_radius - r).powi(2), + ) + } else { + x.scale(R::zero()) + } + } + + /// Evaluates the norm of the gradient of the spiky kernel at the radial distance `r` + fn evaluate_gradient_norm(&self, r: R) -> R { + let r2 = r * r; + let h2 = self.compact_support_radius.powi(2); + + if r2 <= h2 { + R::from_float(-3.0) * self.normalization * (self.compact_support_radius - r).powi(2) + } else { + R::zero() + } + } +} + +/// Vectorized implementation of the spiky kernel using NEON instructions. Only available on `aarch64` targets. +#[cfg(target_arch = "aarch64")] +pub struct SpikyKernelNeonF32 { + compact_support: f32, + sigma: f32, +} + +#[cfg(target_arch = "aarch64")] +unsafe impl NeonKernel for SpikyKernelNeonF32 { + /// Initializes a spiky kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r6 = r.powi(6); + let sigma = 15.0 / (std::f32::consts::PI * r6); + Self { + compact_support: r, + sigma, + } + } + + /// Evaluates the spiky kernel at the specified radial distances + #[target_feature(enable = "neon")] + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { + use core::arch::aarch64::*; + + let zero = vdupq_n_f32(0.0); + let h = vdupq_n_f32(self.compact_support); + + let r2 = vmulq_f32(r, r); + let h2 = vmulq_f32(h, h); + + // (h - r)^3 + let relevance = vsubq_f32(h, r); + let relevance2 = vmulq_f32(relevance, relevance); + let relevance3 = vmulq_f32(relevance2, relevance); + + // 15.0/(PI*h^6) + let normalization = vdupq_n_f32(self.sigma); + + let res = vmulq_f32(normalization, relevance3); + + // mask: r2 <= h2 + let mask = vcleq_f32(r2, h2); + + // blend: if mask true → res, else → zero + vbslq_f32(mask, res, zero) + } +} + +/// Vectorized implementation of the spiky kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. 
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub struct SpikyKernelAvxF32 {
+    compact_support: f32,
+    sigma: f32,
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+unsafe impl AvxKernel for SpikyKernelAvxF32 {
+    /// Initializes a spiky kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let r6 = r.powi(6);
+        let sigma = 15.0 / (std::f32::consts::PI * r6);
+        Self {
+            compact_support: r,
+            sigma,
+        }
+    }
+
+    /// Evaluates the spiky kernel at the specified radial distances
+    #[target_feature(enable = "avx2,fma")]
+    unsafe fn evaluate(&self, r: __m256) -> __m256 {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        let zero = _mm256_set1_ps(0.0);
+        let h = _mm256_set1_ps(self.compact_support);
+
+        let r2 = _mm256_mul_ps(r, r);
+        let h2 = _mm256_mul_ps(h, h);
+
+        // (h-r)^3
+        let relevance = _mm256_sub_ps(h, r);
+        let relevance2 = _mm256_mul_ps(relevance, relevance);
+        let relevance3 = _mm256_mul_ps(relevance2, relevance);
+
+        // 15.0/(PI*h^6)
+        let normalization = _mm256_set1_ps(self.sigma);
+
+        let res = _mm256_mul_ps(normalization, relevance3);
+
+        // Only use if in compact support radius
+        let in_radius = _mm256_cmp_ps::<_CMP_LE_OQ>(r2, h2);
+        _mm256_blendv_ps(zero, res, in_radius)
+    }
+}
+
+/// Quintic Wendland C2 kernel
+pub struct WendlandQuinticC2Kernel<R: Real> {
+    /// Compact support radius of the kernel
+    compact_support_radius: R,
+    /// Kernel normalization factor (sigma)
+    normalization: R,
+}
+
+impl<R: Real> SymmetricKernel3d<R> for WendlandQuinticC2Kernel<R> {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    #[replace_float_literals(R::from_float(literal))]
+    fn new(compact_support_radius: R) -> Self {
+        let h = compact_support_radius;
+        let sigma = 21.0 / (2.0 * R::pi() * h.powi(3));
+
+        Self {
+            compact_support_radius,
+            normalization: sigma,
+        }
+    }
+
+    fn compact_support_radius(&self) -> R {
+        self.compact_support_radius
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the radial distance `r`
+    fn evaluate(&self, r: R) -> R {
+        let q = r / self.compact_support_radius;
+        if q <= R::one() {
+            self.normalization * (R::one() - q).powi(4) * (R::from_float(4.0) * q + R::one())
+        } else {
+            R::zero()
+        }
+    }
+
+    /// Evaluates the gradient of the Wendland quintic C2 kernel at the position `x`
+    fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R> {
+        // Radial distance is norm of position
+        let r = x.norm();
+        let q = r / self.compact_support_radius;
+
+        if q <= R::one() {
+            // Gradient of q: x normalized and scaled by 1/h
+            let gradq = x.unscale(r * self.compact_support_radius);
+
+            gradq.scale(R::from_float(-20.0) * self.normalization * q * (R::one() - q).powi(3))
+        } else {
+            x.scale(R::zero())
+        }
+    }
+
+    /// Evaluates the norm of the gradient of the Wendland quintic C2 kernel at the radial distance `r`
+    fn evaluate_gradient_norm(&self, r: R) -> R {
+        let q = r / self.compact_support_radius;
+
+        if q <= R::one() {
+            // dW/dr = dW/dq * dq/dr with dq/dr = 1/h
+            R::from_float(-20.0) * self.normalization * q * (R::one() - q).powi(3)
+                / self.compact_support_radius
+        } else {
+            R::zero()
+        }
+    }
+}
+
+/// Vectorized implementation of the Wendland quintic C2 kernel using NEON instructions. Only available on `aarch64` targets.
+#[cfg(target_arch = "aarch64")]
+pub struct WendlandQuinticC2KernelNeonF32 {
+    compact_support_inv: f32,
+    sigma: f32,
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe impl NeonKernel for WendlandQuinticC2KernelNeonF32 {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let compact_support_inv = 1.0 / r;
+        let r3 = r.powi(3);
+        let sigma = 21.0 / (2.0 * std::f32::consts::PI * r3);
+
+        Self {
+            compact_support_inv,
+            sigma,
+        }
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the specified radial distances
+    #[target_feature(enable = "neon")]
+    unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t {
+        use core::arch::aarch64::*;
+
+        let zero = vdupq_n_f32(0.0);
+        let one = vdupq_n_f32(1.0);
+        let four = vdupq_n_f32(4.0);
+
+        // q = r * compact_support_inv
+        let q = vmulq_n_f32(r, self.compact_support_inv);
+
+        // v = max(0, 1 - q), the clamping implicitly zeroes out lanes with q > 1
+        let mut v = vsubq_f32(one, q);
+        v = vmaxq_f32(v, zero);
+
+        // (1 - q)^4
+        let v2 = vmulq_f32(v, v);
+        let v4 = vmulq_f32(v2, v2);
+
+        // normalization factor (21.0/(2.0*PI*h^3)), stored in self.sigma
+        let normalization = vdupq_n_f32(self.sigma);
+
+        // (1-q)^4 * (4q + 1)
+        let fourq = vmulq_f32(four, q);
+        let fourq_plus_one = vaddq_f32(fourq, one);
+        let res = vmulq_f32(v4, fourq_plus_one);
+
+        // normalization * res
+        vmulq_f32(normalization, res)
+    }
+}
+
+/// Vectorized implementation of the Wendland quintic C2 kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub struct WendlandQuinticC2KernelAvxF32 {
+    compact_support_inv: f32,
+    sigma: f32,
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+unsafe impl AvxKernel for WendlandQuinticC2KernelAvxF32 {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let compact_support_inv = 1.0 / r;
+        let r3 = r.powi(3);
+        let sigma = 21.0 / (2.0 * std::f32::consts::PI * r3);
+
+        Self {
+            compact_support_inv,
+            sigma,
+        }
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the specified radial distances
+    #[target_feature(enable = "avx2,fma")]
+    unsafe fn evaluate(&self, r: __m256) -> __m256 {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        let zero = _mm256_set1_ps(0.0);
+        let one = _mm256_set1_ps(1.0);
+        let four = _mm256_set1_ps(4.0);
+
+        let q = _mm256_mul_ps(r, _mm256_set1_ps(self.compact_support_inv));
+        let mut v = _mm256_sub_ps(one, q);
+        // Clamp v to be non-negative to implicitly zero out contributions with q > 1
+        v = _mm256_max_ps(v, zero);
+
+        // (1-q)^4
+        let v4 = _mm256_mul_ps(v, _mm256_mul_ps(v, _mm256_mul_ps(v, v)));
+
+        // 21.0/(2.0*PI*h^3)
+        let normalization = _mm256_set1_ps(self.sigma);
+
+        // (1-q)^4 * (4q+1)
+        let res = _mm256_mul_ps(v4, _mm256_add_ps(_mm256_mul_ps(four, q), one));
+
+        _mm256_mul_ps(normalization, res)
+    }
+}
+
+#[cfg(test)]
+mod kernel_tests {
+    use super::*;
+
+    macro_rules! test_kernel_r_compact_support {
+        ($kernel_class: ident) => {
+            let hs = [0.025, 0.1, 2.0];
+            for &h in hs.iter() {
+                let kernel = $kernel_class::new(h);
+                assert_eq!(kernel.evaluate(h), 0.0);
+                assert_eq!(kernel.evaluate(2.0 * h), 0.0);
+                assert_eq!(kernel.evaluate(10.0 * h), 0.0);
+            }
+        };
+    }
+
+    macro_rules!
test_kernel_r_integral { + ($kernel_class: ident) => { + let hs = [0.025, 0.1, 2.0]; + let n = 10; + + for &h in hs.iter() { + let kernel = $kernel_class::new(h); + + let dr = h / (n as f64); + let dvol = dr * dr * dr; + + let mut integral = 0.0; + for i in -n..n { + for j in -n..n { + for k in -n..n { + let r_in = Vector3::new(i as f64, j as f64, k as f64) * dr; + let r_out = + Vector3::new((i + 1) as f64, (j + 1) as f64, (k + 1) as f64) * dr; + let r = ((r_in + r_out) * 0.5).norm(); + + integral += dvol * kernel.evaluate(r); + } + } + } + + // println!("{}", integral); + assert!((integral - 1.0).abs() <= 1e-4); + } + }; + } + + macro_rules! test_kernel_avx { + ($kernel_class: ident, $kernel_class_avx: ident) => { + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + ))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // Test a few representative compact support radii + let hs: [f32; 3] = [0.025, 0.1, 2.0]; + for &h in hs.iter() { + let scalar = $kernel_class::new(h); + let avx = $kernel_class_avx::new(h); + + // Sample radii from 0 to 2h (beyond support should be 0) + let n: usize = 1024; + let mut r0: f32 = 0.0; + let dr: f32 = (2.0 * h) / (n as f32); + + for _chunk in 0..(n / 8) { + // Prepare 8 lanes of radii + let rs = [ + r0, + r0 + dr, + r0 + 2.0 * dr, + r0 + 3.0 * dr, + r0 + 4.0 * dr, + r0 + 5.0 * dr, + r0 + 6.0 * dr, + r0 + 7.0 * dr, + ]; + + // Evaluate AVX and store back to array + let r_vec = unsafe { _mm256_loadu_ps(rs.as_ptr()) }; + let w_vec = unsafe { avx.evaluate(r_vec) }; + let mut w_avx = [0.0f32; 8]; + unsafe { _mm256_storeu_ps(w_avx.as_mut_ptr(), w_vec) }; + + // Compare against scalar lane-wise + for lane in 0..8 { + let r_lane = rs[lane]; + let w_scalar = scalar.evaluate(r_lane); + let diff = (w_avx[lane] - w_scalar).abs(); + + // Absolute tolerance with mild relative component to be robust across scales + let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "AVX kernel mismatch (h={}, r={}, lane={}): avx={}, scalar={}, diff={}, tol={}", + h, + r_lane, + lane, + w_avx[lane], + w_scalar, + diff, + tol + ); + } + + r0 += 8.0 * dr; + } + + // Also check a couple of out-of-support points explicitly + for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { + let w_scalar = scalar.evaluate(r); + let w_avx = { + let v = unsafe { _mm256_set1_ps(r) }; + let w = unsafe { avx.evaluate(v) }; + let mut tmp = [0.0f32; 8]; + unsafe { _mm256_storeu_ps(tmp.as_mut_ptr(), w) }; + tmp[0] + }; + let diff = (w_avx - w_scalar).abs(); + let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "AVX kernel mismatch outside support (h={}, r={}): avx={}, scalar={}, diff={}, tol={}", + h, + r, + w_avx, + w_scalar, + diff, + tol + ); + } + } + } + }; + } + + macro_rules! 
test_kernel_neon { + ($kernel_class: ident, $kernel_class_neon: ident) => { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + use core::arch::aarch64::*; + + // Test a few representative compact support radii + let hs: [f32; 3] = [0.025, 0.1, 2.0]; + for &h in hs.iter() { + let scalar = $kernel_class::new(h); + let neon = $kernel_class_neon::new(h); + + // Sample radii from 0 to 2h (beyond support should be 0) + let n: usize = 1024; + let mut r0: f32 = 0.0; + let dr: f32 = (2.0 * h) / (n as f32); + + for _chunk in 0..(n / 4) { + // Prepare 4 lanes of radii + let rs = [r0, r0 + dr, r0 + 2.0 * dr, r0 + 3.0 * dr]; + let r_vec = unsafe { vld1q_f32(rs.as_ptr()) }; + + // Evaluate NEON and store back to array + let w_vec = unsafe { neon.evaluate(r_vec) }; + let mut w_neon = [0.0f32; 4]; + unsafe { vst1q_f32(w_neon.as_mut_ptr(), w_vec) }; + + // Compare against scalar lane-wise + for lane in 0..4 { + let r_lane = rs[lane]; + let w_scalar = scalar.evaluate(r_lane); + let diff = (w_neon[lane] - w_scalar).abs(); + + // Absolute tolerance with mild relative component to be robust across scales + let tol = 5e-6_f32.max(2e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "NEON kernel mismatch (h={}, r={}, lane={}): neon={}, scalar={}, diff={}, tol={}", + h, + r_lane, + lane, + w_neon[lane], + w_scalar, + diff, + tol + ); + } + + r0 += 4.0 * dr; + } + + // Also check a couple of out-of-support points explicitly + for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { + let w_scalar = scalar.evaluate(r); + let w_neon = { + let v = unsafe { vld1q_f32([r, r, r, r].as_ptr()) }; + let w = unsafe { neon.evaluate(v) }; + let mut tmp = [0.0f32; 4]; + unsafe { vst1q_f32(tmp.as_mut_ptr(), w) }; + tmp[0] + }; + let diff = (w_neon - w_scalar).abs(); + let tol = 5e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "NEON kernel mismatch outside support (h={}, r={}): neon={}, scalar={}, diff={}, tol={}", + h, + r, + w_neon, + w_scalar, + diff, + tol + ); + } + } + } + }; + } + + #[test] + fn cubic_spline_kernel_r_compact_support() { + test_kernel_r_compact_support!(CubicSplineKernel); + } + + #[test] + fn cubic_spline_kernel_r_integral() { + test_kernel_r_integral!(CubicSplineKernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn cubic_spline_kernel_neon() { + test_kernel_neon!(CubicSplineKernel, CubicSplineKernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn cubic_spline_kernel_avx() { + test_kernel_avx!(CubicSplineKernel, CubicSplineKernelAvxF32); + } + + #[test] + fn discrete_cubic_kernel() { + let n = 10000; + let h = 0.025; + + let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::(n, h); + let kernel = CubicSplineKernel::new(h); + + // Test the pre-computed values using a linear stepping + let dr = h / (n as f64); + for i in 0..n { + let r = (i as f64) * dr; + let rr = r * r; + + let discrete = discrete_kernel.evaluate(rr); + let continuous = kernel.evaluate(r); + + let diff = (discrete - continuous).abs(); + let rel_diff = diff / continuous; + if rel_diff > 5e-2 && diff > 1e-1 { + eprintln!( + "at r={}, r/h={}, discrete: {}, continuous: {}, diff: {}, rel_diff: {}", + r, + r / h, + discrete, + continuous, + diff, + rel_diff + ); + assert!(false); + } + } + } + + #[test] + fn poly6_kernel_r_compact_support() { 
+ test_kernel_r_compact_support!(Poly6Kernel); + } + + #[test] + fn poly6_kernel_r_integral() { + test_kernel_r_integral!(Poly6Kernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn poly6_kernel_neon() { + test_kernel_neon!(Poly6Kernel, Poly6KernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn poly6_kernel_avx() { + test_kernel_avx!(Poly6Kernel, Poly6KernelAvxF32); + } + + #[test] + fn spiky_kernel_r_compact_support() { + test_kernel_r_compact_support!(SpikyKernel); + } + + #[test] + fn spiky_kernel_r_integral() { + test_kernel_r_integral!(SpikyKernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn spiky_kernel_neon() { + test_kernel_neon!(SpikyKernel, SpikyKernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn spiky_kernel_avx() { + test_kernel_avx!(SpikyKernel, SpikyKernelAvxF32); + } + + #[test] + fn wendland_quintic_c2_kernel_r_compact_support() { + test_kernel_r_compact_support!(WendlandQuinticC2Kernel); + } + + #[test] + fn wendland_quintic_c2_kernel_r_integral() { + test_kernel_r_integral!(WendlandQuinticC2Kernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn wendland_quintic_c2_kernel_neon() { + test_kernel_neon!(WendlandQuinticC2Kernel, WendlandQuinticC2KernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn wendland_quintic_c2_kernel_avx() { + test_kernel_avx!(WendlandQuinticC2Kernel, WendlandQuinticC2KernelAvxF32); + } } diff --git a/splashsurf_lib/src/lib.rs b/splashsurf_lib/src/lib.rs index 35b347b..63abf08 100644 --- a/splashsurf_lib/src/lib.rs +++ b/splashsurf_lib/src/lib.rs @@ -42,6 +42,7 @@ pub use crate::traits::{Index, Real, RealConvert, ThreadSafe}; pub use crate::uniform_grid::UniformGrid; use crate::density_map::DensityMapError; +use crate::kernel::KernelType; use crate::marching_cubes::MarchingCubesError; use crate::mesh::TriMesh3d; use crate::uniform_grid::GridConstructionError; @@ -186,6 +187,7 @@ pub struct Parameters { /// Depending on the settings of the reconstruction, neighborhood lists are only computed locally /// in subdomains. Enabling this flag joins this data over all particles which can add a small overhead. 
pub global_neighborhood_list: bool, + pub kernel_type: KernelType, } impl Parameters { @@ -206,6 +208,7 @@ impl Parameters { enable_simd: true, spatial_decomposition: Default::default(), global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, } } @@ -238,6 +241,7 @@ impl Parameters { enable_simd: self.enable_simd, spatial_decomposition: self.spatial_decomposition.clone(), global_neighborhood_list: self.global_neighborhood_list, + kernel_type: self.kernel_type.clone(), }) } } diff --git a/splashsurf_lib/src/reconstruction.rs b/splashsurf_lib/src/reconstruction.rs index 2ebfce8..b3eaadd 100644 --- a/splashsurf_lib/src/reconstruction.rs +++ b/splashsurf_lib/src/reconstruction.rs @@ -2,17 +2,40 @@ use crate::dense_subdomains::{ compute_global_densities_and_neighbors, decomposition, initialize_parameters, reconstruct_subdomains, stitching, subdomain_classification::GhostMarginClassifier, }; +use crate::density_map::{compute_particle_densities_inplace, generate_sparse_density_map}; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, WendlandQuinticC2Kernel, +}; use crate::mesh::TriMesh3d; use crate::uniform_grid::UniformGrid; use crate::workspace::LocalReconstructionWorkspace; use crate::{ - Index, Parameters, Real, ReconstructionError, SurfaceReconstruction, density_map, kernel, - marching_cubes, neighborhood_search, profile, + Index, Parameters, Real, ReconstructionError, SurfaceReconstruction, kernel, marching_cubes, + neighborhood_search, profile, }; use anyhow::Context; use log::{info, trace}; use nalgebra::Vector3; +macro_rules! call_with_kernel_type { + ($function:ident; $kernel_type:expr; $( $param:expr ), *) => { + match $kernel_type { + KernelType::CubicSpline => { + $function::>($( $param ), *) + }, + KernelType::Poly6 => { + $function::>($( $param ), *) + }, + KernelType::Spiky => { + $function::>($( $param ), *) + }, + KernelType::WendlandQuinticC2 => { + $function::>($( $param ), *) + } + } + } +} + /// Performs a surface reconstruction with a regular grid for domain decomposition pub(crate) fn reconstruct_surface_subdomain_grid( particle_positions: &[Vector3], @@ -31,18 +54,22 @@ pub(crate) fn reconstruct_surface_subdomain_grid( let subdomains = decomposition::>(&internal_parameters, particle_positions)?; - let (particle_densities, particle_neighbors) = compute_global_densities_and_neighbors( + let (particle_densities, particle_neighbors) = call_with_kernel_type!( + compute_global_densities_and_neighbors; + parameters.kernel_type; &internal_parameters, particle_positions, - &subdomains, + &subdomains ); - let surface_patches = reconstruct_subdomains( + let surface_patches = call_with_kernel_type!( + reconstruct_subdomains; + parameters.kernel_type; &internal_parameters, particle_positions, &particle_densities, &subdomains, - parameters.enable_simd, + parameters.enable_simd ); let global_mesh = stitching(surface_patches); @@ -135,13 +162,15 @@ pub(crate) fn compute_particle_densities_and_neighbors( ); trace!("Computing particle densities..."); - density_map::compute_particle_densities_inplace::( + call_with_kernel_type!( + compute_particle_densities_inplace; + parameters.kernel_type; particle_positions, particle_neighbor_lists.as_slice(), parameters.compact_support_radius, particle_rest_mass, parameters.enable_multi_threading, - densities, + densities ); } @@ -171,7 +200,10 @@ pub(crate) fn reconstruct_single_surface_append( // Create a new density map, reusing memory with the workspace is bad for cache efficiency // Alternatively one 
could reuse memory with a custom caching allocator let mut density_map = Default::default(); - density_map::generate_sparse_density_map( + + call_with_kernel_type!( + generate_sparse_density_map; + parameters.kernel_type; grid, particle_positions, particle_densities, @@ -180,7 +212,7 @@ pub(crate) fn reconstruct_single_surface_append( parameters.compact_support_radius, parameters.cube_size, parameters.enable_multi_threading, - &mut density_map, + &mut density_map )?; marching_cubes::triangulate_density_map_append( diff --git a/splashsurf_lib/src/sph_interpolation.rs b/splashsurf_lib/src/sph_interpolation.rs index 6fe432e..42483d8 100644 --- a/splashsurf_lib/src/sph_interpolation.rs +++ b/splashsurf_lib/src/sph_interpolation.rs @@ -1,9 +1,12 @@ //! Functions for interpolating quantities (e.g. normals, scalar fields) by evaluating SPH sums use crate::Real; -use crate::kernel::SymmetricKernel3d; +use crate::ThreadSafe; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, SymmetricKernel3d, + WendlandQuinticC2Kernel, +}; use crate::profile; -use crate::{ThreadSafe, kernel}; use nalgebra::{SVector, Unit, Vector3}; use rayon::prelude::*; use rstar::RTree; @@ -14,6 +17,7 @@ use std::ops::AddAssign; pub struct SphInterpolator { compact_support_radius: R, tree: RTree>, + kernel_type: KernelType, } /// Particle type that is stored in the R-tree for fast SPH neighbor queries @@ -62,6 +66,7 @@ impl SphInterpolator { particle_densities: &[R], particle_rest_mass: R, compact_support_radius: R, + kernel_type: KernelType, ) -> Self { assert_eq!(particle_positions.len(), particle_densities.len()); @@ -70,6 +75,7 @@ impl SphInterpolator { Self { compact_support_radius, tree, + kernel_type, } } @@ -86,40 +92,65 @@ impl SphInterpolator { ) { profile!("interpolate_normals_inplace"); - let squared_support = self.compact_support_radius * self.compact_support_radius; - let kernel = kernel::CubicSplineKernel::new(self.compact_support_radius); - - interpolation_points - .par_iter() - .map(|x_i| { - // Compute the gradient of the particle density field which points in the same direction as surface normals - let mut density_grad = Vector3::zeros(); - - // SPH: Iterate over all other particles within the squared support radius - let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); - for p_j in self - .tree - .locate_within_distance(query_point, squared_support) - { - // Volume of the neighbor particle - let vol_j = p_j.data.volume; - // Position of the neighbor particle - let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); - - // Relative position `dx` and distance `r` of the neighbor particle - let dx = x_j - x_i; - let r = dx.norm(); - - // Compute the contribution of the neighbor to the gradient of the density field - // TODO: Replace this by a discrete gradient norm evaluation - let kernel_grad = dx.unscale(r) * kernel.evaluate_gradient_norm(r); - density_grad += kernel_grad * vol_j; - } - - // Normalize the gradient to get the surface normal - Unit::new_normalize(density_grad) - }) - .collect_into_vec(normals); + fn interpolate_normals_helper + Sync>( + sph: &SphInterpolator, + interpolation_points: &[Vector3], + normals: &mut Vec>>, + ) { + let squared_support = sph.compact_support_radius * sph.compact_support_radius; + + let kernel = K::new(sph.compact_support_radius); + + interpolation_points + .par_iter() + .map(|x_i| { + // Compute the gradient of the particle density field which points in the same direction as surface normals + let mut density_grad = Vector3::zeros(); + + // 
SPH: Iterate over all other particles within the squared support radius + let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); + for p_j in sph + .tree + .locate_within_distance(query_point, squared_support) + { + // Volume of the neighbor particle + let vol_j = p_j.data.volume; + // Position of the neighbor particle + let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); + + // Relative position `dx` and distance `r` of the neighbor particle + let dx = x_j - x_i; + let r = dx.norm(); + + // Compute the contribution of the neighbor to the gradient of the density field + // TODO: Replace this by a discrete gradient norm evaluation + let kernel_grad = dx.unscale(r) * kernel.evaluate_gradient_norm(r); + density_grad += kernel_grad * vol_j; + } + + // Normalize the gradient to get the surface normal + Unit::new_normalize(density_grad) + }) + .collect_into_vec(normals); + } + + match self.kernel_type { + KernelType::CubicSpline => interpolate_normals_helper::>( + self, + interpolation_points, + normals, + ), + KernelType::Poly6 => { + interpolate_normals_helper::>(self, interpolation_points, normals) + } + KernelType::Spiky => { + interpolate_normals_helper::>(self, interpolation_points, normals) + } + KernelType::WendlandQuinticC2 => interpolate_normals_helper::< + R, + WendlandQuinticC2Kernel, + >(self, interpolation_points, normals), + } } /// Interpolates surface normals (i.e. normalized SPH gradient of the indicator function) of the fluid to the given points using SPH interpolation @@ -212,49 +243,95 @@ impl SphInterpolator { profile!("interpolate_quantity_inplace"); assert_eq!(particle_quantity.len(), self.tree.size()); - let squared_support = self.compact_support_radius * self.compact_support_radius; - let kernel = kernel::CubicSplineKernel::new(self.compact_support_radius); - - let enable_correction = if first_order_correction { - R::one() - } else { - R::zero() - }; - - interpolation_points - .par_iter() - .map(|x_i| { - let mut interpolated_value = T::zero(); - let mut correction = R::zero(); - - // SPH: Iterate over all other particles within the squared support radius - let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); - for p_j in self - .tree - .locate_within_distance(query_point, squared_support) - { - // Volume of the neighbor particle - let vol_j = p_j.data.volume; - // Position of the neighbor particle - let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); - - // Relative position `dx` and distance `r` of the neighbor particle - let dx = x_j - x_i; - let r = dx.norm(); - - // Unchecked access is fine as we asserted before that the slice has the correct length - let A_j = unsafe { particle_quantity.get_unchecked(p_j.data.index).clone() }; - let W_ij = kernel.evaluate(r); - - interpolated_value += A_j.scale(vol_j * W_ij); - correction += vol_j * W_ij; - } - - let correction_factor = - enable_correction * correction.recip() + (R::one() - enable_correction); - interpolated_value.scale(correction_factor) - }) - .collect_into_vec(interpolated_values); + fn interpolate_quantity_helper< + R: Real, + T: InterpolationQuantity, + K: SymmetricKernel3d + Sync, + >( + sph: &SphInterpolator, + particle_quantity: &[T], + interpolation_points: &[Vector3], + interpolated_values: &mut Vec, + first_order_correction: bool, + ) { + let squared_support = sph.compact_support_radius * sph.compact_support_radius; + let kernel = K::new(sph.compact_support_radius); + + let enable_correction = if first_order_correction { + R::one() + } else { + R::zero() + }; + + interpolation_points + .par_iter() 
+ .map(|x_i| { + let mut interpolated_value = T::zero(); + let mut correction = R::zero(); + + // SPH: Iterate over all other particles within the squared support radius + let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); + for p_j in sph + .tree + .locate_within_distance(query_point, squared_support) + { + // Volume of the neighbor particle + let vol_j = p_j.data.volume; + // Position of the neighbor particle + let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); + + // Relative position `dx` and distance `r` of the neighbor particle + let dx = x_j - x_i; + let r = dx.norm(); + + // Unchecked access is fine as we asserted before that the slice has the correct length + let A_j = + unsafe { particle_quantity.get_unchecked(p_j.data.index).clone() }; + let W_ij = kernel.evaluate(r); + + interpolated_value += A_j.scale(vol_j * W_ij); + correction += vol_j * W_ij; + } + + let correction_factor = + enable_correction * correction.recip() + (R::one() - enable_correction); + interpolated_value.scale(correction_factor) + }) + .collect_into_vec(interpolated_values); + } + + match self.kernel_type { + KernelType::CubicSpline => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::Poly6 => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::Spiky => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::WendlandQuinticC2 => { + interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ) + } + } } } diff --git a/splashsurf_lib/tests/integration_tests/test_full.rs b/splashsurf_lib/tests/integration_tests/test_full.rs index 751d786..d70b04f 100644 --- a/splashsurf_lib/tests/integration_tests/test_full.rs +++ b/splashsurf_lib/tests/integration_tests/test_full.rs @@ -1,13 +1,13 @@ use nalgebra::Vector3; use splashsurf_lib::io::particles_from_file; use splashsurf_lib::io::vtk_format::write_vtk; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use splashsurf_lib::{ Aabb3d, GridDecompositionParameters, Parameters, Real, SpatialDecomposition, reconstruct_surface, }; use std::path::Path; - // TODO: Compare with a solution file // TODO: Test with a fixed grid? 
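The fixtures below wire the new field into the integration tests with the default cubic spline kernel. For downstream users, switching kernels is a one-field change; a minimal sketch under the assumption that `Parameters` keeps the field set shown in this patch (the numeric values are arbitrary placeholders, and the helper function is hypothetical):

```rust
use nalgebra::Vector3;
use splashsurf_lib::kernel::KernelType;
use splashsurf_lib::{Parameters, SpatialDecomposition, reconstruct_surface};

fn reconstruct_with_wendland(
    particles: &[Vector3<f32>],
) -> Result<(), splashsurf_lib::ReconstructionError<i64, f32>> {
    let particle_radius = 0.025;
    let parameters = Parameters {
        particle_radius,
        rest_density: 1000.0,
        compact_support_radius: 4.0 * particle_radius,
        cube_size: 0.75 * particle_radius,
        iso_surface_threshold: 0.6,
        particle_aabb: None,
        enable_multi_threading: true,
        enable_simd: true,
        spatial_decomposition: SpatialDecomposition::None,
        global_neighborhood_list: false,
        // New in this patch: kernel selection, cubic spline elsewhere by default
        kernel_type: KernelType::WendlandQuinticC2,
    };
    let reconstruction = reconstruct_surface::<i64, f32>(particles, &parameters)?;
    println!("reconstructed {} triangles", reconstruction.mesh().triangles.len());
    Ok(())
}
```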
@@ -38,6 +38,7 @@ fn params_with_aabb( enable_simd: false, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; match strategy { diff --git a/splashsurf_lib/tests/integration_tests/test_simple.rs b/splashsurf_lib/tests/integration_tests/test_simple.rs index b15ca9c..f32034e 100644 --- a/splashsurf_lib/tests/integration_tests/test_simple.rs +++ b/splashsurf_lib/tests/integration_tests/test_simple.rs @@ -1,4 +1,5 @@ use nalgebra::Vector3; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use splashsurf_lib::{ Aabb3d, GridDecompositionParameters, Parameters, Real, SpatialDecomposition, @@ -32,6 +33,7 @@ fn params_with_aabb( enable_simd: false, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; match strategy { diff --git a/splashsurf_lib/tests/integration_tests/test_subdomains.rs b/splashsurf_lib/tests/integration_tests/test_subdomains.rs index 0e450a8..6044f3d 100644 --- a/splashsurf_lib/tests/integration_tests/test_subdomains.rs +++ b/splashsurf_lib/tests/integration_tests/test_subdomains.rs @@ -3,6 +3,7 @@ use nalgebra::Vector3; use splashsurf_lib::GridDecompositionParameters; #[cfg(feature = "io")] use splashsurf_lib::io::vtk_format::write_vtk; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use std::path::Path; @@ -31,6 +32,7 @@ macro_rules! generate_single_particle_test { ), rest_density: 1000.0, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; let reconstruction =