diff --git a/pysplashsurf/pysplashsurf/docs/requirements.txt b/pysplashsurf/pysplashsurf/docs/requirements.txt index 9126624..d6dacc2 100644 --- a/pysplashsurf/pysplashsurf/docs/requirements.txt +++ b/pysplashsurf/pysplashsurf/docs/requirements.txt @@ -1,4 +1,4 @@ -pip==25.0.1 +pip==25.2 sphinx==8.2.3 numpy==2.2.3 meshio==5.3.5 diff --git a/pysplashsurf/pysplashsurf/pysplashsurf.pyi b/pysplashsurf/pysplashsurf/pysplashsurf.pyi index ed3344f..6acb672 100644 --- a/pysplashsurf/pysplashsurf/pysplashsurf.pyi +++ b/pysplashsurf/pysplashsurf/pysplashsurf.pyi @@ -185,11 +185,11 @@ class NeighborhoodLists: class SphInterpolator: r""" - Interpolator of per-particle quantities to arbitrary points using SPH interpolation (with cubic kernel) + Interpolator of per-particle quantities to arbitrary points using SPH interpolation """ - def __new__(cls, particle_positions:numpy.typing.NDArray[typing.Any], particle_densities:numpy.typing.NDArray[typing.Any], particle_rest_mass:builtins.float, compact_support_radius:builtins.float) -> SphInterpolator: + def __new__(cls, particle_positions:numpy.typing.NDArray[typing.Any], particle_densities:numpy.typing.NDArray[typing.Any], particle_rest_mass:builtins.float, compact_support_radius:builtins.float, kernel_type:KernelType) -> SphInterpolator: r""" - Constructs an SPH interpolator (with cubic kernels) for the given particles + Constructs an SPH interpolator for the given particles Parameters ---------- @@ -200,7 +200,9 @@ class SphInterpolator: particle_rest_mass The rest mass of each particle (assumed to be the same for all particles). compact_support_radius - The compact support radius of the cubic spline kernel used for interpolation. + The compact support radius of the kernel used for interpolation. + kernel_type + The kernel function used for interpolation. """ def interpolate_quantity(self, particle_quantity:numpy.typing.NDArray[typing.Any], interpolation_points:numpy.typing.NDArray[typing.Any], *, first_order_correction:builtins.bool=False) -> numpy.typing.NDArray[typing.Any]: r""" @@ -315,6 +317,15 @@ class VertexVertexConnectivity: Returns the wrapped connectivity data by moving it out of this object (zero copy) """ +class KernelType(Enum): + r""" + Enum for specifying the kernel function used for the reconstruction + """ + CubicSpline = ... + Poly6 = ... + Spiky = ... + WendlandQuinticC2 = ... + class MeshType(Enum): r""" Enum specifying the type of mesh wrapped by a ``MeshWithData`` @@ -497,7 +508,7 @@ def neighborhood_search_spatial_hashing_parallel(particle_positions:numpy.typing The radius per particle where other particles are considered neighbors.
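Taken together, the stub changes above imply that constructing an interpolator now looks like the following; a minimal sketch based on the new signature and on interpolator_test in test_basic.py further down (the scalar quantity and the support radius value are illustrative assumptions, not taken from this diff):

import numpy as np
import pysplashsurf

particles = np.random.rand(1000, 3).astype(np.float32)
# Per-particle densities typically come from a prior reconstruction, as in interpolator_test
reconstruction = pysplashsurf.reconstruct_surface(
    particles, particle_radius=0.025, smoothing_length=2.0, cube_size=0.5
)
interpolator = pysplashsurf.SphInterpolator(
    particles,
    reconstruction.particle_densities,
    1000.0 * 0.025**3,  # rest mass per particle, as in the test
    4.0 * 0.025,        # compact support radius (illustrative value)
    pysplashsurf.KernelType.CubicSpline,  # the new, now required kernel argument
)
temperatures = np.ones(len(particles), dtype=np.float32)  # hypothetical per-particle quantity
values = interpolator.interpolate_quantity(temperatures, particles)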
""" -def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, global_neighborhood_list:builtins.bool=False, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64) -> SurfaceReconstruction: +def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, simd:builtins.bool=True, kernel_type:KernelType=..., global_neighborhood_list:builtins.bool=False, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64) -> SurfaceReconstruction: r""" Performs a surface reconstruction from the given particles without additional post-processing @@ -523,6 +534,8 @@ def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_ Upper corner of the AABB of particles to consider in the reconstruction. multi_threading Flag to enable multi-threading for the reconstruction and post-processing steps. + simd + Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. subdomain_grid Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. subdomain_grid_auto_disable @@ -531,7 +544,7 @@ def reconstruct_surface(particles:numpy.typing.NDArray[typing.Any], *, particle_ Number of marching cubes voxels along each coordinate axis in each subdomain if the subdomain grid is enabled. 
""" -def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attributes_to_interpolate:typing.Optional[dict]=None, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64, check_mesh_closed:builtins.bool=False, check_mesh_manifold:builtins.bool=False, check_mesh_orientation:builtins.bool=False, check_mesh_debug:builtins.bool=False, mesh_cleanup:builtins.bool=False, mesh_cleanup_snap_dist:typing.Optional[builtins.float]=None, decimate_barnacles:builtins.bool=False, keep_vertices:builtins.bool=False, compute_normals:builtins.bool=False, sph_normals:builtins.bool=False, normals_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_weights:builtins.bool=True, mesh_smoothing_weights_normalization:builtins.float=13.0, generate_quads:builtins.bool=False, quad_max_edge_diag_ratio:builtins.float=1.75, quad_max_normal_angle:builtins.float=10.0, quad_max_interior_angle:builtins.float=135.0, output_mesh_smoothing_weights:builtins.bool=False, output_raw_normals:builtins.bool=False, output_raw_mesh:builtins.bool=False, mesh_aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_clamp_vertices:builtins.bool=True) -> tuple[MeshWithData, SurfaceReconstruction]: +def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attributes_to_interpolate:typing.Optional[dict]=None, particle_radius:builtins.float, rest_density:builtins.float=1000.0, smoothing_length:builtins.float, cube_size:builtins.float, iso_surface_threshold:builtins.float=0.6, aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, multi_threading:builtins.bool=True, simd:builtins.bool=True, kernel_type:KernelType=..., subdomain_grid:builtins.bool=True, subdomain_grid_auto_disable:builtins.bool=True, subdomain_num_cubes_per_dim:builtins.int=64, check_mesh_closed:builtins.bool=False, check_mesh_manifold:builtins.bool=False, check_mesh_orientation:builtins.bool=False, check_mesh_debug:builtins.bool=False, mesh_cleanup:builtins.bool=False, mesh_cleanup_snap_dist:typing.Optional[builtins.float]=None, decimate_barnacles:builtins.bool=False, keep_vertices:builtins.bool=False, compute_normals:builtins.bool=False, sph_normals:builtins.bool=False, normals_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_iters:typing.Optional[builtins.int]=None, mesh_smoothing_weights:builtins.bool=True, mesh_smoothing_weights_normalization:builtins.float=13.0, generate_quads:builtins.bool=False, quad_max_edge_diag_ratio:builtins.float=1.75, quad_max_normal_angle:builtins.float=10.0, quad_max_interior_angle:builtins.float=135.0, output_mesh_smoothing_weights:builtins.bool=False, output_raw_normals:builtins.bool=False, output_raw_mesh:builtins.bool=False, mesh_aabb_min:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_max:typing.Optional[typing.Sequence[builtins.float]]=None, mesh_aabb_clamp_vertices:builtins.bool=True) -> tuple[MeshWithData, SurfaceReconstruction]: r""" Runs the 
surface reconstruction pipeline for the given particle positions with optional post-processing @@ -561,6 +574,10 @@ def reconstruction_pipeline(particles:numpy.typing.NDArray[typing.Any], *, attri Upper corner [x,y,z] of the AABB of particles to consider in the reconstruction. multi_threading Flag to enable multi-threading for the reconstruction and post-processing steps. + simd + Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. + kernel_type + Kernel function to use for the reconstruction. subdomain_grid Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. subdomain_grid_auto_disable diff --git a/pysplashsurf/src/lib.rs b/pysplashsurf/src/lib.rs index 02183d1..6924fa0 100644 --- a/pysplashsurf/src/lib.rs +++ b/pysplashsurf/src/lib.rs @@ -39,6 +39,7 @@ fn pysplashsurf(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::<KernelType>()?; use wrap_pyfunction as wrap; diff --git a/pysplashsurf/src/pipeline.rs b/pysplashsurf/src/pipeline.rs index 549f089..082f102 100644 --- a/pysplashsurf/src/pipeline.rs +++ b/pysplashsurf/src/pipeline.rs @@ -18,7 +18,7 @@ use std::borrow::Cow; use crate::mesh::PyMeshWithData; use crate::reconstruction::PySurfaceReconstruction; -use crate::utils::{IndexT, pyerr_unsupported_scalar}; +use crate::utils::{IndexT, KernelType, pyerr_unsupported_scalar}; /// Runs the surface reconstruction pipeline for the given particle positions with optional post-processing /// @@ -49,7 +49,9 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar}; /// multi_threading /// Flag to enable multi-threading for the reconstruction and post-processing steps. /// simd -/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// kernel_type +/// Kernel function to use for the reconstruction. /// subdomain_grid /// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading.
/// subdomain_grid_auto_disable @@ -110,7 +112,7 @@ use crate::utils::{IndexT, pyerr_unsupported_scalar}; #[pyo3(name = "reconstruction_pipeline")] #[pyo3(signature = (particles, *, attributes_to_interpolate = None, particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6, - aabb_min = None, aabb_max = None, multi_threading = true, simd = true, + aabb_min = None, aabb_max = None, multi_threading = true, simd = true, kernel_type = KernelType::CubicSpline, subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64, check_mesh_closed = false, check_mesh_manifold = false, check_mesh_orientation = false, check_mesh_debug = false, mesh_cleanup = false, mesh_cleanup_snap_dist = None, decimate_barnacles = false, keep_vertices = false, compute_normals = false, sph_normals = false, @@ -131,6 +133,7 @@ pub fn reconstruction_pipeline<'py>( aabb_max: Option<[f64; 3]>, multi_threading: bool, simd: bool, + kernel_type: KernelType, subdomain_grid: bool, subdomain_grid_auto_disable: bool, subdomain_num_cubes_per_dim: u32, @@ -198,6 +201,7 @@ pub fn reconstruction_pipeline<'py>( enable_simd: simd, spatial_decomposition, global_neighborhood_list: false, + kernel_type: kernel_type.into_lib_enum(), }; let postprocessing_args = splashsurf::reconstruct::ReconstructionPostprocessingParameters { diff --git a/pysplashsurf/src/reconstruction.rs b/pysplashsurf/src/reconstruction.rs index 1a6ec9f..fd88c07 100644 --- a/pysplashsurf/src/reconstruction.rs +++ b/pysplashsurf/src/reconstruction.rs @@ -1,7 +1,7 @@ use crate::mesh::PyTriMesh3d; use crate::neighborhood_search::PyNeighborhoodLists; use crate::uniform_grid::PyUniformGrid; -use crate::utils; +use crate::utils::{self, KernelType}; use anyhow::anyhow; use ndarray::ArrayView1; use numpy as np; @@ -124,6 +124,8 @@ impl PySurfaceReconstruction { /// Flag to enable multi-threading for the reconstruction and post-processing steps. /// simd /// Flag to enable SIMD vectorization for the reconstruction if supported by the CPU architecture. +/// kernel_type +/// Kernel function to use for the reconstruction. /// subdomain_grid /// Flag to enable spatial decomposition by dividing the domain into subdomains with dense marching cube grids for efficient multi-threading. 
/// subdomain_grid_auto_disable @@ -135,8 +137,8 @@ impl PySurfaceReconstruction { #[pyo3(name = "reconstruct_surface")] #[pyo3(signature = (particles, *, particle_radius, rest_density = 1000.0, smoothing_length, cube_size, iso_surface_threshold = 0.6, - aabb_min = None, aabb_max = None, - multi_threading = true, simd = true, global_neighborhood_list = false, + aabb_min = None, aabb_max = None, multi_threading = true, simd = true, + kernel_type = KernelType::CubicSpline, global_neighborhood_list = false, subdomain_grid = true, subdomain_grid_auto_disable = true, subdomain_num_cubes_per_dim = 64 ))] pub fn reconstruct_surface<'py>( @@ -150,6 +152,7 @@ pub fn reconstruct_surface<'py>( aabb_max: Option<[f64; 3]>, multi_threading: bool, simd: bool, + kernel_type: KernelType, global_neighborhood_list: bool, subdomain_grid: bool, subdomain_grid_auto_disable: bool, @@ -181,6 +184,7 @@ pub fn reconstruct_surface<'py>( enable_simd: simd, spatial_decomposition, global_neighborhood_list, + kernel_type: kernel_type.into_lib_enum(), }; let element_type = particles.dtype(); diff --git a/pysplashsurf/src/sph_interpolation.rs b/pysplashsurf/src/sph_interpolation.rs index b386a00..6bd3ced 100644 --- a/pysplashsurf/src/sph_interpolation.rs +++ b/pysplashsurf/src/sph_interpolation.rs @@ -1,3 +1,4 @@ +use crate::utils::{KernelType, *}; use numpy as np; use numpy::prelude::*; use numpy::{Element, PyArray1, PyArray2, PyUntypedArray}; @@ -12,14 +13,12 @@ use splashsurf_lib::{ sph_interpolation::SphInterpolator, }; -use crate::utils::*; - enum PySphInterpolatorWrapper { F32(SphInterpolator<f32>), F64(SphInterpolator<f64>), } -/// Interpolator of per-particle quantities to arbitrary points using SPH interpolation (with cubic kernel) +/// Interpolator of per-particle quantities to arbitrary points using SPH interpolation #[gen_stub_pyclass] #[pyclass] #[pyo3(name = "SphInterpolator")] pub struct PySphInterpolator { @@ -36,6 +35,7 @@ impl PySphInterpolator { particle_densities: &Bound<'py, PyUntypedArray>, particle_rest_mass: f64, compact_support_radius: f64, + kernel_type: KernelType, ) -> PyResult<Self> where PySphInterpolator: From<SphInterpolator<R>>, @@ -55,6 +55,7 @@ impl PySphInterpolator { densities, R::from_float(particle_rest_mass), R::from_float(compact_support_radius), + kernel_type.into_lib_enum(), ))) } else { Err(pyerr_scalar_type_mismatch()) } } @@ -159,7 +160,7 @@ impl PySphInterpolator { #[gen_stub_pymethods] #[pymethods] impl PySphInterpolator { - /// Constructs an SPH interpolator (with cubic kernels) for the given particles + /// Constructs an SPH interpolator for the given particles /// /// Parameters /// ---------- /// @@ -170,13 +171,16 @@ impl PySphInterpolator { /// particle_rest_mass /// The rest mass of each particle (assumed to be the same for all particles). /// compact_support_radius - /// The compact support radius of the cubic spline kernel used for interpolation. + /// The compact support radius of the kernel used for interpolation.
+ /// kernel_type + /// The kernel function used for interpolation. #[new] fn py_new<'py>( particle_positions: &Bound<'py, PyUntypedArray>, particle_densities: &Bound<'py, PyUntypedArray>, particle_rest_mass: f64, compact_support_radius: f64, + kernel_type: KernelType, ) -> PyResult<Self> { let py = particle_positions.py(); let element_type = particle_positions.dtype(); @@ -187,6 +191,7 @@ impl PySphInterpolator { particle_densities, particle_rest_mass, compact_support_radius, + kernel_type, ) } else if element_type.is_equiv_to(&np::dtype::<f64>(py)) { Self::new_generic::<f64>( @@ -194,6 +199,7 @@ impl PySphInterpolator { particle_densities, particle_rest_mass, compact_support_radius, + kernel_type, ) } else { Err(pyerr_unsupported_scalar()) } diff --git a/pysplashsurf/src/utils.rs b/pysplashsurf/src/utils.rs index 095c359..ff7f66c 100644 --- a/pysplashsurf/src/utils.rs +++ b/pysplashsurf/src/utils.rs @@ -3,9 +3,32 @@ use numpy::{Element, PyArray, PyUntypedArray}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; use pyo3::{Bound, PyAny, PyErr, PyResult}; +use pyo3_stub_gen::derive::gen_stub_pyclass_enum; use splashsurf_lib::Real; use splashsurf_lib::nalgebra::SVector; +/// Enum for specifying the kernel function used for the reconstruction +#[gen_stub_pyclass_enum] +#[pyclass] +#[derive(Clone)] +pub enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + +impl KernelType { + pub fn into_lib_enum(&self) -> splashsurf_lib::kernel::KernelType { + match self { + KernelType::CubicSpline => splashsurf_lib::kernel::KernelType::CubicSpline, + KernelType::Poly6 => splashsurf_lib::kernel::KernelType::Poly6, + KernelType::Spiky => splashsurf_lib::kernel::KernelType::Spiky, + KernelType::WendlandQuinticC2 => splashsurf_lib::kernel::KernelType::WendlandQuinticC2, + } + } +} + /// The index type used for all grids and reconstructions in this crate pub(crate) type IndexT = i64; diff --git a/pysplashsurf/tests/test_basic.py b/pysplashsurf/tests/test_basic.py index 083cc47..2d38871 100644 --- a/pysplashsurf/tests/test_basic.py +++ b/pysplashsurf/tests/test_basic.py @@ -301,7 +301,7 @@ def interpolator_test(dtype): rest_mass = 1000.0 * 0.025**3 interpolator = pysplashsurf.SphInterpolator( - particles, reconstruction.particle_densities, rest_mass, compact_support + particles, reconstruction.particle_densities, rest_mass, compact_support, pysplashsurf.KernelType.CubicSpline ) assert type(interpolator) is pysplashsurf.SphInterpolator diff --git a/pysplashsurf/tests/test_calling.py b/pysplashsurf/tests/test_calling.py index c64f1ad..2c89f53 100644 --- a/pysplashsurf/tests/test_calling.py +++ b/pysplashsurf/tests/test_calling.py @@ -59,6 +59,7 @@ def reconstruction_pipeline( smoothing_length=2.0, cube_size=0.5, iso_surface_threshold=0.6, + kernel_type=pysplashsurf.KernelType.CubicSpline, mesh_smoothing_weights=False, output_mesh_smoothing_weights=False, sph_normals=False, @@ -109,6 +110,7 @@ def reconstruction_pipeline( rest_density=rest_density, smoothing_length=smoothing_length, cube_size=cube_size, + kernel_type=kernel_type, iso_surface_threshold=iso_surface_threshold, mesh_smoothing_weights=mesh_smoothing_weights, sph_normals=sph_normals, @@ -276,3 +278,56 @@ def test_with_post_processing_f32(): def test_with_post_processing_f64(): with_post_processing_test(np.float64) + + +def poly6_kernel_test(dtype): + start = now_s() + subprocess.run( + [BINARY_PATH] + + f"reconstruct {VTK_PATH} -o {DIR.joinpath('test_bin.vtk')} -r=0.025 -l=2.0 -c=0.5 -t=0.6 {'-d=on' if dtype == np.float64 else ''}
--kernel=poly6 --subdomain-grid=on --mesh-cleanup=off --mesh-smoothing-weights=off --mesh-smoothing-iters=0 --normals=off --normals-smoothing-iters=0".split(), + check=True, + ) + print("Binary done in", now_s() - start) + + start = now_s() + reconstruction_pipeline( + VTK_PATH, + DIR.joinpath("test.vtk"), + particle_radius=0.025, + smoothing_length=2.0, + cube_size=0.5, + iso_surface_threshold=0.6, + kernel_type=pysplashsurf.KernelType.Poly6, + mesh_smoothing_weights=False, + mesh_smoothing_iters=0, + normals_smoothing_iters=0, + mesh_cleanup=False, + compute_normals=False, + subdomain_grid=True, + dtype=dtype + ) + print("Python done in", now_s() - start) + + binary_mesh = meshio.read(DIR.joinpath("test_bin.vtk")) + python_mesh = meshio.read(DIR.joinpath("test.vtk")) + + binary_verts = np.array(binary_mesh.points, dtype=dtype) + python_verts = np.array(python_mesh.points, dtype=dtype) + + print("# of vertices binary:", len(binary_verts)) + print("# of vertices python:", len(python_verts)) + + assert len(binary_verts) == len(python_verts) + + binary_verts.sort(axis=0) + python_verts.sort(axis=0) + + assert np.allclose(binary_verts, python_verts) + + +def test_poly6_kernel_f32(): + poly6_kernel_test(np.float32) + + +def test_poly6_kernel_f64(): + poly6_kernel_test(np.float64) diff --git a/splashsurf/src/cli.rs b/splashsurf/src/cli.rs index 060ebf3..275f6be 100644 --- a/splashsurf/src/cli.rs +++ b/splashsurf/src/cli.rs @@ -80,6 +80,26 @@ impl Switch { } } +/// An enum for specifying the wanted kernel type in the CLI +#[derive(Copy, Clone, Debug, PartialEq, Eq, clap::ValueEnum)] +pub(crate) enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + +impl KernelType { + pub(crate) fn into_lib_enum(self) -> splashsurf_lib::kernel::KernelType { + match self { + KernelType::CubicSpline => splashsurf_lib::kernel::KernelType::CubicSpline, + KernelType::Poly6 => splashsurf_lib::kernel::KernelType::Poly6, + KernelType::Spiky => splashsurf_lib::kernel::KernelType::Spiky, + KernelType::WendlandQuinticC2 => splashsurf_lib::kernel::KernelType::WendlandQuinticC2, + } + } +} + /// Runs the splashsurf CLI with the provided command line arguments. /// /// This function behaves like the binary `splashsurf` command line tool including output to stdout diff --git a/splashsurf/src/reconstruct.rs b/splashsurf/src/reconstruct.rs index 0cc8428..b5119ac 100644 --- a/splashsurf/src/reconstruct.rs +++ b/splashsurf/src/reconstruct.rs @@ -1,6 +1,6 @@ //! Implementation of the `reconstruct` subcommand of the splashsurf CLI. -use crate::cli::Switch; +use crate::cli::{KernelType, Switch}; use crate::reconstruct::arguments::*; use crate::{io, logging}; use anyhow::{Context, anyhow}; @@ -135,7 +135,16 @@ pub(crate) struct ReconstructSubcommandArgs { require_equals = true )] pub simd: Switch, - + /// Kernel that is used for the reconstruction and interpolation. 
+ #[arg( + help_heading = ARGS_ADV, + long, + default_value = "cubic-spline", + value_name = "cubic-spline|poly6|spiky|wendland-quintic-c2", + ignore_case = true, + require_equals = true + )] + pub kernel: KernelType, /// Enable automatic spatial decomposition using a regular grid-based approach (for efficient multithreading) if the domain is large enough #[arg( help_heading = ARGS_DECOMPOSITION, @@ -651,6 +660,7 @@ pub(crate) mod arguments { enable_simd: args.simd.into_bool(), spatial_decomposition, global_neighborhood_list: args.mesh_smoothing_weights.into_bool(), + kernel_type: args.kernel.into_lib_enum(), }; // Optionally initialize thread pool @@ -1143,6 +1153,7 @@ pub fn reconstruction_pipeline<'a, I: Index, R: Real>( particle_densities, particle_rest_mass, params.compact_support_radius, + params.kernel_type, )) } else { None diff --git a/splashsurf_lib/benches/benches/bench_full.rs b/splashsurf_lib/benches/benches/bench_full.rs index be2476a..23af42f 100644 --- a/splashsurf_lib/benches/benches/bench_full.rs +++ b/splashsurf_lib/benches/benches/bench_full.rs @@ -105,6 +105,7 @@ pub fn surface_reconstruction_dam_break(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); @@ -165,6 +166,7 @@ pub fn surface_reconstruction_double_dam_break(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); @@ -225,6 +227,7 @@ pub fn surface_reconstruction_double_dam_break_inplace(c: &mut Criterion) { enable_simd: true, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; let mut group = c.benchmark_group("full surface reconstruction"); diff --git a/splashsurf_lib/benches/benches/bench_grid_loop.rs b/splashsurf_lib/benches/benches/bench_grid_loop.rs index 4fd5ff2..ee6898f 100644 --- a/splashsurf_lib/benches/benches/bench_grid_loop.rs +++ b/splashsurf_lib/benches/benches/bench_grid_loop.rs @@ -2,7 +2,7 @@ use criterion::{Criterion, criterion_group}; use nalgebra::{Scalar, Vector3}; use serde_derive::{Deserialize, Serialize}; use splashsurf_lib::dense_subdomains; -use splashsurf_lib::kernel::CubicSplineKernel; +use splashsurf_lib::kernel::{CubicSplineKernel, SymmetricKernel3d}; use splashsurf_lib::uniform_grid::UniformCartesianCubeGrid3d; use std::time::Duration; @@ -231,7 +231,6 @@ pub fn grid_loop_avx2(c: &mut Criterion) { #[cfg(all(target_feature = "avx2", target_feature = "fma"))] { let avx_result = unsafe { - let kernel = CubicSplineKernel::new(params.compact_support_radius); let mut params = params.clone(); dense_subdomains::density_grid_loop_avx( params.levelset_grid.as_mut_slice(), @@ -243,7 +242,8 @@ pub fn grid_loop_avx2(c: &mut Criterion) { params.cube_radius, params.squared_support_with_margin, params.particle_rest_mass, - &kernel, + params.compact_support_radius, + &splashsurf_lib::kernel::KernelType::CubicSpline, ); params.levelset_grid }; @@ -268,7 +268,6 @@ pub fn grid_loop_avx2(c: &mut Criterion) { unsafe { group.bench_function("grid_loop_avx2", |b| { b.iter(|| { - let kernel = CubicSplineKernel::new(params.compact_support_radius); let mut params = params.clone(); 
dense_subdomains::density_grid_loop_avx( params.levelset_grid.as_mut_slice(), @@ -280,7 +279,8 @@ pub fn grid_loop_avx2(c: &mut Criterion) { params.cube_radius, params.squared_support_with_margin, params.particle_rest_mass, - &kernel, + params.compact_support_radius, + &splashsurf_lib::kernel::KernelType::CubicSpline, ); }) }); diff --git a/splashsurf_lib/benches/benches/bench_mesh.rs b/splashsurf_lib/benches/benches/bench_mesh.rs index 4d49d00..f8caa3d 100644 --- a/splashsurf_lib/benches/benches/bench_mesh.rs +++ b/splashsurf_lib/benches/benches/bench_mesh.rs @@ -29,6 +29,7 @@ fn reconstruct_particles>(particle_file: P) -> SurfaceReconstruct auto_disable: false, }), global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, }; reconstruct_surface::(particle_positions.as_slice(), ¶meters).unwrap() diff --git a/splashsurf_lib/benches/benches/bench_subdomain_grid.rs b/splashsurf_lib/benches/benches/bench_subdomain_grid.rs index 592ca81..b91316c 100644 --- a/splashsurf_lib/benches/benches/bench_subdomain_grid.rs +++ b/splashsurf_lib/benches/benches/bench_subdomain_grid.rs @@ -28,6 +28,7 @@ fn parameters_canyon() -> Parameters { auto_disable: false, }), global_neighborhood_list: false, + kernel_type: splashsurf_lib::kernel::KernelType::CubicSpline, } } diff --git a/splashsurf_lib/src/dense_subdomains.rs b/splashsurf_lib/src/dense_subdomains.rs index ed19c6d..71e8803 100644 --- a/splashsurf_lib/src/dense_subdomains.rs +++ b/splashsurf_lib/src/dense_subdomains.rs @@ -16,7 +16,10 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use thread_local::ThreadLocal; use crate::density_map::sequential_compute_particle_densities_filtered; -use crate::kernel::{CubicSplineKernel, SymmetricKernel3d}; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, SymmetricKernel3d, + WendlandQuinticC2Kernel, +}; use crate::marching_cubes::marching_cubes_lut::marching_cubes_triangulation_iter; use crate::mesh::{HexMesh3d, TriMesh3d}; use crate::neighborhood_search::{ @@ -59,6 +62,8 @@ pub(crate) struct ParametersSubdomainGrid { pub chunk_size: usize, /// Whether to return the global particle neighborhood list instead of only using per-domain lists internally pub global_neighborhood_list: bool, + /// The SPH kernel that should be used + pub kernel_type: KernelType, } impl ParametersSubdomainGrid { @@ -240,6 +245,7 @@ pub(crate) fn initialize_parameters( subdomain_grid, chunk_size, global_neighborhood_list: parameters.global_neighborhood_list, + kernel_type: parameters.kernel_type.clone(), }) } @@ -493,7 +499,7 @@ pub(crate) fn decomposition< }) } -pub(crate) fn compute_global_densities_and_neighbors( +pub(crate) fn compute_global_densities_and_neighbors>( parameters: &ParametersSubdomainGrid, global_particles: &[Vector3], subdomains: &Subdomains, @@ -583,7 +589,7 @@ pub(crate) fn compute_global_densities_and_neighbors( |i| is_inside[i], ); - sequential_compute_particle_densities_filtered::( + sequential_compute_particle_densities_filtered::( subdomain_particles, neighborhood_lists, parameters.compact_support_radius, @@ -712,7 +718,7 @@ fn local_to_global_point_ijk( } /// Auto-dispatching density grid loop for f32/i64: chooses AVX2+FMA on x86, NEON on aarch64, otherwise scalar fallback -pub fn density_grid_loop_auto>( +pub fn density_grid_loop_auto( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -722,7 +728,8 @@ pub fn density_grid_loop_auto>( cube_radius: i64, squared_support_with_margin: f32, 
particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { @@ -739,7 +746,8 @@ pub fn density_grid_loop_auto>( cube_radius, squared_support_with_margin, particle_rest_mass, - kernel, + kernel_support, + kernel_type, ); } } @@ -760,25 +768,72 @@ pub fn density_grid_loop_auto>( cube_radius, squared_support_with_margin, particle_rest_mass, - kernel, + kernel_support, + kernel_type, ); } } } // Fallback: scalar generic implementation - density_grid_loop_scalar::( - levelset_grid, - subdomain_particles, - subdomain_particle_densities, - subdomain_mc_grid, - subdomain_ijk, - global_mc_grid, - cube_radius, - squared_support_with_margin, - particle_rest_mass, - kernel, - ); + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &CubicSplineKernel::new(kernel_support), + ); + } + KernelType::Poly6 => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &Poly6Kernel::new(kernel_support), + ); + } + KernelType::Spiky => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &SpikyKernel::new(kernel_support), + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop_scalar::>( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + squared_support_with_margin, + particle_rest_mass, + &WendlandQuinticC2Kernel::new(kernel_support), + ); + } + } } pub fn density_grid_loop_scalar>( @@ -848,7 +903,7 @@ pub fn density_grid_loop_scalar>( #[cfg(all(target_arch = "aarch64"))] #[target_feature(enable = "neon")] -pub fn density_grid_loop_neon>( +pub fn density_grid_loop_neon( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -858,8 +913,10 @@ pub fn density_grid_loop_neon>( cube_radius: i64, _squared_support_with_margin: f32, particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { + use crate::kernel::NeonKernel; use core::arch::aarch64::*; const LANES: usize = 4; @@ -875,120 +932,193 @@ pub fn density_grid_loop_neon>( u32::MAX ); - let kernel_neon = kernel::CubicSplineKernelNeonF32::new(kernel.compact_support_radius()); - - profile!("density grid loop (neon)"); - let mc_grid = subdomain_mc_grid; - let mc_points = mc_grid.points_per_dim(); - let dim_y = mc_points[1] as usize; - let dim_z = mc_points[2] as usize; - - // Preload constants used inside the hot loops - const K_OFFSETS_ARR: [u32; 4] = [0, 1, 2, 3]; - let k_offsets = unsafe { vld1q_u32(K_OFFSETS_ARR.as_ptr()) }; - let global_min = global_mc_grid.aabb().min(); - let min_z_v = vdupq_n_f32(global_min[2]); - let cube_size = global_mc_grid.cell_size(); - let support = kernel.compact_support_radius(); - let support_sq_v = vdupq_n_f32(support * support); - - for (p_i, rho_i) in subdomain_particles - .iter() - .copied() - .zip(subdomain_particle_densities.iter().copied()) - { - let v_i = particle_rest_mass / rho_i; 
- - let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); - let lower = lower.map(|i| i as usize); - let upper = upper.map(|i| i as usize); - - let remainder = (upper[2] - lower[2]) % LANES; - let upper_k_aligned = upper[2] - remainder; - - let subdomain_ijk = subdomain_ijk.map(|i| i as u32); - let mc_cells = mc_grid.cells_per_dim().map(|i| i as u32); - - // Broadcast particle coordinates once per particle - let particle_xs = vdupq_n_f32(p_i[0]); - let particle_ys = vdupq_n_f32(p_i[1]); - let particle_zs = vdupq_n_f32(p_i[2]); - - // Function to evaluate kernel contribution of current particle to 8 grid points at once - let evaluate_contribution = - |k: usize, grid_xs: float32x4_t, grid_ys: float32x4_t| -> float32x4_t { - // Compute global k for 4 consecutive points - let global_k_base = vdupq_n_u32(subdomain_ijk[2] * mc_cells[2] + k as u32); - let global_k = vaddq_u32(global_k_base, k_offsets); - let global_k_f32 = vcvtq_f32_u32(global_k); - let grid_zs = vmlaq_n_f32(min_z_v, global_k_f32, cube_size); - // Deltas - let dxs = vsubq_f32(particle_xs, grid_xs); - let dys = vsubq_f32(particle_ys, grid_ys); - let dzs = vsubq_f32(particle_zs, grid_zs); - // Distance squared - let dist_sq = vmlaq_f32(vmulq_f32(dxs, dxs), dys, dys); - let dist_sq = vmlaq_f32(dist_sq, dzs, dzs); - // Cheap mask to skip work if all lanes are outside support - let inside_mask = vcltq_f32(dist_sq, support_sq_v); - let any_inside = vmaxvq_u32(inside_mask); - if any_inside == 0 { - return vdupq_n_f32(0.0); - } - // Compute weights, then mask out lanes outside support - let dist = vsqrtq_f32(dist_sq); - let w = kernel_neon.evaluate(dist); - vbslq_f32(inside_mask, w, vdupq_n_f32(0.0)) - }; - - // Loop over grid points in support region of the particle - for i in lower[0]..upper[0] { - for j in lower[1]..upper[1] { - let flat_index_base = (i * dim_y + j) * dim_z; - - let global_i = subdomain_ijk[0] * mc_cells[0] + i as u32; - let global_j = subdomain_ijk[1] * mc_cells[1] + j as u32; - - let grid_x = global_i as f32 * cube_size + global_min[0]; - let grid_y = global_j as f32 * cube_size + global_min[1]; - let grid_xs = vdupq_n_f32(grid_x); - let grid_ys = vdupq_n_f32(grid_y); - - for k in (lower[2]..upper_k_aligned).step_by(LANES) { - let flat_point_idx = flat_index_base + k; - let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + #[target_feature(enable = "neon")] + fn density_grid_loop( + levelset_grid: &mut [f32], + subdomain_particles: &[Vector3], + subdomain_particle_densities: &[f32], + subdomain_mc_grid: &UniformCartesianCubeGrid3d, + subdomain_ijk: &[i64; 3], + global_mc_grid: &UniformCartesianCubeGrid3d, + cube_radius: i64, + _squared_support_with_margin: f32, + particle_rest_mass: f32, + kernel_support: f32, + ) { + let kernel_neon = K::new(kernel_support); + + profile!("density grid loop (neon)"); + let mc_grid = subdomain_mc_grid; + let mc_points = mc_grid.points_per_dim(); + let dim_y = mc_points[1] as usize; + let dim_z = mc_points[2] as usize; + + // Preload constants used inside the hot loops + const K_OFFSETS_ARR: [u32; 4] = [0, 1, 2, 3]; + let k_offsets = unsafe { vld1q_u32(K_OFFSETS_ARR.as_ptr()) }; + let global_min = global_mc_grid.aabb().min(); + let min_z_v = vdupq_n_f32(global_min[2]); + let cube_size = global_mc_grid.cell_size(); + let support = kernel_support; + let support_sq_v = vdupq_n_f32(support * support); + + for (p_i, rho_i) in subdomain_particles + .iter() + .copied() + .zip(subdomain_particle_densities.iter().copied()) + { + let v_i = particle_rest_mass / rho_i; + + 
let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); + let lower = lower.map(|i| i as usize); + let upper = upper.map(|i| i as usize); + + let remainder = (upper[2] - lower[2]) % LANES; + let upper_k_aligned = upper[2] - remainder; + + let subdomain_ijk = subdomain_ijk.map(|i| i as u32); + let mc_cells = mc_grid.cells_per_dim().map(|i| i as u32); + + // Broadcast particle coordinates once per particle + let particle_xs = vdupq_n_f32(p_i[0]); + let particle_ys = vdupq_n_f32(p_i[1]); + let particle_zs = vdupq_n_f32(p_i[2]); + + // Function to evaluate kernel contribution of current particle to 8 grid points at once + let evaluate_contribution = + |k: usize, grid_xs: float32x4_t, grid_ys: float32x4_t| -> float32x4_t { + // Compute global k for 4 consecutive points + let global_k_base = vdupq_n_u32(subdomain_ijk[2] * mc_cells[2] + k as u32); + let global_k = vaddq_u32(global_k_base, k_offsets); + let global_k_f32 = vcvtq_f32_u32(global_k); + let grid_zs = vmlaq_n_f32(min_z_v, global_k_f32, cube_size); + // Deltas + let dxs = vsubq_f32(particle_xs, grid_xs); + let dys = vsubq_f32(particle_ys, grid_ys); + let dzs = vsubq_f32(particle_zs, grid_zs); + // Distance squared + let dist_sq = vmlaq_f32(vmulq_f32(dxs, dxs), dys, dys); + let dist_sq = vmlaq_f32(dist_sq, dzs, dzs); + // Cheap mask to skip work if all lanes are outside support + let inside_mask = vcltq_f32(dist_sq, support_sq_v); + let any_inside = vmaxvq_u32(inside_mask); + if any_inside == 0 { + return vdupq_n_f32(0.0); + } + // Compute weights, then mask out lanes outside support + let dist = vsqrtq_f32(dist_sq); + let w = unsafe { kernel_neon.evaluate(dist) }; + vbslq_f32(inside_mask, w, vdupq_n_f32(0.0)) + }; - let prev_val = unsafe { - vld1q_f32(levelset_grid.as_ptr().offset(flat_point_idx as isize)) - }; - let val = vmlaq_n_f32(prev_val, w_ij, v_i); - unsafe { - vst1q_f32( - levelset_grid.as_mut_ptr().offset(flat_point_idx as isize), - val, - ) - }; - } + // Loop over grid points in support region of the particle + for i in lower[0]..upper[0] { + for j in lower[1]..upper[1] { + let flat_index_base = (i * dim_y + j) * dim_z; + + let global_i = subdomain_ijk[0] * mc_cells[0] + i as u32; + let global_j = subdomain_ijk[1] * mc_cells[1] + j as u32; + + let grid_x = global_i as f32 * cube_size + global_min[0]; + let grid_y = global_j as f32 * cube_size + global_min[1]; + let grid_xs = vdupq_n_f32(grid_x); + let grid_ys = vdupq_n_f32(grid_y); + + for k in (lower[2]..upper_k_aligned).step_by(LANES) { + let flat_point_idx = flat_index_base + k; + let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + + let prev_val = unsafe { + vld1q_f32(levelset_grid.as_ptr().offset(flat_point_idx as isize)) + }; + let val = vmlaq_n_f32(prev_val, w_ij, v_i); + unsafe { + vst1q_f32( + levelset_grid.as_mut_ptr().offset(flat_point_idx as isize), + val, + ) + }; + } - // Handle remainder - if remainder > 0 { - let flat_point_idx = flat_index_base + upper_k_aligned; - let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); - let w_ij_v = vmulq_n_f32(w_ij, v_i); - let w_ij_v = - unsafe { std::mem::transmute::(w_ij_v) }; - for rr in 0..remainder { - levelset_grid[flat_point_idx + rr] += w_ij_v[rr]; + // Handle remainder + if remainder > 0 { + let flat_point_idx = flat_index_base + upper_k_aligned; + let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); + let w_ij_v = vmulq_n_f32(w_ij, v_i); + let w_ij_v = + unsafe { std::mem::transmute::(w_ij_v) }; + for rr in 0..remainder { + levelset_grid[flat_point_idx + rr] += 
w_ij_v[rr]; + } } } } } } + + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Poly6 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Spiky => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[target_feature(enable = "avx2,fma")] -pub fn density_grid_loop_avx>( +pub fn density_grid_loop_avx( levelset_grid: &mut [f32], subdomain_particles: &[Vector3], subdomain_particle_densities: &[f32], @@ -998,13 +1128,16 @@ pub fn density_grid_loop_avx>( cube_radius: i64, _squared_support_with_margin: f32, particle_rest_mass: f32, - kernel: &K, + kernel_support: f32, + kernel_type: &KernelType, ) { #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; + use crate::kernel::AvxKernel; + const LANES: usize = 8; // Ensure that we can safely compute global MC vertex positions via i32 indices per dimension @@ -1019,117 +1152,191 @@ pub fn density_grid_loop_avx>( i32::MAX ); - let kernel_avx = kernel::CubicSplineKernelAvxF32::new(kernel.compact_support_radius()); + #[target_feature(enable = "avx2,fma")] + fn density_grid_loop( + levelset_grid: &mut [f32], + subdomain_particles: &[Vector3], + subdomain_particle_densities: &[f32], + subdomain_mc_grid: &UniformCartesianCubeGrid3d, + subdomain_ijk: &[i64; 3], + global_mc_grid: &UniformCartesianCubeGrid3d, + cube_radius: i64, + _squared_support_with_margin: f32, + particle_rest_mass: f32, + kernel_support: f32, + ) { + let kernel_avx = K::new(kernel_support); + + profile!("density grid loop (avx)"); + let mc_grid = subdomain_mc_grid; + let mc_points = mc_grid.points_per_dim(); + let dim_y = mc_points[1] as usize; + let dim_z = mc_points[2] as usize; + + // Preload constants used inside the hot loops + let k_offsets_i32 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + let global_min = global_mc_grid.aabb().min(); + let min_z_v = _mm256_set1_ps(global_min[2]); + let cube_size = global_mc_grid.cell_size(); + let cube_size_ps = _mm256_set1_ps(cube_size); + let support = kernel_support; + let support_sq_v = _mm256_set1_ps(support * support); + + for (p_i, rho_i) in subdomain_particles + .iter() + .copied() + .zip(subdomain_particle_densities.iter().copied()) + { + let v_i = particle_rest_mass / rho_i; - profile!("density grid loop (avx)"); - let mc_grid = subdomain_mc_grid; - let mc_points = mc_grid.points_per_dim(); - let dim_y = mc_points[1] as usize; - let dim_z = mc_points[2] as usize; - - // Preload constants used inside the hot loops - let k_offsets_i32 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - let 
global_min = global_mc_grid.aabb().min(); - let min_z_v = _mm256_set1_ps(global_min[2]); - let cube_size = global_mc_grid.cell_size(); - let cube_size_ps = _mm256_set1_ps(cube_size); - let support = kernel.compact_support_radius(); - let support_sq_v = _mm256_set1_ps(support * support); + let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); + let lower = lower.map(|i| i as usize); + let upper = upper.map(|i| i as usize); - for (p_i, rho_i) in subdomain_particles - .iter() - .copied() - .zip(subdomain_particle_densities.iter().copied()) - { - let v_i = particle_rest_mass / rho_i; + let remainder = (upper[2] - lower[2]) % LANES; + let upper_k_aligned = upper[2] - remainder; - let (lower, upper) = particle_influence_aabb(&p_i, &mc_grid, cube_radius); - let lower = lower.map(|i| i as usize); - let upper = upper.map(|i| i as usize); - - let remainder = (upper[2] - lower[2]) % LANES; - let upper_k_aligned = upper[2] - remainder; - - let subdomain_ijk_i32 = subdomain_ijk.map(|i| i as i32); - let mc_cells = mc_grid.cells_per_dim().map(|i| i as i32); - - // Broadcast particle coordinates once per particle - let particle_xs = _mm256_set1_ps(p_i[0]); - let particle_ys = _mm256_set1_ps(p_i[1]); - let particle_zs = _mm256_set1_ps(p_i[2]); - let v_i_ps = _mm256_set1_ps(v_i); - - // Function to evaluate kernel contribution of current particle to 8 grid points at once - let evaluate_contribution = |k: usize, grid_xs: __m256, grid_ys: __m256| -> __m256 { - // Compute global k for 8 consecutive points using i32 lanes and convert to f32 - let base_k = subdomain_ijk_i32[2] * mc_cells[2] + k as i32; - let global_k_i32 = _mm256_add_epi32(_mm256_set1_epi32(base_k), k_offsets_i32); - let global_k_f32 = _mm256_cvtepi32_ps(global_k_i32); - let grid_zs = _mm256_fmadd_ps(global_k_f32, cube_size_ps, min_z_v); - - // Deltas - let dxs = _mm256_sub_ps(particle_xs, grid_xs); - let dys = _mm256_sub_ps(particle_ys, grid_ys); - let dzs = _mm256_sub_ps(particle_zs, grid_zs); - - // Distance squared - let dist_sq = { - let t = _mm256_fmadd_ps(dxs, dxs, _mm256_mul_ps(dys, dys)); - _mm256_fmadd_ps(dzs, dzs, t) - }; + let subdomain_ijk_i32 = subdomain_ijk.map(|i| i as i32); + let mc_cells = mc_grid.cells_per_dim().map(|i| i as i32); - // Cheap mask to skip work if all lanes are outside support - let inside_mask = _mm256_cmp_ps::<_CMP_LT_OQ>(dist_sq, support_sq_v); - if _mm256_movemask_ps(inside_mask) == 0 { - return _mm256_set1_ps(0.0); - } + // Broadcast particle coordinates once per particle + let particle_xs = _mm256_set1_ps(p_i[0]); + let particle_ys = _mm256_set1_ps(p_i[1]); + let particle_zs = _mm256_set1_ps(p_i[2]); + let v_i_ps = _mm256_set1_ps(v_i); - // Compute weights, then mask out lanes outside support - let dist = _mm256_sqrt_ps(dist_sq); - let w = kernel_avx.evaluate(dist); - _mm256_and_ps(w, inside_mask) - }; + // Function to evaluate kernel contribution of current particle to 8 grid points at once + let evaluate_contribution = |k: usize, grid_xs: __m256, grid_ys: __m256| -> __m256 { + // Compute global k for 8 consecutive points using i32 lanes and convert to f32 + let base_k = subdomain_ijk_i32[2] * mc_cells[2] + k as i32; + let global_k_i32 = _mm256_add_epi32(_mm256_set1_epi32(base_k), k_offsets_i32); + let global_k_f32 = _mm256_cvtepi32_ps(global_k_i32); + let grid_zs = _mm256_fmadd_ps(global_k_f32, cube_size_ps, min_z_v); - // Loop over grid points in support region of the particle - for i in lower[0]..upper[0] { - for j in lower[1]..upper[1] { - let flat_index_base = (i * dim_y + j) * 
dim_z; + // Deltas + let dxs = _mm256_sub_ps(particle_xs, grid_xs); + let dys = _mm256_sub_ps(particle_ys, grid_ys); + let dzs = _mm256_sub_ps(particle_zs, grid_zs); - let global_i = subdomain_ijk_i32[0] * mc_cells[0] + i as i32; - let global_j = subdomain_ijk_i32[1] * mc_cells[1] + j as i32; + // Distance squared + let dist_sq = { + let t = _mm256_fmadd_ps(dxs, dxs, _mm256_mul_ps(dys, dys)); + _mm256_fmadd_ps(dzs, dzs, t) + }; - let grid_x = global_i as f32 * cube_size + global_min[0]; - let grid_y = global_j as f32 * cube_size + global_min[1]; - let grid_xs = _mm256_set1_ps(grid_x); - let grid_ys = _mm256_set1_ps(grid_y); + // Cheap mask to skip work if all lanes are outside support + let inside_mask = _mm256_cmp_ps::<_CMP_LT_OQ>(dist_sq, support_sq_v); + if _mm256_movemask_ps(inside_mask) == 0 { + return _mm256_set1_ps(0.0); + } - for k in (lower[2]..upper_k_aligned).step_by(LANES) { - let flat_point_idx = flat_index_base + k; - let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + // Compute weights, then mask out lanes outside support + let dist = _mm256_sqrt_ps(dist_sq); + let w = unsafe { kernel_avx.evaluate(dist) }; + _mm256_and_ps(w, inside_mask) + }; - unsafe { - let prev_val = _mm256_loadu_ps(levelset_grid.as_ptr().add(flat_point_idx)); - let val = _mm256_fmadd_ps(w_ij, v_i_ps, prev_val); - _mm256_storeu_ps(levelset_grid.as_mut_ptr().add(flat_point_idx), val); + // Loop over grid points in support region of the particle + for i in lower[0]..upper[0] { + for j in lower[1]..upper[1] { + let flat_index_base = (i * dim_y + j) * dim_z; + + let global_i = subdomain_ijk_i32[0] * mc_cells[0] + i as i32; + let global_j = subdomain_ijk_i32[1] * mc_cells[1] + j as i32; + + let grid_x = global_i as f32 * cube_size + global_min[0]; + let grid_y = global_j as f32 * cube_size + global_min[1]; + let grid_xs = _mm256_set1_ps(grid_x); + let grid_ys = _mm256_set1_ps(grid_y); + + for k in (lower[2]..upper_k_aligned).step_by(LANES) { + let flat_point_idx = flat_index_base + k; + let w_ij = evaluate_contribution(k, grid_xs, grid_ys); + + unsafe { + let prev_val = + _mm256_loadu_ps(levelset_grid.as_ptr().add(flat_point_idx)); + let val = _mm256_fmadd_ps(w_ij, v_i_ps, prev_val); + _mm256_storeu_ps(levelset_grid.as_mut_ptr().add(flat_point_idx), val); + } } - } - // Handle remainder - if remainder > 0 { - let flat_point_idx = flat_index_base + upper_k_aligned; - let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); - unsafe { - let val = _mm256_mul_ps(w_ij, v_i_ps); - let mut tmp: [f32; LANES] = [0.0; LANES]; - _mm256_storeu_ps(tmp.as_mut_ptr(), val); - for rr in 0..remainder { - levelset_grid[flat_point_idx + rr] += tmp[rr]; + // Handle remainder + if remainder > 0 { + let flat_point_idx = flat_index_base + upper_k_aligned; + let w_ij = evaluate_contribution(upper_k_aligned, grid_xs, grid_ys); + unsafe { + let val = _mm256_mul_ps(w_ij, v_i_ps); + let mut tmp: [f32; LANES] = [0.0; LANES]; + _mm256_storeu_ps(tmp.as_mut_ptr(), val); + for rr in 0..remainder { + levelset_grid[flat_point_idx + rr] += tmp[rr]; + } } } } } } } + + match kernel_type { + KernelType::CubicSpline => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Poly6 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + 
global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::Spiky => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + KernelType::WendlandQuinticC2 => { + density_grid_loop::( + levelset_grid, + subdomain_particles, + subdomain_particle_densities, + subdomain_mc_grid, + subdomain_ijk, + global_mc_grid, + cube_radius, + _squared_support_with_margin, + particle_rest_mass, + kernel_support, + ); + } + } } pub fn density_grid_loop_sparse>( @@ -1212,7 +1419,7 @@ pub fn density_grid_loop_sparse>( } } -pub(crate) fn reconstruct_subdomains( +pub(crate) fn reconstruct_subdomains + Sync>( parameters: &ParametersSubdomainGrid, global_particles: &[Vector3], global_particle_densities: &[R], @@ -1228,7 +1435,7 @@ pub(crate) fn reconstruct_subdomains( let cube_radius = I::from((parameters.compact_support_radius / parameters.cube_size).ceil()) .expect("kernel radius in cubes has to fit in index type"); // Kernel - let kernel = CubicSplineKernel::new(parameters.compact_support_radius); + let kernel = K::new(parameters.compact_support_radius); //let kernel = DiscreteSquaredDistanceCubicKernel::new::(1000, parameters.compact_support_radius); let mc_total_cells = parameters.subdomain_cubes.cubed(); @@ -1436,7 +1643,8 @@ pub(crate) fn reconstruct_subdomains( cube_radius.to_i64().unwrap(), squared_support_with_margin.to_f32().unwrap(), parameters.particle_rest_mass.to_f32().unwrap(), - &CubicSplineKernel::new(parameters.compact_support_radius.to_f32().unwrap()), + parameters.compact_support_radius.to_f32().unwrap(), + ¶meters.kernel_type, ); } else { density_grid_loop_scalar( diff --git a/splashsurf_lib/src/density_map.rs b/splashsurf_lib/src/density_map.rs index 0043439..da3c3ec 100644 --- a/splashsurf_lib/src/density_map.rs +++ b/splashsurf_lib/src/density_map.rs @@ -24,7 +24,7 @@ //! indices, even if the density map is only generated for a smaller subdomain. 
use crate::aabb::Aabb3d; -use crate::kernel::{CubicSplineKernel, SymmetricKernel3d}; +use crate::kernel::SymmetricKernel3d; use crate::mesh::{HexMesh3d, MeshWithData, OwnedMeshAttribute}; use crate::neighborhood_search::NeighborhoodList; use crate::uniform_grid::UniformGrid; @@ -62,7 +62,7 @@ pub enum DensityMapError { /// Computes the individual densities of particles using a standard SPH sum #[inline(never)] -pub fn compute_particle_densities( +pub fn compute_particle_densities + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: &[Vec], compact_support_radius: R, @@ -71,7 +71,7 @@ pub fn compute_particle_densities( ) -> Vec { let mut densities = Vec::new(); if enable_multi_threading { - parallel_compute_particle_densities::( + parallel_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -79,7 +79,7 @@ pub fn compute_particle_densities( &mut densities, ) } else { - sequential_compute_particle_densities::( + sequential_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -92,7 +92,7 @@ pub fn compute_particle_densities( /// Computes the individual densities of particles inplace using a standard SPH sum #[inline(never)] -pub fn compute_particle_densities_inplace( +pub fn compute_particle_densities_inplace + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: &[Vec], compact_support_radius: R, @@ -101,7 +101,7 @@ pub fn compute_particle_densities_inplace( densities: &mut Vec, ) { if enable_multi_threading { - parallel_compute_particle_densities::( + parallel_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -109,7 +109,7 @@ pub fn compute_particle_densities_inplace( densities, ) } else { - sequential_compute_particle_densities::( + sequential_compute_particle_densities::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -127,7 +127,12 @@ fn init_density_storage(densities: &mut Vec, new_len: usize) { /// Computes the individual densities of particles using a standard SPH sum, sequential implementation #[inline(never)] -pub fn sequential_compute_particle_densities( +pub fn sequential_compute_particle_densities< + I: Index, + R: Real, + Nl: NeighborhoodList + ?Sized, + K: SymmetricKernel3d, +>( particle_positions: &[Vector3], particle_neighbor_lists: &Nl, compact_support_radius: R, @@ -136,7 +141,7 @@ pub fn sequential_compute_particle_densities( + sequential_compute_particle_densities_filtered::( particle_positions, particle_neighbor_lists, compact_support_radius, @@ -151,6 +156,7 @@ pub fn sequential_compute_particle_densities_filtered< I: Index, R: Real, Nl: NeighborhoodList + ?Sized, + K: SymmetricKernel3d, >( particle_positions: &[Vector3], particle_neighbor_lists: &Nl, @@ -164,7 +170,7 @@ pub fn sequential_compute_particle_densities_filtered< init_density_storage(particle_densities, particle_positions.len()); // Pre-compute the kernel which can be queried using squared distances - let kernel = CubicSplineKernel::new(compact_support_radius); + let kernel = K::new(compact_support_radius); for (i, particle_i_position) in particle_positions .iter() @@ -187,7 +193,7 @@ pub fn sequential_compute_particle_densities_filtered< /// Computes the individual densities of particles using a standard SPH sum, multi-threaded implementation #[inline(never)] -pub fn parallel_compute_particle_densities( +pub fn parallel_compute_particle_densities + Sync>( particle_positions: &[Vector3], particle_neighbor_lists: 
&[Vec], compact_support_radius: R, @@ -199,7 +205,7 @@ pub fn parallel_compute_particle_densities( init_density_storage(particle_densities, particle_positions.len()); // Pre-compute the kernel which can be queried using squared distances - let kernel = CubicSplineKernel::new(compact_support_radius); + let kernel = K::new(compact_support_radius); particle_positions .par_iter() @@ -313,7 +319,7 @@ impl<'a, I: Index, R: Real> DensityMap<'a, I, R> { /// Computes a sparse density map for the fluid based on the specified background grid #[inline(never)] -pub fn generate_sparse_density_map( +pub fn generate_sparse_density_map + Sync>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -334,7 +340,7 @@ pub fn generate_sparse_density_map( ); if allow_threading { - *density_map = parallel_generate_sparse_density_map( + *density_map = parallel_generate_sparse_density_map::<_, _, K>( grid, particle_positions, particle_densities, @@ -344,7 +350,7 @@ pub fn generate_sparse_density_map( cube_size, )? } else { - *density_map = sequential_generate_sparse_density_map( + *density_map = sequential_generate_sparse_density_map::<_, _, K>( grid, particle_positions, particle_densities, @@ -365,7 +371,7 @@ pub fn generate_sparse_density_map( /// Computes a sparse density map for the fluid based on the specified background grid, sequential implementation #[inline(never)] -pub fn sequential_generate_sparse_density_map( +pub fn sequential_generate_sparse_density_map>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -378,12 +384,13 @@ pub fn sequential_generate_sparse_density_map( let mut sparse_densities = new_map(); - let density_map_generator = SparseDensityMapGenerator::try_new( - grid, - compact_support_radius, - cube_size, - particle_rest_mass, - )?; + let density_map_generator: SparseDensityMapGenerator = + SparseDensityMapGenerator::try_new( + grid, + compact_support_radius, + cube_size, + particle_rest_mass, + )?; let process_particle = |particle_data: (&Vector3, R)| { let (particle, particle_density) = particle_data; @@ -412,7 +419,7 @@ pub fn sequential_generate_sparse_density_map( /// Computes a sparse density map for the fluid based on the specified background grid, multi-threaded implementation #[inline(never)] -pub fn parallel_generate_sparse_density_map( +pub fn parallel_generate_sparse_density_map + Sync>( grid: &UniformGrid, particle_positions: &[Vector3], particle_densities: &[R], @@ -428,12 +435,13 @@ pub fn parallel_generate_sparse_density_map( // Generate thread local density maps { - let density_map_generator = SparseDensityMapGenerator::try_new( - grid, - compact_support_radius, - cube_size, - particle_rest_mass, - )?; + let density_map_generator: SparseDensityMapGenerator = + SparseDensityMapGenerator::try_new( + grid, + compact_support_radius, + cube_size, + particle_rest_mass, + )?; profile!("generate thread local maps"); @@ -530,12 +538,12 @@ pub fn parallel_generate_sparse_density_map( } /// Internal helper type used to evaluate the density contribution for a particle -struct SparseDensityMapGenerator { +struct SparseDensityMapGenerator> { particle_rest_mass: R, half_supported_cells: I, supported_points: I, kernel_evaluation_radius_sq: R, - kernel: CubicSplineKernel, + kernel: K, allowed_domain: Aabb3d, } @@ -580,7 +588,7 @@ pub(crate) fn compute_kernel_evaluation_radius( } // TODO: Maybe remove allowed domain check? And require this is done before, using the active_particles array? 
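For context on the kernel parameter K now threaded through all of these density functions: the "standard SPH sum" named in the doc comments is the usual mass-weighted kernel sum. With the constant rest mass $m$ (the particle_rest_mass argument), compact support radius $h$ (compact_support_radius) and the neighborhood $\mathcal{N}(i)$ taken from the neighbor lists, it reads (conventionally including the particle's own contribution):

$$\rho_i \;=\; m\,W(0;\,h) \;+\; \sum_{j \in \mathcal{N}(i)} m\,W\!\left(\lVert \mathbf{x}_i - \mathbf{x}_j \rVert;\, h\right),$$

where $W$ was previously hard-wired to the cubic spline and is now any implementor of SymmetricKernel3d selected through the new KernelType.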
-impl<I: Index, R: Real> SparseDensityMapGenerator<I, R> {
+impl<I: Index, R: Real, K: SymmetricKernel3d<R>> SparseDensityMapGenerator<I, R, K> {
     fn try_new(
         grid: &UniformGrid<I, R>,
         compact_support_radius: R,
@@ -595,7 +603,7 @@ impl<I: Index, R: Real> SparseDensityMapGenerator<I, R> {
         // Pre-compute the kernel which can be queried using squared distances
         let kernel_evaluation_radius_sq = kernel_evaluation_radius * kernel_evaluation_radius;
-        let kernel = CubicSplineKernel::new(compact_support_radius);
+        let kernel = K::new(compact_support_radius);

         // Shrink the allowed domain for particles by the kernel evaluation radius. This ensures that all cells/points
         // that are affected by a particle are actually part of the domain/grid, so it does not have to be checked in the loops below.
diff --git a/splashsurf_lib/src/kernel.rs b/splashsurf_lib/src/kernel.rs
index 3b3c84a..21aa9dc 100644
--- a/splashsurf_lib/src/kernel.rs
+++ b/splashsurf_lib/src/kernel.rs
@@ -1,8 +1,17 @@
 //! SPH kernel function implementations
 //!
 //! Currently, the following SIMD implementations are provided:
-//! - `CubicSplineKernelAvxF32`: Only available on `x86` and `x86_64` targets, requires AVX2 and FMA support
-//! - `CubicSplineKernelNeonF32`: Only available on `aarch64` targets, requires NEON support
+//! - `CubicSplineKernelAvxF32`
+//! - `CubicSplineKernelNeonF32`
+//! - `Poly6KernelAvxF32`
+//! - `Poly6KernelNeonF32`
+//! - `SpikyKernelAvxF32`
+//! - `SpikyKernelNeonF32`
+//! - `WendlandQuinticC2KernelAvxF32`
+//! - `WendlandQuinticC2KernelNeonF32`
+//!
+//! The AVX implementations are only available on `x86` and `x86_64` targets and require AVX2 and FMA support.
+//! Similarly, the NEON implementations are only available on `aarch64` targets and require NEON support.
 //!
 //! Note that documentation of the SIMD kernels is only available on the respective target architectures.
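Since kernel construction moves onto the traits introduced in the next hunk, the SIMD kernels are now created through `AvxKernel`/`NeonKernel`, and the AVX2/FMA requirement from the module documentation still has to be checked at runtime on generically compiled binaries. A minimal sketch of that guard pattern, assuming the types above are re-exported under `splashsurf_lib::kernel`; the helper `cubic_kernel_batch` itself is hypothetical and not part of this patch:

```rust
// Sketch only: runtime dispatch between the AVX and scalar cubic spline kernels.
#[cfg(target_arch = "x86_64")]
fn cubic_kernel_batch(h: f32, radii: [f32; 8]) -> [f32; 8] {
    use splashsurf_lib::kernel::{
        AvxKernel, CubicSplineKernel, CubicSplineKernelAvxF32, SymmetricKernel3d,
    };

    let mut out = [0.0f32; 8];
    // AVX2 + FMA must be confirmed at runtime unless the binary is built
    // with these target features enabled globally
    if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
        use core::arch::x86_64::{_mm256_loadu_ps, _mm256_storeu_ps};
        let kernel = CubicSplineKernelAvxF32::new(h);
        // Safety: AVX2/FMA support was checked above
        unsafe {
            let w = kernel.evaluate(_mm256_loadu_ps(radii.as_ptr()));
            _mm256_storeu_ps(out.as_mut_ptr(), w);
        }
    } else {
        // Scalar fallback through the extended SymmetricKernel3d trait
        let kernel = CubicSplineKernel::new(h);
        for (w, r) in out.iter_mut().zip(radii) {
            *w = kernel.evaluate(r);
        }
    }
    out
}
```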
@@ -20,6 +29,14 @@ use core::arch::x86_64::__m256; // TODO: Add reference for the kernel function, document formula +#[derive(Clone, Debug)] +pub enum KernelType { + CubicSpline, + Poly6, + Spiky, + WendlandQuinticC2, +} + /// Utility functions for computing the volume of fluid particles pub struct Volume; @@ -37,6 +54,7 @@ impl Volume { /// Trait for symmetric kernel functions in three dimensions pub trait SymmetricKernel3d { + fn new(compact_support_radius: R) -> Self; /// Returns the compact support radius of the kernel fn compact_support_radius(&self) -> R; /// Evaluates the kernel at the radial distance `r` relative to the origin @@ -47,6 +65,22 @@ pub trait SymmetricKernel3d { fn evaluate_gradient_norm(&self, r: R) -> R; } +#[cfg(target_arch = "aarch64")] +pub unsafe trait NeonKernel { + fn new(compact_support_radius: f32) -> Self; + + /// Evaluates the kernel at the radial distance `r` relative to the origin + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t; +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub unsafe trait AvxKernel { + fn new(compact_support_radius: f32) -> Self; + + /// Evaluates the kernel at the radial distance `r` relative to the origin + unsafe fn evaluate(&self, r: __m256) -> __m256; +} + /// The commonly used cubic spline kernel pub struct CubicSplineKernel { /// Compact support radius of the kernel @@ -56,18 +90,6 @@ pub struct CubicSplineKernel { } impl CubicSplineKernel { - /// Initializes a cubic spline kernel with the given compact support radius - #[replace_float_literals(R::from_float(literal))] - pub fn new(compact_support_radius: R) -> Self { - let h = compact_support_radius; - let sigma = 8.0 / (h * h * h); - - Self { - compact_support_radius, - normalization: sigma, - } - } - /// The cubic spline function used by the cubic spline kernel #[replace_float_literals(R::from_float(literal))] fn cubic_function(q: R) -> R { @@ -96,6 +118,18 @@ impl CubicSplineKernel { } impl SymmetricKernel3d for CubicSplineKernel { + /// Initializes a cubic spline kernel with the given compact support radius + #[replace_float_literals(R::from_float(literal))] + fn new(compact_support_radius: R) -> Self { + let h = compact_support_radius; + let sigma = 8.0 / (h * h * h); + + Self { + compact_support_radius, + normalization: sigma, + } + } + fn compact_support_radius(&self) -> R { self.compact_support_radius } @@ -140,45 +174,6 @@ impl SymmetricKernel3d for CubicSplineKernel { } } -#[test] -fn test_cubic_kernel_r_compact_support() { - let hs = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let kernel = CubicSplineKernel::new(h); - assert_eq!(kernel.evaluate(h), 0.0); - assert_eq!(kernel.evaluate(2.0 * h), 0.0); - assert_eq!(kernel.evaluate(10.0 * h), 0.0); - } -} - -#[test] -fn test_cubic_kernel_r_integral() { - let hs = [0.025, 0.1, 2.0]; - let n = 10; - - for &h in hs.iter() { - let kernel = CubicSplineKernel::new(h); - - let dr = h / (n as f64); - let dvol = dr * dr * dr; - - let mut integral = 0.0; - for i in -n..n { - for j in -n..n { - for k in -n..n { - let r_in = Vector3::new(i as f64, j as f64, k as f64) * dr; - let r_out = Vector3::new((i + 1) as f64, (j + 1) as f64, (k + 1) as f64) * dr; - let r = ((r_in + r_out) * 0.5).norm(); - - integral += dvol * kernel.evaluate(r); - } - } - } - - assert!((integral - 1.0).abs() <= 1e-5); - } -} - /// Vectorized implementation of the cubic spline kernel using NEON instructions. Only available on `aarch64` targets. 
#[cfg(target_arch = "aarch64")] pub struct CubicSplineKernelNeonF32 { @@ -187,9 +182,9 @@ pub struct CubicSplineKernelNeonF32 { } #[cfg(target_arch = "aarch64")] -impl CubicSplineKernelNeonF32 { +unsafe impl NeonKernel for CubicSplineKernelNeonF32 { /// Initializes a cubic spline kernel with the given compact support radius - pub fn new(compact_support_radius: f32) -> Self { + fn new(compact_support_radius: f32) -> Self { let r = compact_support_radius; let compact_support_inv = 1.0 / r; let rrr = r * r * r; @@ -202,7 +197,7 @@ impl CubicSplineKernelNeonF32 { /// Evaluates the cubic spline kernel at the specified radial distances #[target_feature(enable = "neon")] - pub fn evaluate(&self, r: float32x4_t) -> float32x4_t { + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { use core::arch::aarch64::*; let one = vdupq_n_f32(1.0); @@ -235,88 +230,6 @@ impl CubicSplineKernelNeonF32 { } } -#[test] -#[cfg_attr( - not(all(target_arch = "aarch64", target_feature = "neon")), - ignore = "Skipped on non-aarch64 targets" -)] -fn test_cubic_spline_kernel_neon() { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - use core::arch::aarch64::*; - - // Test a few representative compact support radii - let hs: [f32; 3] = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let scalar = CubicSplineKernel::new(h); - let neon = CubicSplineKernelNeonF32::new(h); - - // Sample radii from 0 to 2h (beyond support should be 0) - let n: usize = 1024; - let mut r0: f32 = 0.0; - let dr: f32 = (2.0 * h) / (n as f32); - - for _chunk in 0..(n / 4) { - // Prepare 4 lanes of radii - let rs = [r0, r0 + dr, r0 + 2.0 * dr, r0 + 3.0 * dr]; - let r_vec = unsafe { vld1q_f32(rs.as_ptr()) }; - - // Evaluate NEON and store back to array - let w_vec = unsafe { neon.evaluate(r_vec) }; - let mut w_neon = [0.0f32; 4]; - unsafe { vst1q_f32(w_neon.as_mut_ptr(), w_vec) }; - - // Compare against scalar lane-wise - for lane in 0..4 { - let r_lane = rs[lane]; - let w_scalar = scalar.evaluate(r_lane); - let diff = (w_neon[lane] - w_scalar).abs(); - - // Absolute tolerance with mild relative component to be robust across scales - let tol = 5e-6_f32.max(2e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "NEON kernel mismatch (h={}, r={}, lane={}): neon={}, scalar={}, diff={}, tol={}", - h, - r_lane, - lane, - w_neon[lane], - w_scalar, - diff, - tol - ); - } - - r0 += 4.0 * dr; - } - - // Also check a couple of out-of-support points explicitly - for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { - let w_scalar = scalar.evaluate(r); - let w_neon = { - let v = unsafe { vld1q_f32([r, r, r, r].as_ptr()) }; - let w = unsafe { neon.evaluate(v) }; - let mut tmp = [0.0f32; 4]; - unsafe { vst1q_f32(tmp.as_mut_ptr(), w) }; - tmp[0] - }; - let diff = (w_neon - w_scalar).abs(); - let tol = 5e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "NEON kernel mismatch outside support (h={}, r={}): neon={}, scalar={}, diff={}, tol={}", - h, - r, - w_neon, - w_scalar, - diff, - tol - ); - } - } - } -} - /// Vectorized implementation of the cubic spline kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. 
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub struct CubicSplineKernelAvxF32 { @@ -325,9 +238,9 @@ pub struct CubicSplineKernelAvxF32 { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -impl CubicSplineKernelAvxF32 { +unsafe impl AvxKernel for CubicSplineKernelAvxF32 { /// Initializes a cubic spline kernel with the given compact support radius - pub fn new(compact_support_radius: f32) -> Self { + fn new(compact_support_radius: f32) -> Self { let r = compact_support_radius; let compact_support_inv = 1.0 / r; let rrr = r * r * r; @@ -340,7 +253,7 @@ impl CubicSplineKernelAvxF32 { /// Evaluates the cubic spline kernel at the specified radial distances #[target_feature(enable = "avx2,fma")] - pub fn evaluate(&self, r: __m256) -> __m256 { + unsafe fn evaluate(&self, r: __m256) -> __m256 { #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -378,108 +291,6 @@ impl CubicSplineKernelAvxF32 { } } -#[test] -#[cfg_attr( - not(all( - any(target_arch = "x86_64", target_arch = "x86"), - target_feature = "avx2", - target_feature = "fma" - )), - ignore = "Skipped on non-x86 targets" -)] -fn test_cubic_spline_kernel_avx() { - #[cfg(all( - any(target_arch = "x86_64", target_arch = "x86"), - target_feature = "avx2", - target_feature = "fma" - ))] - { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // Test a few representative compact support radii - let hs: [f32; 3] = [0.025, 0.1, 2.0]; - for &h in hs.iter() { - let scalar = CubicSplineKernel::new(h); - let avx = CubicSplineKernelAvxF32::new(h); - - // Sample radii from 0 to 2h (beyond support should be 0) - let n: usize = 1024; - let mut r0: f32 = 0.0; - let dr: f32 = (2.0 * h) / (n as f32); - - for _chunk in 0..(n / 8) { - // Prepare 8 lanes of radii - let rs = [ - r0, - r0 + dr, - r0 + 2.0 * dr, - r0 + 3.0 * dr, - r0 + 4.0 * dr, - r0 + 5.0 * dr, - r0 + 6.0 * dr, - r0 + 7.0 * dr, - ]; - - // Evaluate AVX and store back to array - let r_vec = unsafe { _mm256_loadu_ps(rs.as_ptr()) }; - let w_vec = unsafe { avx.evaluate(r_vec) }; - let mut w_avx = [0.0f32; 8]; - unsafe { _mm256_storeu_ps(w_avx.as_mut_ptr(), w_vec) }; - - // Compare against scalar lane-wise - for lane in 0..8 { - let r_lane = rs[lane]; - let w_scalar = scalar.evaluate(r_lane); - let diff = (w_avx[lane] - w_scalar).abs(); - - // Absolute tolerance with mild relative component to be robust across scales - let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "AVX kernel mismatch (h={}, r={}, lane={}): avx={}, scalar={}, diff={}, tol={}", - h, - r_lane, - lane, - w_avx[lane], - w_scalar, - diff, - tol - ); - } - - r0 += 8.0 * dr; - } - - // Also check a couple of out-of-support points explicitly - for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { - let w_scalar = scalar.evaluate(r); - let w_avx = { - let v = unsafe { _mm256_set1_ps(r) }; - let w = unsafe { avx.evaluate(v) }; - let mut tmp = [0.0f32; 8]; - unsafe { _mm256_storeu_ps(tmp.as_mut_ptr(), w) }; - tmp[0] - }; - let diff = (w_avx - w_scalar).abs(); - let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); - assert!( - diff <= tol, - "AVX kernel mismatch outside support (h={}, r={}): avx={}, scalar={}, diff={}, tol={}", - h, - r, - w_avx, - w_scalar, - diff, - tol - ); - } - } - } -} - /// Accelerator for efficient evaluation of a precomputed cubic kernel /// /// This structure is used to pre-compute a discrete representation of the cubic kernel function. 
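For context, the accelerator documented above is exercised by the relocated `discrete_cubic_kernel` test further down in this patch. A standalone usage sketch (not part of the diff), using only the calls visible in that test:

```rust
use splashsurf_lib::kernel::DiscreteSquaredDistanceCubicKernel;

fn main() {
    // Compact support radius of the kernel
    let h = 0.025_f64;
    // Pre-compute 10000 discrete samples of the cubic spline kernel
    let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::<f64>(10000, h);

    // The accelerator is queried with *squared* distances, saving a sqrt in hot loops
    let r = 0.5 * h;
    let w = discrete_kernel.evaluate(r * r);
    println!("W({r}) ~= {w}");
}
```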
@@ -544,36 +355,886 @@ impl DiscreteSquaredDistanceCubicKernel {
     }
 }

-#[test]
-fn test_discrete_kernel() {
-    let n = 10000;
-    let h = 0.025;
+/// Poly6 kernel
+pub struct Poly6Kernel<R: Real> {
+    /// Compact support radius of the kernel
+    compact_support_radius: R,
+    /// Kernel normalization factor (sigma)
+    normalization: R,
+}

-    let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::<f64>(n, h);
-    let kernel = CubicSplineKernel::new(h);
+impl<R: Real> SymmetricKernel3d<R> for Poly6Kernel<R> {
+    /// Initializes a poly6 kernel with the given compact support radius
+    #[replace_float_literals(R::from_float(literal))]
+    fn new(compact_support_radius: R) -> Self {
+        let h = compact_support_radius;
+        let sigma = 315.0 / (64.0 * R::pi() * h.powi(9));

-    // Test the pre-computed values using a linear stepping
-    let dr = h / (n as f64);
-    for i in 0..n {
-        let r = (i as f64) * dr;
-        let rr = r * r;
+        Self {
+            compact_support_radius,
+            normalization: sigma,
+        }
+    }

-        let discrete = discrete_kernel.evaluate(rr);
-        let continuous = kernel.evaluate(r);
+    fn compact_support_radius(&self) -> R {
+        self.compact_support_radius
+    }

-        let diff = (discrete - continuous).abs();
-        let rel_diff = diff / continuous;
-        if rel_diff > 5e-2 && diff > 1e-1 {
-            eprintln!(
-                "at r={}, r/h={}, discrete: {}, continuous: {}, diff: {}, rel_diff: {}",
-                r,
-                r / h,
-                discrete,
-                continuous,
-                diff,
-                rel_diff
-            );
-            assert!(false);
+    /// Evaluates the poly6 kernel at the radial distance `r`
+    fn evaluate(&self, r: R) -> R {
+        let r2 = r * r;
+        let h2 = self.compact_support_radius.powi(2);
+        if r2 <= h2 {
+            self.normalization * (h2 - r2).powi(3)
+        } else {
+            R::zero()
         }
     }
+
+    /// Evaluates the gradient of the poly6 kernel at the position `x`
+    fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R> {
+        // Radial distance is norm of position
+        let r2 = x.norm_squared();
+        let h2 = self.compact_support_radius.powi(2);
+
+        if r2 <= h2 {
+            let r = r2.try_sqrt().unwrap();
+            // normalize x
+            let x = x.unscale(r);
+
+            // dW/dr = -6 * sigma * r * (h^2 - r^2)^2
+            x.scale(R::from_float(-6.0) * self.normalization * r * (h2 - r2).powi(2))
+        } else {
+            x.scale(R::zero())
+        }
+    }
+
+    /// Evaluates the norm of the gradient of the poly6 kernel at the radial distance `r`
+    fn evaluate_gradient_norm(&self, r: R) -> R {
+        let r2 = r * r;
+        let h2 = self.compact_support_radius.powi(2);
+
+        if r2 <= h2 {
+            R::from_float(-6.0) * self.normalization * r * (h2 - r2).powi(2)
+        } else {
+            R::zero()
+        }
+    }
+}
+
+/// Vectorized implementation of the poly6 kernel using NEON instructions. Only available on `aarch64` targets.
+#[cfg(target_arch = "aarch64")] +pub struct Poly6KernelNeonF32 { + squared_compact_support: f32, + sigma: f32, +} + +#[cfg(target_arch = "aarch64")] +unsafe impl NeonKernel for Poly6KernelNeonF32 { + /// Initializes a poly6 kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r9 = r.powi(9); + let sigma = 315.0 / (64.0 * std::f32::consts::PI * r9); + Self { + squared_compact_support: r * r, + sigma, + } + } + + /// Evaluates the poly6 kernel at the specified radial distances + #[target_feature(enable = "neon")] + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { + use core::arch::aarch64::*; + + let zero = vdupq_n_f32(0.0); + + let r2 = vmulq_f32(r, r); + let h2 = vdupq_n_f32(self.squared_compact_support); + + // (h2 - r2)^3 + let relevance = vsubq_f32(h2, r2); + let relevance2 = vmulq_f32(relevance, relevance); + let relevance3 = vmulq_f32(relevance2, relevance); + + // 315.0/(64.0*PI*h^9) + let normalization = vdupq_n_f32(self.sigma); + + let res = vmulq_f32(normalization, relevance3); + + // mask: r² <= h² + let mask = vcleq_f32(r2, h2); + + // blend: if mask true → res, else → zero + vbslq_f32(mask, res, zero) + } +} + +/// Vectorized implementation of the poly6 kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub struct Poly6KernelAvxF32 { + squared_compact_support: f32, + sigma: f32, +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +unsafe impl AvxKernel for Poly6KernelAvxF32 { + /// Initializes a poly6 kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r9 = r.powi(9); + let sigma = 315.0 / (64.0 * std::f32::consts::PI * r9); + Self { + squared_compact_support: r * r, + sigma, + } + } + + /// Evaluates the poly6 kernel at the specified radial distances + #[target_feature(enable = "avx2,fma")] + unsafe fn evaluate(&self, r: __m256) -> __m256 { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let zero = _mm256_set1_ps(0.0); + + let r2 = _mm256_mul_ps(r, r); + let h2 = _mm256_set1_ps(self.squared_compact_support); + + // (h2-r2)^3 + let relevance = _mm256_sub_ps(h2, r2); + let relevance2 = _mm256_mul_ps(relevance, relevance); + let relevance3 = _mm256_mul_ps(relevance2, relevance); + + // 315.0/(64.0*PI*h^9) + let normalization = _mm256_set1_ps(self.sigma); + + let res = _mm256_mul_ps(normalization, relevance3); + + // Only use if in compact support radius + let in_radius = _mm256_cmp_ps::<_CMP_LE_OQ>(r2, h2); + _mm256_blendv_ps(zero, res, in_radius) + } +} + +/// Spiky kernel +pub struct SpikyKernel { + /// Compact support radius of the kernel + compact_support_radius: R, + /// Kernel normalization factor (sigma) + normalization: R, +} + +impl SymmetricKernel3d for SpikyKernel { + /// Initializes a spiky kernel with the given compact support radius + #[replace_float_literals(R::from_float(literal))] + fn new(compact_support_radius: R) -> Self { + let h = compact_support_radius; + let sigma = 15.0 / (R::pi() * h.powi(6)); + + Self { + compact_support_radius, + normalization: sigma, + } + } + + fn compact_support_radius(&self) -> R { + self.compact_support_radius + } + + /// Evaluates the spiky kernel at the radial distance `r` + fn evaluate(&self, r: R) -> R { + let r2 = r * r; + let h2 = self.compact_support_radius.powi(2); + if r2 <= h2 
{ + self.normalization * (self.compact_support_radius - r).powi(3) + } else { + R::zero() + } + } + + /// Evaluates the gradient of the spiky kernel at the position `x` + fn evaluate_gradient(&self, x: Vector3) -> Vector3 { + // Radial distance is norm of position + let r2 = x.norm_squared(); + let r = r2.try_sqrt().unwrap(); + let h2 = self.compact_support_radius.powi(2); + + if r2 <= h2 { + // normalize x + let x = x.unscale(r); + + x.scale( + R::from_float(-3.0) + * self.normalization + * (self.compact_support_radius - r).powi(2), + ) + } else { + x.scale(R::zero()) + } + } + + /// Evaluates the norm of the gradient of the spiky kernel at the radial distance `r` + fn evaluate_gradient_norm(&self, r: R) -> R { + let r2 = r * r; + let h2 = self.compact_support_radius.powi(2); + + if r2 <= h2 { + R::from_float(-3.0) * self.normalization * (self.compact_support_radius - r).powi(2) + } else { + R::zero() + } + } +} + +/// Vectorized implementation of the spiky kernel using NEON instructions. Only available on `aarch64` targets. +#[cfg(target_arch = "aarch64")] +pub struct SpikyKernelNeonF32 { + compact_support: f32, + sigma: f32, +} + +#[cfg(target_arch = "aarch64")] +unsafe impl NeonKernel for SpikyKernelNeonF32 { + /// Initializes a spiky kernel with the given compact support radius + fn new(compact_support_radius: f32) -> Self { + let r = compact_support_radius; + let r6 = r.powi(6); + let sigma = 15.0 / (std::f32::consts::PI * r6); + Self { + compact_support: r, + sigma, + } + } + + /// Evaluates the spiky kernel at the specified radial distances + #[target_feature(enable = "neon")] + unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t { + use core::arch::aarch64::*; + + let zero = vdupq_n_f32(0.0); + let h = vdupq_n_f32(self.compact_support); + + let r2 = vmulq_f32(r, r); + let h2 = vmulq_f32(h, h); + + // (h - r)^3 + let relevance = vsubq_f32(h, r); + let relevance2 = vmulq_f32(relevance, relevance); + let relevance3 = vmulq_f32(relevance2, relevance); + + // 15.0/(PI*h^6) + let normalization = vdupq_n_f32(self.sigma); + + let res = vmulq_f32(normalization, relevance3); + + // mask: r2 <= h2 + let mask = vcleq_f32(r2, h2); + + // blend: if mask true → res, else → zero + vbslq_f32(mask, res, zero) + } +} + +/// Vectorized implementation of the spiky kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets. 
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub struct SpikyKernelAvxF32 {
+    compact_support: f32,
+    sigma: f32,
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+unsafe impl AvxKernel for SpikyKernelAvxF32 {
+    /// Initializes a spiky kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let r6 = r.powi(6);
+        let sigma = 15.0 / (std::f32::consts::PI * r6);
+        Self {
+            compact_support: r,
+            sigma,
+        }
+    }
+
+    /// Evaluates the spiky kernel at the specified radial distances
+    #[target_feature(enable = "avx2,fma")]
+    unsafe fn evaluate(&self, r: __m256) -> __m256 {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        let zero = _mm256_set1_ps(0.0);
+        let h = _mm256_set1_ps(self.compact_support);
+
+        let r2 = _mm256_mul_ps(r, r);
+        let h2 = _mm256_mul_ps(h, h);
+
+        // (h-r)^3
+        let relevance = _mm256_sub_ps(h, r);
+        let relevance2 = _mm256_mul_ps(relevance, relevance);
+        let relevance3 = _mm256_mul_ps(relevance2, relevance);
+
+        // 15.0/(PI*h^6)
+        let normalization = _mm256_set1_ps(self.sigma);
+
+        let res = _mm256_mul_ps(normalization, relevance3);
+
+        // Only use if in compact support radius
+        let in_radius = _mm256_cmp_ps::<_CMP_LE_OQ>(r2, h2);
+        _mm256_blendv_ps(zero, res, in_radius)
+    }
+}
+
+/// Quintic Wendland C2 kernel
+pub struct WendlandQuinticC2Kernel<R: Real> {
+    /// Compact support radius of the kernel
+    compact_support_radius: R,
+    /// Kernel normalization factor (sigma)
+    normalization: R,
+}
+
+impl<R: Real> SymmetricKernel3d<R> for WendlandQuinticC2Kernel<R> {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    #[replace_float_literals(R::from_float(literal))]
+    fn new(compact_support_radius: R) -> Self {
+        let h = compact_support_radius;
+        let sigma = 21.0 / (2.0 * R::pi() * h.powi(3));
+
+        Self {
+            compact_support_radius,
+            normalization: sigma,
+        }
+    }
+
+    fn compact_support_radius(&self) -> R {
+        self.compact_support_radius
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the radial distance `r`
+    fn evaluate(&self, r: R) -> R {
+        let q = r / self.compact_support_radius;
+        if q <= R::one() {
+            self.normalization * (R::one() - q).powi(4) * (R::from_float(4.0) * q + R::one())
+        } else {
+            R::zero()
+        }
+    }
+
+    /// Evaluates the gradient of the Wendland quintic C2 kernel at the position `x`
+    fn evaluate_gradient(&self, x: Vector3<R>) -> Vector3<R> {
+        // Radial distance is norm of position
+        let r = x.norm();
+        let q = r / self.compact_support_radius;
+
+        if q <= R::one() {
+            // Gradient of q: x normalized and scaled by 1/h
+            let gradq = x.unscale(r * self.compact_support_radius);
+
+            gradq.scale(R::from_float(-20.0) * self.normalization * q * (R::one() - q).powi(3))
+        } else {
+            x.scale(R::zero())
+        }
+    }
+
+    /// Evaluates the norm of the gradient of the Wendland quintic C2 kernel at the radial distance `r`
+    fn evaluate_gradient_norm(&self, r: R) -> R {
+        let q = r / self.compact_support_radius;
+
+        if q <= R::one() {
+            // dW/dr = dW/dq * dq/dr with dq/dr = 1/h
+            R::from_float(-20.0) * self.normalization * q * (R::one() - q).powi(3)
+                / self.compact_support_radius
+        } else {
+            R::zero()
+        }
+    }
+}
+
+/// Vectorized implementation of the Wendland quintic C2 kernel using NEON instructions. Only available on `aarch64` targets.
+#[cfg(target_arch = "aarch64")]
+pub struct WendlandQuinticC2KernelNeonF32 {
+    compact_support_inv: f32,
+    sigma: f32,
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe impl NeonKernel for WendlandQuinticC2KernelNeonF32 {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let compact_support_inv = 1.0 / r;
+        let r3 = r.powi(3);
+        let sigma = 21.0 / (2.0 * std::f32::consts::PI * r3);
+
+        Self {
+            compact_support_inv,
+            sigma,
+        }
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the specified radial distances
+    #[target_feature(enable = "neon")]
+    unsafe fn evaluate(&self, r: float32x4_t) -> float32x4_t {
+        use core::arch::aarch64::*;
+
+        let zero = vdupq_n_f32(0.0);
+        let one = vdupq_n_f32(1.0);
+        let four = vdupq_n_f32(4.0);
+
+        // q = r * compact_support_inv
+        let q = vmulq_n_f32(r, self.compact_support_inv);
+
+        // v = max(0, 1 - q), the clamping implicitly zeroes out lanes with q > 1
+        let mut v = vsubq_f32(one, q);
+        v = vmaxq_f32(v, zero);
+
+        // (1 - q)^4
+        let v2 = vmulq_f32(v, v);
+        let v4 = vmulq_f32(v2, v2);
+
+        // normalization factor (21.0/(2.0*PI*h^3)), stored in self.sigma
+        let normalization = vdupq_n_f32(self.sigma);
+
+        // (1-q)^4 * (4q + 1)
+        let fourq = vmulq_f32(four, q);
+        let fourq_plus_one = vaddq_f32(fourq, one);
+        let res = vmulq_f32(v4, fourq_plus_one);
+
+        // normalization * res
+        vmulq_f32(normalization, res)
+    }
+}
+
+/// Vectorized implementation of the Wendland quintic C2 kernel using AVX2 and FMA instructions. Only available on `x86` and `x86_64` targets.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub struct WendlandQuinticC2KernelAvxF32 {
+    compact_support_inv: f32,
+    sigma: f32,
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+unsafe impl AvxKernel for WendlandQuinticC2KernelAvxF32 {
+    /// Initializes a Wendland quintic C2 kernel with the given compact support radius
+    fn new(compact_support_radius: f32) -> Self {
+        let r = compact_support_radius;
+        let compact_support_inv = 1.0 / r;
+        let r3 = r.powi(3);
+        let sigma = 21.0 / (2.0 * std::f32::consts::PI * r3);
+
+        Self {
+            compact_support_inv,
+            sigma,
+        }
+    }
+
+    /// Evaluates the Wendland quintic C2 kernel at the specified radial distances
+    #[target_feature(enable = "avx2,fma")]
+    unsafe fn evaluate(&self, r: __m256) -> __m256 {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        let zero = _mm256_set1_ps(0.0);
+        let one = _mm256_set1_ps(1.0);
+        let four = _mm256_set1_ps(4.0);
+
+        let q = _mm256_mul_ps(r, _mm256_set1_ps(self.compact_support_inv));
+        let mut v = _mm256_sub_ps(one, q);
+        // Clamp v to be non-negative to implicitly zero out contributions with q > 1
+        v = _mm256_max_ps(v, zero);
+
+        // (1-q)^4
+        let v4 = _mm256_mul_ps(v, _mm256_mul_ps(v, _mm256_mul_ps(v, v)));
+
+        // 21.0/(2.0*PI*h^3)
+        let normalization = _mm256_set1_ps(self.sigma);
+
+        // (1-q)^4 * (4q+1)
+        let res = _mm256_mul_ps(v4, _mm256_add_ps(_mm256_mul_ps(four, q), one));
+
+        _mm256_mul_ps(normalization, res)
+    }
+}
+
+#[cfg(test)]
+mod kernel_tests {
+    use super::*;
+
+    macro_rules! test_kernel_r_compact_support {
+        ($kernel_class: ident) => {
+            let hs = [0.025, 0.1, 2.0];
+            for &h in hs.iter() {
+                let kernel = $kernel_class::new(h);
+                assert_eq!(kernel.evaluate(h), 0.0);
+                assert_eq!(kernel.evaluate(2.0 * h), 0.0);
+                assert_eq!(kernel.evaluate(10.0 * h), 0.0);
+            }
+        };
+    }
+
+    macro_rules!
test_kernel_r_integral { + ($kernel_class: ident) => { + let hs = [0.025, 0.1, 2.0]; + let n = 10; + + for &h in hs.iter() { + let kernel = $kernel_class::new(h); + + let dr = h / (n as f64); + let dvol = dr * dr * dr; + + let mut integral = 0.0; + for i in -n..n { + for j in -n..n { + for k in -n..n { + let r_in = Vector3::new(i as f64, j as f64, k as f64) * dr; + let r_out = + Vector3::new((i + 1) as f64, (j + 1) as f64, (k + 1) as f64) * dr; + let r = ((r_in + r_out) * 0.5).norm(); + + integral += dvol * kernel.evaluate(r); + } + } + } + + // println!("{}", integral); + assert!((integral - 1.0).abs() <= 1e-4); + } + }; + } + + macro_rules! test_kernel_avx { + ($kernel_class: ident, $kernel_class_avx: ident) => { + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + ))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // Test a few representative compact support radii + let hs: [f32; 3] = [0.025, 0.1, 2.0]; + for &h in hs.iter() { + let scalar = $kernel_class::new(h); + let avx = $kernel_class_avx::new(h); + + // Sample radii from 0 to 2h (beyond support should be 0) + let n: usize = 1024; + let mut r0: f32 = 0.0; + let dr: f32 = (2.0 * h) / (n as f32); + + for _chunk in 0..(n / 8) { + // Prepare 8 lanes of radii + let rs = [ + r0, + r0 + dr, + r0 + 2.0 * dr, + r0 + 3.0 * dr, + r0 + 4.0 * dr, + r0 + 5.0 * dr, + r0 + 6.0 * dr, + r0 + 7.0 * dr, + ]; + + // Evaluate AVX and store back to array + let r_vec = unsafe { _mm256_loadu_ps(rs.as_ptr()) }; + let w_vec = unsafe { avx.evaluate(r_vec) }; + let mut w_avx = [0.0f32; 8]; + unsafe { _mm256_storeu_ps(w_avx.as_mut_ptr(), w_vec) }; + + // Compare against scalar lane-wise + for lane in 0..8 { + let r_lane = rs[lane]; + let w_scalar = scalar.evaluate(r_lane); + let diff = (w_avx[lane] - w_scalar).abs(); + + // Absolute tolerance with mild relative component to be robust across scales + let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "AVX kernel mismatch (h={}, r={}, lane={}): avx={}, scalar={}, diff={}, tol={}", + h, + r_lane, + lane, + w_avx[lane], + w_scalar, + diff, + tol + ); + } + + r0 += 8.0 * dr; + } + + // Also check a couple of out-of-support points explicitly + for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { + let w_scalar = scalar.evaluate(r); + let w_avx = { + let v = unsafe { _mm256_set1_ps(r) }; + let w = unsafe { avx.evaluate(v) }; + let mut tmp = [0.0f32; 8]; + unsafe { _mm256_storeu_ps(tmp.as_mut_ptr(), w) }; + tmp[0] + }; + let diff = (w_avx - w_scalar).abs(); + let tol = 1e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "AVX kernel mismatch outside support (h={}, r={}): avx={}, scalar={}, diff={}, tol={}", + h, + r, + w_avx, + w_scalar, + diff, + tol + ); + } + } + } + }; + } + + macro_rules! 
test_kernel_neon { + ($kernel_class: ident, $kernel_class_neon: ident) => { + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + use core::arch::aarch64::*; + + // Test a few representative compact support radii + let hs: [f32; 3] = [0.025, 0.1, 2.0]; + for &h in hs.iter() { + let scalar = $kernel_class::new(h); + let neon = $kernel_class_neon::new(h); + + // Sample radii from 0 to 2h (beyond support should be 0) + let n: usize = 1024; + let mut r0: f32 = 0.0; + let dr: f32 = (2.0 * h) / (n as f32); + + for _chunk in 0..(n / 4) { + // Prepare 4 lanes of radii + let rs = [r0, r0 + dr, r0 + 2.0 * dr, r0 + 3.0 * dr]; + let r_vec = unsafe { vld1q_f32(rs.as_ptr()) }; + + // Evaluate NEON and store back to array + let w_vec = unsafe { neon.evaluate(r_vec) }; + let mut w_neon = [0.0f32; 4]; + unsafe { vst1q_f32(w_neon.as_mut_ptr(), w_vec) }; + + // Compare against scalar lane-wise + for lane in 0..4 { + let r_lane = rs[lane]; + let w_scalar = scalar.evaluate(r_lane); + let diff = (w_neon[lane] - w_scalar).abs(); + + // Absolute tolerance with mild relative component to be robust across scales + let tol = 5e-6_f32.max(2e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "NEON kernel mismatch (h={}, r={}, lane={}): neon={}, scalar={}, diff={}, tol={}", + h, + r_lane, + lane, + w_neon[lane], + w_scalar, + diff, + tol + ); + } + + r0 += 4.0 * dr; + } + + // Also check a couple of out-of-support points explicitly + for &r in &[h * 1.01, h * 1.5, h * 2.0, h * 2.5] { + let w_scalar = scalar.evaluate(r); + let w_neon = { + let v = unsafe { vld1q_f32([r, r, r, r].as_ptr()) }; + let w = unsafe { neon.evaluate(v) }; + let mut tmp = [0.0f32; 4]; + unsafe { vst1q_f32(tmp.as_mut_ptr(), w) }; + tmp[0] + }; + let diff = (w_neon - w_scalar).abs(); + let tol = 5e-6_f32.max(1e-5_f32 * w_scalar.abs()); + assert!( + diff <= tol, + "NEON kernel mismatch outside support (h={}, r={}): neon={}, scalar={}, diff={}, tol={}", + h, + r, + w_neon, + w_scalar, + diff, + tol + ); + } + } + } + }; + } + + #[test] + fn cubic_spline_kernel_r_compact_support() { + test_kernel_r_compact_support!(CubicSplineKernel); + } + + #[test] + fn cubic_spline_kernel_r_integral() { + test_kernel_r_integral!(CubicSplineKernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn cubic_spline_kernel_neon() { + test_kernel_neon!(CubicSplineKernel, CubicSplineKernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn cubic_spline_kernel_avx() { + test_kernel_avx!(CubicSplineKernel, CubicSplineKernelAvxF32); + } + + #[test] + fn discrete_cubic_kernel() { + let n = 10000; + let h = 0.025; + + let discrete_kernel = DiscreteSquaredDistanceCubicKernel::new::(n, h); + let kernel = CubicSplineKernel::new(h); + + // Test the pre-computed values using a linear stepping + let dr = h / (n as f64); + for i in 0..n { + let r = (i as f64) * dr; + let rr = r * r; + + let discrete = discrete_kernel.evaluate(rr); + let continuous = kernel.evaluate(r); + + let diff = (discrete - continuous).abs(); + let rel_diff = diff / continuous; + if rel_diff > 5e-2 && diff > 1e-1 { + eprintln!( + "at r={}, r/h={}, discrete: {}, continuous: {}, diff: {}, rel_diff: {}", + r, + r / h, + discrete, + continuous, + diff, + rel_diff + ); + assert!(false); + } + } + } + + #[test] + fn poly6_kernel_r_compact_support() { 
+ test_kernel_r_compact_support!(Poly6Kernel); + } + + #[test] + fn poly6_kernel_r_integral() { + test_kernel_r_integral!(Poly6Kernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn poly6_kernel_neon() { + test_kernel_neon!(Poly6Kernel, Poly6KernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn poly6_kernel_avx() { + test_kernel_avx!(Poly6Kernel, Poly6KernelAvxF32); + } + + #[test] + fn spiky_kernel_r_compact_support() { + test_kernel_r_compact_support!(SpikyKernel); + } + + #[test] + fn spiky_kernel_r_integral() { + test_kernel_r_integral!(SpikyKernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn spiky_kernel_neon() { + test_kernel_neon!(SpikyKernel, SpikyKernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn spiky_kernel_avx() { + test_kernel_avx!(SpikyKernel, SpikyKernelAvxF32); + } + + #[test] + fn wendland_quintic_c2_kernel_r_compact_support() { + test_kernel_r_compact_support!(WendlandQuinticC2Kernel); + } + + #[test] + fn wendland_quintic_c2_kernel_r_integral() { + test_kernel_r_integral!(WendlandQuinticC2Kernel); + } + + #[test] + #[cfg_attr( + not(all(target_arch = "aarch64", target_feature = "neon")), + ignore = "Skipped on non-aarch64 targets" + )] + fn wendland_quintic_c2_kernel_neon() { + test_kernel_neon!(WendlandQuinticC2Kernel, WendlandQuinticC2KernelNeonF32); + } + + #[test] + #[cfg_attr( + not(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "avx2", + target_feature = "fma" + )), + ignore = "Skipped on non-x86 targets" + )] + fn wendland_quintic_c2_kernel_avx() { + test_kernel_avx!(WendlandQuinticC2Kernel, WendlandQuinticC2KernelAvxF32); + } } diff --git a/splashsurf_lib/src/lib.rs b/splashsurf_lib/src/lib.rs index 35b347b..63abf08 100644 --- a/splashsurf_lib/src/lib.rs +++ b/splashsurf_lib/src/lib.rs @@ -42,6 +42,7 @@ pub use crate::traits::{Index, Real, RealConvert, ThreadSafe}; pub use crate::uniform_grid::UniformGrid; use crate::density_map::DensityMapError; +use crate::kernel::KernelType; use crate::marching_cubes::MarchingCubesError; use crate::mesh::TriMesh3d; use crate::uniform_grid::GridConstructionError; @@ -186,6 +187,7 @@ pub struct Parameters { /// Depending on the settings of the reconstruction, neighborhood lists are only computed locally /// in subdomains. Enabling this flag joins this data over all particles which can add a small overhead. 
pub global_neighborhood_list: bool, + pub kernel_type: KernelType, } impl Parameters { @@ -206,6 +208,7 @@ impl Parameters { enable_simd: true, spatial_decomposition: Default::default(), global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, } } @@ -238,6 +241,7 @@ impl Parameters { enable_simd: self.enable_simd, spatial_decomposition: self.spatial_decomposition.clone(), global_neighborhood_list: self.global_neighborhood_list, + kernel_type: self.kernel_type.clone(), }) } } diff --git a/splashsurf_lib/src/reconstruction.rs b/splashsurf_lib/src/reconstruction.rs index 2ebfce8..b3eaadd 100644 --- a/splashsurf_lib/src/reconstruction.rs +++ b/splashsurf_lib/src/reconstruction.rs @@ -2,17 +2,40 @@ use crate::dense_subdomains::{ compute_global_densities_and_neighbors, decomposition, initialize_parameters, reconstruct_subdomains, stitching, subdomain_classification::GhostMarginClassifier, }; +use crate::density_map::{compute_particle_densities_inplace, generate_sparse_density_map}; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, WendlandQuinticC2Kernel, +}; use crate::mesh::TriMesh3d; use crate::uniform_grid::UniformGrid; use crate::workspace::LocalReconstructionWorkspace; use crate::{ - Index, Parameters, Real, ReconstructionError, SurfaceReconstruction, density_map, kernel, - marching_cubes, neighborhood_search, profile, + Index, Parameters, Real, ReconstructionError, SurfaceReconstruction, kernel, marching_cubes, + neighborhood_search, profile, }; use anyhow::Context; use log::{info, trace}; use nalgebra::Vector3; +macro_rules! call_with_kernel_type { + ($function:ident; $kernel_type:expr; $( $param:expr ), *) => { + match $kernel_type { + KernelType::CubicSpline => { + $function::>($( $param ), *) + }, + KernelType::Poly6 => { + $function::>($( $param ), *) + }, + KernelType::Spiky => { + $function::>($( $param ), *) + }, + KernelType::WendlandQuinticC2 => { + $function::>($( $param ), *) + } + } + } +} + /// Performs a surface reconstruction with a regular grid for domain decomposition pub(crate) fn reconstruct_surface_subdomain_grid( particle_positions: &[Vector3], @@ -31,18 +54,22 @@ pub(crate) fn reconstruct_surface_subdomain_grid( let subdomains = decomposition::>(&internal_parameters, particle_positions)?; - let (particle_densities, particle_neighbors) = compute_global_densities_and_neighbors( + let (particle_densities, particle_neighbors) = call_with_kernel_type!( + compute_global_densities_and_neighbors; + parameters.kernel_type; &internal_parameters, particle_positions, - &subdomains, + &subdomains ); - let surface_patches = reconstruct_subdomains( + let surface_patches = call_with_kernel_type!( + reconstruct_subdomains; + parameters.kernel_type; &internal_parameters, particle_positions, &particle_densities, &subdomains, - parameters.enable_simd, + parameters.enable_simd ); let global_mesh = stitching(surface_patches); @@ -135,13 +162,15 @@ pub(crate) fn compute_particle_densities_and_neighbors( ); trace!("Computing particle densities..."); - density_map::compute_particle_densities_inplace::( + call_with_kernel_type!( + compute_particle_densities_inplace; + parameters.kernel_type; particle_positions, particle_neighbor_lists.as_slice(), parameters.compact_support_radius, particle_rest_mass, parameters.enable_multi_threading, - densities, + densities ); } @@ -171,7 +200,10 @@ pub(crate) fn reconstruct_single_surface_append( // Create a new density map, reusing memory with the workspace is bad for cache efficiency // Alternatively one 
could reuse memory with a custom caching allocator let mut density_map = Default::default(); - density_map::generate_sparse_density_map( + + call_with_kernel_type!( + generate_sparse_density_map; + parameters.kernel_type; grid, particle_positions, particle_densities, @@ -180,7 +212,7 @@ pub(crate) fn reconstruct_single_surface_append( parameters.compact_support_radius, parameters.cube_size, parameters.enable_multi_threading, - &mut density_map, + &mut density_map )?; marching_cubes::triangulate_density_map_append( diff --git a/splashsurf_lib/src/sph_interpolation.rs b/splashsurf_lib/src/sph_interpolation.rs index 6fe432e..42483d8 100644 --- a/splashsurf_lib/src/sph_interpolation.rs +++ b/splashsurf_lib/src/sph_interpolation.rs @@ -1,9 +1,12 @@ //! Functions for interpolating quantities (e.g. normals, scalar fields) by evaluating SPH sums use crate::Real; -use crate::kernel::SymmetricKernel3d; +use crate::ThreadSafe; +use crate::kernel::{ + CubicSplineKernel, KernelType, Poly6Kernel, SpikyKernel, SymmetricKernel3d, + WendlandQuinticC2Kernel, +}; use crate::profile; -use crate::{ThreadSafe, kernel}; use nalgebra::{SVector, Unit, Vector3}; use rayon::prelude::*; use rstar::RTree; @@ -14,6 +17,7 @@ use std::ops::AddAssign; pub struct SphInterpolator { compact_support_radius: R, tree: RTree>, + kernel_type: KernelType, } /// Particle type that is stored in the R-tree for fast SPH neighbor queries @@ -62,6 +66,7 @@ impl SphInterpolator { particle_densities: &[R], particle_rest_mass: R, compact_support_radius: R, + kernel_type: KernelType, ) -> Self { assert_eq!(particle_positions.len(), particle_densities.len()); @@ -70,6 +75,7 @@ impl SphInterpolator { Self { compact_support_radius, tree, + kernel_type, } } @@ -86,40 +92,65 @@ impl SphInterpolator { ) { profile!("interpolate_normals_inplace"); - let squared_support = self.compact_support_radius * self.compact_support_radius; - let kernel = kernel::CubicSplineKernel::new(self.compact_support_radius); - - interpolation_points - .par_iter() - .map(|x_i| { - // Compute the gradient of the particle density field which points in the same direction as surface normals - let mut density_grad = Vector3::zeros(); - - // SPH: Iterate over all other particles within the squared support radius - let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); - for p_j in self - .tree - .locate_within_distance(query_point, squared_support) - { - // Volume of the neighbor particle - let vol_j = p_j.data.volume; - // Position of the neighbor particle - let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); - - // Relative position `dx` and distance `r` of the neighbor particle - let dx = x_j - x_i; - let r = dx.norm(); - - // Compute the contribution of the neighbor to the gradient of the density field - // TODO: Replace this by a discrete gradient norm evaluation - let kernel_grad = dx.unscale(r) * kernel.evaluate_gradient_norm(r); - density_grad += kernel_grad * vol_j; - } - - // Normalize the gradient to get the surface normal - Unit::new_normalize(density_grad) - }) - .collect_into_vec(normals); + fn interpolate_normals_helper + Sync>( + sph: &SphInterpolator, + interpolation_points: &[Vector3], + normals: &mut Vec>>, + ) { + let squared_support = sph.compact_support_radius * sph.compact_support_radius; + + let kernel = K::new(sph.compact_support_radius); + + interpolation_points + .par_iter() + .map(|x_i| { + // Compute the gradient of the particle density field which points in the same direction as surface normals + let mut density_grad = Vector3::zeros(); + + // 
SPH: Iterate over all other particles within the squared support radius + let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); + for p_j in sph + .tree + .locate_within_distance(query_point, squared_support) + { + // Volume of the neighbor particle + let vol_j = p_j.data.volume; + // Position of the neighbor particle + let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); + + // Relative position `dx` and distance `r` of the neighbor particle + let dx = x_j - x_i; + let r = dx.norm(); + + // Compute the contribution of the neighbor to the gradient of the density field + // TODO: Replace this by a discrete gradient norm evaluation + let kernel_grad = dx.unscale(r) * kernel.evaluate_gradient_norm(r); + density_grad += kernel_grad * vol_j; + } + + // Normalize the gradient to get the surface normal + Unit::new_normalize(density_grad) + }) + .collect_into_vec(normals); + } + + match self.kernel_type { + KernelType::CubicSpline => interpolate_normals_helper::>( + self, + interpolation_points, + normals, + ), + KernelType::Poly6 => { + interpolate_normals_helper::>(self, interpolation_points, normals) + } + KernelType::Spiky => { + interpolate_normals_helper::>(self, interpolation_points, normals) + } + KernelType::WendlandQuinticC2 => interpolate_normals_helper::< + R, + WendlandQuinticC2Kernel, + >(self, interpolation_points, normals), + } } /// Interpolates surface normals (i.e. normalized SPH gradient of the indicator function) of the fluid to the given points using SPH interpolation @@ -212,49 +243,95 @@ impl SphInterpolator { profile!("interpolate_quantity_inplace"); assert_eq!(particle_quantity.len(), self.tree.size()); - let squared_support = self.compact_support_radius * self.compact_support_radius; - let kernel = kernel::CubicSplineKernel::new(self.compact_support_radius); - - let enable_correction = if first_order_correction { - R::one() - } else { - R::zero() - }; - - interpolation_points - .par_iter() - .map(|x_i| { - let mut interpolated_value = T::zero(); - let mut correction = R::zero(); - - // SPH: Iterate over all other particles within the squared support radius - let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); - for p_j in self - .tree - .locate_within_distance(query_point, squared_support) - { - // Volume of the neighbor particle - let vol_j = p_j.data.volume; - // Position of the neighbor particle - let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); - - // Relative position `dx` and distance `r` of the neighbor particle - let dx = x_j - x_i; - let r = dx.norm(); - - // Unchecked access is fine as we asserted before that the slice has the correct length - let A_j = unsafe { particle_quantity.get_unchecked(p_j.data.index).clone() }; - let W_ij = kernel.evaluate(r); - - interpolated_value += A_j.scale(vol_j * W_ij); - correction += vol_j * W_ij; - } - - let correction_factor = - enable_correction * correction.recip() + (R::one() - enable_correction); - interpolated_value.scale(correction_factor) - }) - .collect_into_vec(interpolated_values); + fn interpolate_quantity_helper< + R: Real, + T: InterpolationQuantity, + K: SymmetricKernel3d + Sync, + >( + sph: &SphInterpolator, + particle_quantity: &[T], + interpolation_points: &[Vector3], + interpolated_values: &mut Vec, + first_order_correction: bool, + ) { + let squared_support = sph.compact_support_radius * sph.compact_support_radius; + let kernel = K::new(sph.compact_support_radius); + + let enable_correction = if first_order_correction { + R::one() + } else { + R::zero() + }; + + interpolation_points + .par_iter() 
+ .map(|x_i| { + let mut interpolated_value = T::zero(); + let mut correction = R::zero(); + + // SPH: Iterate over all other particles within the squared support radius + let query_point = bytemuck::cast::<_, [R; 3]>(*x_i); + for p_j in sph + .tree + .locate_within_distance(query_point, squared_support) + { + // Volume of the neighbor particle + let vol_j = p_j.data.volume; + // Position of the neighbor particle + let x_j = bytemuck::cast_ref::<_, Vector3>(p_j.geom()); + + // Relative position `dx` and distance `r` of the neighbor particle + let dx = x_j - x_i; + let r = dx.norm(); + + // Unchecked access is fine as we asserted before that the slice has the correct length + let A_j = + unsafe { particle_quantity.get_unchecked(p_j.data.index).clone() }; + let W_ij = kernel.evaluate(r); + + interpolated_value += A_j.scale(vol_j * W_ij); + correction += vol_j * W_ij; + } + + let correction_factor = + enable_correction * correction.recip() + (R::one() - enable_correction); + interpolated_value.scale(correction_factor) + }) + .collect_into_vec(interpolated_values); + } + + match self.kernel_type { + KernelType::CubicSpline => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::Poly6 => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::Spiky => interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ), + KernelType::WendlandQuinticC2 => { + interpolate_quantity_helper::>( + self, + particle_quantity, + interpolation_points, + interpolated_values, + first_order_correction, + ) + } + } } } diff --git a/splashsurf_lib/tests/integration_tests/test_full.rs b/splashsurf_lib/tests/integration_tests/test_full.rs index 751d786..d70b04f 100644 --- a/splashsurf_lib/tests/integration_tests/test_full.rs +++ b/splashsurf_lib/tests/integration_tests/test_full.rs @@ -1,13 +1,13 @@ use nalgebra::Vector3; use splashsurf_lib::io::particles_from_file; use splashsurf_lib::io::vtk_format::write_vtk; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use splashsurf_lib::{ Aabb3d, GridDecompositionParameters, Parameters, Real, SpatialDecomposition, reconstruct_surface, }; use std::path::Path; - // TODO: Compare with a solution file // TODO: Test with a fixed grid? 
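The fixtures below wire the new field into the integration tests with the default cubic spline kernel. For downstream users, switching kernels is a one-field change; a minimal sketch under the assumption that `Parameters` keeps the field set shown in this patch (the numeric values are arbitrary placeholders, and the helper function is hypothetical):

```rust
use nalgebra::Vector3;
use splashsurf_lib::kernel::KernelType;
use splashsurf_lib::{Parameters, SpatialDecomposition, reconstruct_surface};

fn reconstruct_with_wendland(
    particles: &[Vector3<f32>],
) -> Result<(), splashsurf_lib::ReconstructionError<i64, f32>> {
    let particle_radius = 0.025;
    let parameters = Parameters {
        particle_radius,
        rest_density: 1000.0,
        compact_support_radius: 4.0 * particle_radius,
        cube_size: 0.75 * particle_radius,
        iso_surface_threshold: 0.6,
        particle_aabb: None,
        enable_multi_threading: true,
        enable_simd: true,
        spatial_decomposition: SpatialDecomposition::None,
        global_neighborhood_list: false,
        // New in this patch: kernel selection, cubic spline elsewhere by default
        kernel_type: KernelType::WendlandQuinticC2,
    };
    let reconstruction = reconstruct_surface::<i64, f32>(particles, &parameters)?;
    println!("reconstructed {} triangles", reconstruction.mesh().triangles.len());
    Ok(())
}
```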
@@ -38,6 +38,7 @@ fn params_with_aabb( enable_simd: false, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; match strategy { diff --git a/splashsurf_lib/tests/integration_tests/test_simple.rs b/splashsurf_lib/tests/integration_tests/test_simple.rs index b15ca9c..f32034e 100644 --- a/splashsurf_lib/tests/integration_tests/test_simple.rs +++ b/splashsurf_lib/tests/integration_tests/test_simple.rs @@ -1,4 +1,5 @@ use nalgebra::Vector3; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use splashsurf_lib::{ Aabb3d, GridDecompositionParameters, Parameters, Real, SpatialDecomposition, @@ -32,6 +33,7 @@ fn params_with_aabb( enable_simd: false, spatial_decomposition: SpatialDecomposition::None, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; match strategy { diff --git a/splashsurf_lib/tests/integration_tests/test_subdomains.rs b/splashsurf_lib/tests/integration_tests/test_subdomains.rs index 0e450a8..6044f3d 100644 --- a/splashsurf_lib/tests/integration_tests/test_subdomains.rs +++ b/splashsurf_lib/tests/integration_tests/test_subdomains.rs @@ -3,6 +3,7 @@ use nalgebra::Vector3; use splashsurf_lib::GridDecompositionParameters; #[cfg(feature = "io")] use splashsurf_lib::io::vtk_format::write_vtk; +use splashsurf_lib::kernel::KernelType; use splashsurf_lib::marching_cubes::check_mesh_consistency; use std::path::Path; @@ -31,6 +32,7 @@ macro_rules! generate_single_particle_test { ), rest_density: 1000.0, global_neighborhood_list: false, + kernel_type: KernelType::CubicSpline, }; let reconstruction =