From 3f877500946d01c2374b2961c36ff5e017e809ba Mon Sep 17 00:00:00 2001 From: jasalper Date: Tue, 10 Jun 2025 00:12:34 -0400 Subject: [PATCH] SIMD Log functionality --- sleef_interface/sleef_avx.c | 25 ++++++ sleef_interface/sleef_neon.c | 25 ++++++ src/dense/simd.rs | 159 +++++++++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+) diff --git a/sleef_interface/sleef_avx.c b/sleef_interface/sleef_avx.c index 1be0115f..593baa1d 100644 --- a/sleef_interface/sleef_avx.c +++ b/sleef_interface/sleef_avx.c @@ -71,4 +71,29 @@ rlst_float32x8 rlst_avx_exp_f32(float* value) { return output; } +rlst_float64x4 rlst_avx_log_f64(double* value) { + + __m256d simd_value = _mm256_loadu_pd(value); + __m256d simd_output = Sleef_logd4_u10avx2(simd_value); + rlst_float64x4 output; + + _mm256_storeu_pd (output.x, simd_output); + + + return output; +} + +rlst_float32x8 rlst_avx_log_f32(float* value) { + + __m256 simd_value = _mm256_loadu_ps(value); + __m256 simd_output = Sleef_logf8_u10avx2(simd_value); + rlst_float32x8 output; + + _mm256_storeu_ps (output.x, simd_output); + + + return output; +} + + diff --git a/sleef_interface/sleef_neon.c b/sleef_interface/sleef_neon.c index 4a76f945..50bdb130 100644 --- a/sleef_interface/sleef_neon.c +++ b/sleef_interface/sleef_neon.c @@ -65,6 +65,31 @@ rlst_float32x4 rlst_neon_exp_f32(float* value) { rlst_float32x4 output; + vst1q_f32(output.x, simd_output); + + return output; + +} + +rlst_float64x2 rlst_neon_log_f64(double* value) { + + float64x2_t simd_value = vld1q_f64(value); + float64x2_t simd_output = Sleef_logd2_u10advsimd(simd_value); + rlst_float64x2 output; + + + vst1q_f64(output.x, simd_output); + + return output; +} + +rlst_float32x4 rlst_neon_log_f32(float* value) { + + float32x4_t simd_value = vld1q_f32(value); + float32x4_t simd_output = Sleef_logf4_u10advsimd(simd_value); + rlst_float32x4 output; + + vst1q_f32(output.x, simd_output); return output; diff --git a/src/dense/simd.rs b/src/dense/simd.rs index 4966f841..82ce36d7 100644 --- a/src/dense/simd.rs +++ b/src/dense/simd.rs @@ -130,6 +130,12 @@ impl SimdFor { T::simd_exp(self.simd, value) } + /// Compute the natural logarithm of a Simd vector. + #[inline(always)] + pub fn log(self, value: T::Scalars) -> T::Scalars { + T::simd_log(self.simd, value) + } + /// Compute the square root of a Simd vector. #[inline(always)] pub fn sqrt(self, value: T::Scalars) -> T::Scalars { @@ -319,6 +325,9 @@ pub trait RlstSimd: Pod + Send + Sync + num::Zero + 'static { /// Compute the base e exponential of a Simd vector. fn simd_exp(simd: S, value: Self::Scalars) -> Self::Scalars; + /// Compute the approximate log of each element in the register. + fn simd_log(simd: S, value: Self::Scalars) -> Self::Scalars; + /// Compute the square root of each element in the register. fn simd_sqrt(simd: S, value: Self::Scalars) -> Self::Scalars; @@ -658,6 +667,35 @@ impl RlstSimd for f32 { out } + #[inline(always)] + fn simd_log(simd: S, value: Self::Scalars) -> Self::Scalars { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "sleef"))] + if coe::is_same::() { + let value: [f32; 4] = bytemuck::cast(value); + let out = unsafe { sleef_neon::rlst_neon_log_f32(value.as_ptr()) }; + + return bytemuck::cast(out); + } + + #[cfg(all(target_arch = "x86_64", feature = "sleef"))] + if coe::is_same::() { + let value: [f32; 8] = bytemuck::cast(value); + let out = unsafe { sleef_avx::rlst_avx_log_f32(value.as_ptr()) }; + + return bytemuck::cast(out); + } + + let mut out = simd.splat_f32s(0.0); + { + let out: &mut [f32] = bytemuck::cast_slice_mut(std::slice::from_mut(&mut out)); + let x: &[f32] = bytemuck::cast_slice(std::slice::from_ref(&value)); + for (out, x) in itertools::izip!(out, x) { + *out = x.ln(); + } + } + out + } + #[inline(always)] fn simd_sqrt(simd: S, value: Self::Scalars) -> Self::Scalars { #[cfg(target_arch = "x86_64")] @@ -1033,6 +1071,36 @@ impl RlstSimd for f64 { out } + #[inline(always)] + fn simd_log(simd: S, value: Self::Scalars) -> Self::Scalars { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "sleef"))] + if coe::is_same::() { + let value: [f64; 2] = bytemuck::cast(value); + let out = unsafe { sleef_neon::rlst_neon_log_f64(value.as_ptr()) }; + + return bytemuck::cast(out); + } + + #[cfg(all(target_arch = "x86_64", feature = "sleef"))] + if coe::is_same::() { + let value: [f64; 4] = bytemuck::cast(value); + let out = unsafe { sleef_avx::rlst_avx_log_f64(value.as_ptr()) }; + + return bytemuck::cast(out); + } + + let mut out = simd.splat_f64s(0.0); + { + let out: &mut [f64] = bytemuck::cast_slice_mut(std::slice::from_mut(&mut out)); + let x: &[f64] = bytemuck::cast_slice(std::slice::from_ref(&value)); + for (out, x) in itertools::izip!(out, x) { + *out = x.ln(); + } + } + out + } + + #[inline(always)] fn simd_sqrt(simd: S, value: Self::Scalars) -> Self::Scalars { #[cfg(target_arch = "x86_64")] @@ -1143,6 +1211,97 @@ mod tests { use paste; use rand::prelude::*; + #[test] + fn test_simd_log() { + let nsamples = 1001; + + trait Tolerance: RlstScalar { + const TOL: Self; + } + + impl Tolerance for f32 { + const TOL: f32 = 1E-6; + } + + impl Tolerance for f64 { + const TOL: f64 = 1E-13; + } + + struct Impl + RlstSimd + Tolerance> { + rng: StdRng, + nsamples: usize, + _marker: PhantomData, + } + + impl + RlstSimd + Tolerance> pulp::WithSimd for Impl { + type Output = (); + + fn with_simd(self, simd: S) -> Self::Output { + let Self { + mut rng, nsamples, .. + } = self; + + let samples = (0..nsamples) + .map(|_| num::cast::<_, T>(rng.gen::()).unwrap()) + .collect::>(); + + let mut actual = vec![T::default(); nsamples]; + + let (value_head, value_tail) = + ::as_simd_slice::(samples.as_slice()); + let (actual_head, actual_tail) = + ::as_simd_slice_mut::(actual.as_mut_slice()); + + fn impl_slice + RlstSimd + Tolerance, S: Simd>( + simd: S, + values: &[::Scalars], + actual: &mut [::Scalars], + ) { + let simd = SimdFor::::new(simd); + + for (&v, a) in itertools::izip!(values, actual) { + *a = simd.log(v); + } + } + + impl_slice::(simd, value_head, actual_head); + impl_slice::( + pulp::Scalar::new(), + value_tail.coerce(), + actual_tail.coerce(), + ); + + if coe::is_same::() { + for (v, a) in itertools::izip!(samples, actual) { + let v: f32 = coerce_static(v); + let a: f32 = coerce_static(a); + assert_relative_eq!(v.ln(), a, max_relative = f32::TOL); + } + } else if coe::is_same::() { + for (v, a) in itertools::izip!(samples, actual) { + let v: f64 = coerce_static(v); + let a: f64 = coerce_static(a); + assert_relative_eq!(v.ln(), a, max_relative = f64::TOL); + } + } else { + panic!("Unsupported type"); + } + } + } + + pulp::Arch::new().dispatch(Impl:: { + rng: StdRng::seed_from_u64(0), + nsamples, + _marker: PhantomData, + }); + + pulp::Arch::new().dispatch(Impl:: { + rng: StdRng::seed_from_u64(0), + nsamples, + _marker: PhantomData, + }); + } + #[test] fn test_simd_approx_recip() { let nsamples = 1001;