From 6f6d59b1755f48656a3c450ab9f9508a7cd1758e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Mon, 15 Nov 2021 19:34:28 +0100 Subject: [PATCH 01/11] Implement avx512f masked unaligned load and store intrinsics --- crates/core_arch/src/x86/avx512f.rs | 249 ++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 766acf46f7..12c564ef68 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30323,6 +30323,160 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, mask: __mmask16, ptr: *const i32) -> __m512i { + let mut result: __m512i = src; + + asm!( + "vmovdqu32 {io}{{{k}}}, [{p}]", + p = in(reg) ptr, + k = in(kreg) mask, + io = inout(zmm_reg) result, + options(nostack), options(pure), options(readonly) + ); + + result +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu32))] +pub unsafe fn _mm512_mask_storeu_epi32(ptr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqu32 [{p}]{{{k}}}, {i}", + p = in(reg) ptr, + k = in(kreg) mask, + i = in(zmm_reg) a, + options(nostack) + ); +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, mask: __mmask8, ptr: *const i64) -> __m512i { + let mut result: __m512i = src; + + asm!( + "vmovdqu64 {io}{{{k}}}, [{p}]", + p = in(reg) ptr, + k = in(kreg) mask, + io = inout(zmm_reg) result, + options(nostack), options(pure), options(readonly) + ); + + result +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqu64))] +pub unsafe fn _mm512_mask_storeu_epi64(ptr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqu64 [{p}]{{{k}}}, {i}", + p = in(reg) ptr, + k = in(kreg) mask, + i = in(zmm_reg) a, + options(nostack) + ); +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, mask: __mmask16, ptr: *const f32) -> __m512 { + let mut result: __m512 = src; + + asm!( + "vmovups {io}{{{k}}}, [{p}]", + p = in(reg) ptr, + k = in(kreg) mask, + io = inout(zmm_reg) result, + options(nostack), options(pure), options(readonly) + ); + + result +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_mask_storeu_ps(ptr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovups [{p}]{{{k}}}, {i}", + p = in(reg) ptr, + k = in(kreg) mask, + i = in(zmm_reg) a, + options(nostack) + ); +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, mask: __mmask8, ptr: *const f64) -> __m512d { + let mut result: __m512d = src; + + asm!( + "vmovupd {io}{{{k}}}, [{p}]", + p = in(reg) ptr, + k = in(kreg) mask, + io = inout(zmm_reg) result, + options(nostack), options(pure), options(readonly) + ); + + result +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovupd))] +pub unsafe fn _mm512_mask_storeu_pd(ptr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovupd [{p}]{{{k}}}, {i}", + p = in(reg) ptr, + k = in(kreg) mask, + i = in(zmm_reg) a, + options(nostack) + ); +} + /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. 
/// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002) @@ -44587,6 +44741,101 @@ mod tests { assert_eq_m512(r, a); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi32() { + let src = _mm512_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi32() { + let mut r = _mm512_set1_epi32(42); + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_storeu_epi32(&mut r as *mut _ as *mut i32, m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_epi64() { + let src = _mm512_set1_epi64(42); + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_epi64() { + let mut r = _mm512_set1_epi64(42); + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_storeu_epi64(&mut r as *mut _ as *mut i64, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_ps() { + let src = _mm512_set1_ps(42.0); + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_loadu_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_ps() { + let mut r = _mm512_set1_ps(42.0); + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_storeu_ps(&mut r as *mut _ as *mut f32, m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_loadu_pd() { + let src = _mm512_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_loadu_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_storeu_pd() { + let mut r = _mm512_set1_pd(42.0); + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_storeu_pd(&mut r as *mut _ as *mut f64, m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr_pd() { let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); From 
c902d9096a4beac621d3777deb7ec655e7f06272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 20 Nov 2021 19:01:32 +0100 Subject: [PATCH 02/11] Reduce code repetition using macros and implement avx512vl load and store intrinsics --- crates/core_arch/src/x86/avx512f.rs | 212 ++++++++-------------------- crates/core_arch/src/x86/macros.rs | 95 +++++++++++++ 2 files changed, 154 insertions(+), 153 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 12c564ef68..aabb9aabe0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30323,159 +30323,65 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } -/// Load packed 32-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, mask: __mmask16, ptr: *const i32) -> __m512i { - let mut result: __m512i = src; - - asm!( - "vmovdqu32 {io}{{{k}}}, [{p}]", - p = in(reg) ptr, - k = in(kreg) mask, - io = inout(zmm_reg) result, - options(nostack), options(pure), options(readonly) - ); - - result -} - -/// Store packed 32-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu32))] -pub unsafe fn _mm512_mask_storeu_epi32(ptr: *mut i32, mask: __mmask16, a: __m512i) { - asm!( - "vmovdqu32 [{p}]{{{k}}}, {i}", - p = in(reg) ptr, - k = in(kreg) mask, - i = in(zmm_reg) a, - options(nostack) - ); -} - -/// Load packed 64-bit integers from memory into dst using writemask k -/// (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, mask: __mmask8, ptr: *const i64) -> __m512i { - let mut result: __m512i = src; - - asm!( - "vmovdqu64 {io}{{{k}}}, [{p}]", - p = in(reg) ptr, - k = in(kreg) mask, - io = inout(zmm_reg) result, - options(nostack), options(pure), options(readonly) - ); - - result -} - -/// Store packed 64-bit integers from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_epi64) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovdqu64))] -pub unsafe fn _mm512_mask_storeu_epi64(ptr: *mut i64, mask: __mmask8, a: __m512i) { - asm!( - "vmovdqu64 [{p}]{{{k}}}, {i}", - p = in(reg) ptr, - k = in(kreg) mask, - i = in(zmm_reg) a, - options(nostack) - ); -} - -/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_mask_loadu_ps(src: __m512, mask: __mmask16, ptr: *const f32) -> __m512 { - let mut result: __m512 = src; - - asm!( - "vmovups {io}{{{k}}}, [{p}]", - p = in(reg) ptr, - k = in(kreg) mask, - io = inout(zmm_reg) result, - options(nostack), options(pure), options(readonly) - ); - - result -} - -/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovups))] -pub unsafe fn _mm512_mask_storeu_ps(ptr: *mut f32, mask: __mmask16, a: __m512) { - asm!( - "vmovups [{p}]{{{k}}}, {i}", - p = in(reg) ptr, - k = in(kreg) mask, - i = in(zmm_reg) a, - options(nostack) - ); -} - -/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// mem_addr does not need to be aligned on any particular boundary. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_loadu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovupd))] -pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, mask: __mmask8, ptr: *const f64) -> __m512d { - let mut result: __m512d = src; - - asm!( - "vmovupd {io}{{{k}}}, [{p}]", - p = in(reg) ptr, - k = in(kreg) mask, - io = inout(zmm_reg) result, - options(nostack), options(pure), options(readonly) - ); - - result -} - -/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. -/// mem_addr does not need to be aligned on any particular boundary. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_storeu_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmovupd))] -pub unsafe fn _mm512_mask_storeu_pd(ptr: *mut f64, mask: __mmask8, a: __m512d) { - asm!( - "vmovupd [{p}]{{{k}}}, {i}", - p = in(reg) ptr, - k = in(kreg) mask, - i = in(zmm_reg) a, - options(nostack) - ); -} +define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_epi32, _mm512_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m512i, zmm_reg, __mmask16, i32); +define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_epi64, _mm512_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m512i, zmm_reg, __mmask8, i64); +define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_ps, _mm512_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m512, zmm_reg, __mmask16, f32); +define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_pd, _mm512_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m512d, zmm_reg, __mmask8, f64); + +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_epi32, _mm256_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m256i, ymm_reg, __mmask8, i32); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_epi64, _mm256_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m256i, ymm_reg, __mmask8, i64); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_ps, _mm256_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m256, ymm_reg, __mmask8, f32); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_pd, _mm256_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m256d, ymm_reg, __mmask8, f64); + +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_epi32, _mm_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m128i, xmm_reg, __mmask8, i32); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_epi64, _mm_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m128i, xmm_reg, __mmask8, i64); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_ps, _mm_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m128, xmm_reg, __mmask8, f32); +define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_pd, _mm_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m128d, xmm_reg, __mmask8, f64); + +define_masked_load_aligned!("avx512f", _mm512_mask_load_epi32, _mm512_maskz_load_epi32, "32-bit integers", vmovdqa32, __m512i, zmm_reg, __mmask16, i32, "64-byte"); +define_masked_load_aligned!("avx512f", _mm512_mask_load_epi64, _mm512_maskz_load_epi64, "64-bit integers", vmovdqa64, __m512i, zmm_reg, __mmask8, i64, "64-byte"); +define_masked_load_aligned!("avx512f", _mm512_mask_load_ps, _mm512_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m512, zmm_reg, __mmask16, f32, "64-byte"); +define_masked_load_aligned!("avx512f", _mm512_mask_load_pd, _mm512_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m512d, zmm_reg, __mmask8, f64, "64-byte"); + +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_epi32, _mm256_maskz_load_epi32, "32-bit integers", vmovdqa32, __m256i, ymm_reg, __mmask8, i32, "32-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_epi64, _mm256_maskz_load_epi64, 
"64-bit integers", vmovdqa64, __m256i, ymm_reg, __mmask8, i64, "32-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_ps, _mm256_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m256, ymm_reg, __mmask8, f32, "32-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_pd, _mm256_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m256d, ymm_reg, __mmask8, f64, "32-byte"); + +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_epi32, _mm_maskz_load_epi32, "32-bit integers", vmovdqa32, __m128i, xmm_reg, __mmask8, i32, "16-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_epi64, _mm_maskz_load_epi64, "64-bit integers", vmovdqa64, __m128i, xmm_reg, __mmask8, i64, "16-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_ps, _mm_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m128, xmm_reg, __mmask8, f32, "16-byte"); +define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_pd, _mm_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m128d, xmm_reg, __mmask8, f64, "16-byte"); + +define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m512i, zmm_reg, __mmask16, i32); +define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m512i, zmm_reg, __mmask8, i64); +define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m512, zmm_reg, __mmask16, f32); +define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m512d, zmm_reg, __mmask8, f64); + +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m256i, ymm_reg, __mmask8, i32); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m256i, ymm_reg, __mmask8, i64); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m256, ymm_reg, __mmask8, f32); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m256d, ymm_reg, __mmask8, f64); + +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m128i, xmm_reg, __mmask8, i32); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m128i, xmm_reg, __mmask8, i64); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m128, xmm_reg, __mmask8, f32); +define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m128d, xmm_reg, __mmask8, f64); + +define_masked_store_aligned!("avx512f", _mm512_mask_store_epi32, "32-bit integers", vmovdqa32, __m512i, zmm_reg, __mmask16, i32, "64-byte"); +define_masked_store_aligned!("avx512f", _mm512_mask_store_epi64, "64-bit integers", vmovdqa64, __m512i, zmm_reg, __mmask8, i64, "64-byte"); +define_masked_store_aligned!("avx512f", _mm512_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m512, zmm_reg, __mmask16, f32, "64-byte"); 
+define_masked_store_aligned!("avx512f", _mm512_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m512d, zmm_reg, __mmask8, f64, "64-byte"); + +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_epi32, "32-bit integers", vmovdqa32, __m256i, ymm_reg, __mmask8, i32, "32-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_epi64, "64-bit integers", vmovdqa64, __m256i, ymm_reg, __mmask8, i64, "32-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m256, ymm_reg, __mmask8, f32, "32-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m256d, ymm_reg, __mmask8, f64, "32-byte"); + +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_epi32, "32-bit integers", vmovdqa32, __m128i, xmm_reg, __mmask8, i32, "16-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_epi64, "64-bit integers", vmovdqa64, __m128i, xmm_reg, __mmask8, i64, "16-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m128, xmm_reg, __mmask8, f32, "16-byte"); +define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m128d, xmm_reg, __mmask8, f64, "16-byte"); /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. /// diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index e686e65b30..540a4ccde1 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -87,6 +87,101 @@ macro_rules! static_assert_imm8_scale { }; } + +macro_rules! define_masked_load_aligned { + ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $alignment_description:literal) => { + define_masked_load!($feature, $name, $name_zero_masked, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr must be aligned on a ", $alignment_description, " boundary or a general-protection exception may be generated."); + } +} + +macro_rules! define_masked_load_unaligned { + ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path) => { + define_masked_load!($feature, $name, $name_zero_masked, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr does not need to be aligned on any particular boundary."); + }; +} + +macro_rules! 
define_masked_load {
+    ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $($additional_doc:literal),+) => {
+        #[inline]
+        #[doc = "Load packed "]
+        #[doc = $element_description]
+        #[doc = " from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set)."]
+        $(#[doc = $additional_doc])+
+        #[doc = ""]
+        #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name), ")")]
+        #[target_feature(enable = $feature)]
+        #[cfg_attr(test, assert_instr($instruction))]
+        pub unsafe fn $name(src: $simd_type, k: $mask_type, mem_addr: *const $lane_type) -> $simd_type {
+            let mut result: $simd_type = src;
+            asm!(
+                concat!(stringify!($instruction), " {r}{{{k}}}, [{p}]"),
+                p = in(reg) mem_addr,
+                k = in(kreg) k,
+                r = inout($reg_type) result,
+                options(nostack), options(pure), options(readonly)
+            );
+            result
+        }
+
+        #[inline]
+        #[doc = "Load packed "]
+        #[doc = $element_description]
+        #[doc = " from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set)."]
+        $(#[doc = $additional_doc])+
+        #[doc = ""]
+        #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name_zero_masked), ")")]
+        #[target_feature(enable = $feature)]
+        #[cfg_attr(test, assert_instr($instruction))]
+        pub unsafe fn $name_zero_masked(k: $mask_type, mem_addr: *const $lane_type) -> $simd_type {
+            let mut result: $simd_type;
+            asm!(
+                concat!(stringify!($instruction), " {r}{{{k}}} {{z}}, [{p}]"),
+                p = in(reg) mem_addr,
+                k = in(kreg) k,
+                r = out($reg_type) result,
+                options(nostack), options(pure), options(readonly)
+            );
+            result
+        }
+    };
+}
+
+macro_rules! define_masked_store {
+    ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $($additional_doc:literal),+) => {
+        #[inline]
+        #[doc = "Store packed "]
+        #[doc = $element_description]
+        #[doc = " from a into memory using writemask k."]
+        $(#[doc = $additional_doc])+
+        #[doc = ""]
+        #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name), ")")]
+        #[target_feature(enable = $feature)]
+        #[cfg_attr(test, assert_instr($instruction))]
+        pub unsafe fn $name(mem_addr: *mut $lane_type, mask: $mask_type, a: $simd_type) {
+            asm!(
+                concat!(stringify!($instruction), " [{p}]{{{k}}}, {a}"),
+                p = in(reg) mem_addr,
+                k = in(kreg) mask,
+                a = in($reg_type) a,
+                options(nostack)
+            );
+        }
+    }
+}
+
+macro_rules! define_masked_store_aligned {
+    ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $alignment_description:literal) => {
+        define_masked_store!($feature, $name, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr must be aligned on a ", $alignment_description, " boundary or a general-protection exception may be generated.");
+    }
+}
+
+macro_rules! 
define_masked_store_unaligned { + ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path) => { + define_masked_store!($feature, $name, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr does not need to be aligned on any particular boundary."); + }; +} + + #[cfg(test)] macro_rules! assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ From 7fe1b6b4776c398a98fcd2a8daecd9d232e816ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 21 Nov 2021 21:57:28 +0100 Subject: [PATCH 03/11] Not using macros, adding more tests --- crates/core_arch/src/x86/avx512f.rs | 1542 ++++++++++++++++++++++++++- crates/core_arch/src/x86/macros.rs | 95 -- 2 files changed, 1483 insertions(+), 154 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index aabb9aabe0..0fb46dc6a1 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30323,65 +30323,1301 @@ pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) { ptr::write(mem_addr as *mut __m512d, a); } -define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_epi32, _mm512_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m512i, zmm_reg, __mmask16, i32); -define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_epi64, _mm512_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m512i, zmm_reg, __mmask8, i64); -define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_ps, _mm512_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m512, zmm_reg, __mmask16, f32); -define_masked_load_unaligned!("avx512f", _mm512_mask_loadu_pd, _mm512_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m512d, zmm_reg, __mmask8, f64); - -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_epi32, _mm256_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m256i, ymm_reg, __mmask8, i32); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_epi64, _mm256_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m256i, ymm_reg, __mmask8, i64); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_ps, _mm256_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m256, ymm_reg, __mmask8, f32); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm256_mask_loadu_pd, _mm256_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m256d, ymm_reg, __mmask8, f64); - -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_epi32, _mm_maskz_loadu_epi32, "32-bit integers", vmovdqu32, __m128i, xmm_reg, __mmask8, i32); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_epi64, _mm_maskz_loadu_epi64, "64-bit integers", vmovdqu64, __m128i, xmm_reg, __mmask8, i64); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_ps, _mm_maskz_loadu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m128, xmm_reg, __mmask8, f32); -define_masked_load_unaligned!("avx512f,avx512vl,avx", _mm_mask_loadu_pd, _mm_maskz_loadu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m128d, xmm_reg, __mmask8, f64); - -define_masked_load_aligned!("avx512f", _mm512_mask_load_epi32, _mm512_maskz_load_epi32, "32-bit integers", vmovdqa32, __m512i, zmm_reg, __mmask16, i32, "64-byte"); -define_masked_load_aligned!("avx512f", _mm512_mask_load_epi64, 
_mm512_maskz_load_epi64, "64-bit integers", vmovdqa64, __m512i, zmm_reg, __mmask8, i64, "64-byte"); -define_masked_load_aligned!("avx512f", _mm512_mask_load_ps, _mm512_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m512, zmm_reg, __mmask16, f32, "64-byte"); -define_masked_load_aligned!("avx512f", _mm512_mask_load_pd, _mm512_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m512d, zmm_reg, __mmask8, f64, "64-byte"); - -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_epi32, _mm256_maskz_load_epi32, "32-bit integers", vmovdqa32, __m256i, ymm_reg, __mmask8, i32, "32-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_epi64, _mm256_maskz_load_epi64, "64-bit integers", vmovdqa64, __m256i, ymm_reg, __mmask8, i64, "32-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_ps, _mm256_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m256, ymm_reg, __mmask8, f32, "32-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm256_mask_load_pd, _mm256_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m256d, ymm_reg, __mmask8, f64, "32-byte"); - -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_epi32, _mm_maskz_load_epi32, "32-bit integers", vmovdqa32, __m128i, xmm_reg, __mmask8, i32, "16-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_epi64, _mm_maskz_load_epi64, "64-bit integers", vmovdqa64, __m128i, xmm_reg, __mmask8, i64, "16-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_ps, _mm_maskz_load_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m128, xmm_reg, __mmask8, f32, "16-byte"); -define_masked_load_aligned!("avx512f,avx512vl,avx", _mm_mask_load_pd, _mm_maskz_load_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m128d, xmm_reg, __mmask8, f64, "16-byte"); - -define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m512i, zmm_reg, __mmask16, i32); -define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m512i, zmm_reg, __mmask8, i64); -define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m512, zmm_reg, __mmask16, f32); -define_masked_store_unaligned!("avx512f", _mm512_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m512d, zmm_reg, __mmask8, f64); - -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m256i, ymm_reg, __mmask8, i32); -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m256i, ymm_reg, __mmask8, i64); -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m256, ymm_reg, __mmask8, f32); -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm256_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m256d, ymm_reg, __mmask8, f64); - -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_epi32, "32-bit integers", vmovdqu32, __m128i, xmm_reg, __mmask8, i32); -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_epi64, "64-bit integers", vmovdqu64, __m128i, xmm_reg, __mmask8, i64); 
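// Illustrative aside (not part of the patch): a scalar model of the writemask and
// zeromask semantics that the expanded intrinsics below implement. The function names
// are hypothetical and only mirror the documented behaviour: lane i of the result comes
// from memory when bit i of k is set, otherwise from `src` (writemask) or zero (zeromask).
fn mask_loadu_epi32_model(src: [i32; 16], k: u16, mem: &[i32; 16]) -> [i32; 16] {
    let mut dst = src;
    for i in 0..16 {
        if (k >> i) & 1 == 1 {
            dst[i] = mem[i];
        }
    }
    dst
}

fn maskz_loadu_epi32_model(k: u16, mem: &[i32; 16]) -> [i32; 16] {
    mask_loadu_epi32_model([0; 16], k, mem)
}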
-define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_ps, "single-precision (32-bit) floating-point elements", vmovups, __m128, xmm_reg, __mmask8, f32); -define_masked_store_unaligned!("avx512f,avx512vl,avx", _mm_mask_storeu_pd, "double-precision (64-bit) floating-point elements", vmovupd, __m128d, xmm_reg, __mmask8, f64); - -define_masked_store_aligned!("avx512f", _mm512_mask_store_epi32, "32-bit integers", vmovdqa32, __m512i, zmm_reg, __mmask16, i32, "64-byte"); -define_masked_store_aligned!("avx512f", _mm512_mask_store_epi64, "64-bit integers", vmovdqa64, __m512i, zmm_reg, __mmask8, i64, "64-byte"); -define_masked_store_aligned!("avx512f", _mm512_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m512, zmm_reg, __mmask16, f32, "64-byte"); -define_masked_store_aligned!("avx512f", _mm512_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m512d, zmm_reg, __mmask8, f64, "64-byte"); - -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_epi32, "32-bit integers", vmovdqa32, __m256i, ymm_reg, __mmask8, i32, "32-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_epi64, "64-bit integers", vmovdqa64, __m256i, ymm_reg, __mmask8, i64, "32-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m256, ymm_reg, __mmask8, f32, "32-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm256_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m256d, ymm_reg, __mmask8, f64, "32-byte"); - -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_epi32, "32-bit integers", vmovdqa32, __m128i, xmm_reg, __mmask8, i32, "16-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_epi64, "64-bit integers", vmovdqa64, __m128i, xmm_reg, __mmask8, i64, "16-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_ps, "single-precision (32-bit) floating-point elements", vmovaps, __m128, xmm_reg, __mmask8, f32, "16-byte"); -define_masked_store_aligned!("avx512f,avx512vl,avx", _mm_mask_store_pd, "double-precision (64-bit) floating-point elements", vmovapd, __m128d, xmm_reg, __mmask8, f64, "16-byte"); +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovups {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovups {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovupd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovupd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 { + let mut dst: __m512; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d { + let mut dst: __m512d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
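+///
+/// A minimal usage sketch (illustrative values only; the `Aligned` wrapper is a
+/// hypothetical helper providing the required 32-byte alignment):
+///
+/// ```ignore
+/// #[repr(align(32))]
+/// struct Aligned([i64; 4]);
+///
+/// let data = Aligned([1, 2, 3, 4]);
+/// // Mask 0b1010 selects lanes 1 and 3; lanes 0 and 2 are zeroed.
+/// let v = unsafe { _mm256_maskz_load_epi64(0b1010, data.0.as_ptr()) };
+/// ```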
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 { + let mut dst: __m256; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d { + let mut dst: __m256d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). 
+/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa32 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 32-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa32 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqa64 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 64-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqa64 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. 
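+///
+/// A minimal usage sketch (illustrative values only; the `Aligned` wrapper is a
+/// hypothetical helper providing the required 16-byte alignment):
+///
+/// ```ignore
+/// #[repr(align(16))]
+/// struct Aligned([f32; 4]);
+///
+/// let data = Aligned([1.0, 2.0, 3.0, 4.0]);
+/// let src = _mm_set1_ps(42.0);
+/// // Mask 0b0011 loads lanes 0 and 1 from `data`; lanes 2 and 3 keep 42.0 from `src`.
+/// let v = unsafe { _mm_mask_load_ps(src, 0b0011, data.0.as_ptr()) };
+/// ```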
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128 = src; + asm!( + "vmovaps {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { + let mut dst: __m128; + asm!( + "vmovaps {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d = src; + asm!( + "vmovapd {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { + let mut dst: __m128d; + asm!( + "vmovapd {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
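+///
+/// A minimal usage sketch (the values and mask below are illustrative only):
+///
+/// ```ignore
+/// let mut out = [0_i64; 8];
+/// let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
+/// // Only elements 0 and 7 are written; the rest of `out` stays 0.
+/// unsafe { _mm512_mask_storeu_epi64(out.as_mut_ptr(), 0b1000_0001, a) };
+/// ```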
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovups [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovupd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated. 
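+///
+/// A minimal usage sketch (illustrative values only; the `Aligned` wrapper is a
+/// hypothetical helper providing the required 32-byte alignment):
+///
+/// ```ignore
+/// #[repr(align(32))]
+/// struct Aligned([f64; 4]);
+///
+/// let mut out = Aligned([0.0; 4]);
+/// let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+/// // Mask 0b0110 writes elements 1 and 2 (2.0 and 3.0); elements 0 and 3 stay 0.0.
+/// unsafe { _mm256_mask_store_pd(out.0.as_mut_ptr(), 0b0110, a) };
+/// ```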
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 32-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa32 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 64-bit integers from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqa64 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { + asm!( + "vmovaps [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k. +/// mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd) +#[inline] +#[target_feature(enable = "avx512f,avx512vl,avx")] +pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { + asm!( + "vmovapd [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order. 
/// @@ -44658,6 +45894,49 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let src = _mm512_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_epi32(src, m, black_box(p)); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi32() { + #[repr(align(64))] + struct Align { + data: [i32; 16], // 64 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_epi32(m, black_box(p)); + let e = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_epi32() { let mut r = _mm512_set1_epi32(42); @@ -44679,6 +45958,49 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let src = _mm512_set1_epi64(42); + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_epi64(src, m, black_box(p)); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_epi64() { + #[repr(align(64))] + struct Align { + data: [i64; 8], // 64 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_epi64(m, black_box(p)); + let e = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_epi64() { let mut r = _mm512_set1_epi64(42); @@ -44706,6 +46028,65 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_ps() { + let a = &[ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, + ]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_loadu_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let src = _mm512_set1_ps(42.0); + let a = Align { + data: [ + 
1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_mask_load_ps(src, m, black_box(p)); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_ps() { + #[repr(align(64))] + struct Align { + data: [f32; 16], // 64 bytes + } + let a = Align { + data: [ + 1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, + 15.0, 16.0, + ], + }; + let p = a.data.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm512_maskz_load_ps(m, black_box(p)); + let e = _mm512_setr_ps( + 0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_ps() { let mut r = _mm512_set1_ps(42.0); @@ -44732,6 +46113,49 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_loadu_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let src = _mm512_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_mask_load_pd(src, m, black_box(p)); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_load_pd() { + #[repr(align(64))] + struct Align { + data: [f64; 8], // 64 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm512_maskz_load_pd(m, black_box(p)); + let e = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_pd() { let mut r = _mm512_set1_pd(42.0); diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 540a4ccde1..e686e65b30 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -87,101 +87,6 @@ macro_rules! static_assert_imm8_scale { }; } - -macro_rules! define_masked_load_aligned { - ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $alignment_description:literal) => { - define_masked_load!($feature, $name, $name_zero_masked, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr must be aligned on a ", $alignment_description, " boundary or a general-protection exception may be generated."); - } -} - -macro_rules! 
define_masked_load_unaligned { - ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path) => { - define_masked_load!($feature, $name, $name_zero_masked, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr does not need to be aligned on any particular boundary."); - }; -} - -macro_rules! define_masked_load { - ($feature:literal, $name:ident, $name_zero_masked:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $($additional_doc:literal),+) => { - #[inline] - #[doc = "Load packed "] - #[doc = $element_description] - #[doc = " from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set)."] - $(#[doc = $additional_doc])+ - #[doc = ""] - #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name), ")")] - #[target_feature(enable = $feature)] - #[cfg_attr(test, assert_instr($instruction))] - pub unsafe fn $name(src: $simd_type, k: $mask_type, mem_addr: *const $lane_type) -> $simd_type { - let mut result: $simd_type = src; - asm!( - concat!(stringify!($instruction), " {r}{{{k}}}, [{p}]"), - p = in(reg) mem_addr, - k = in(kreg) k, - r = inout($reg_type) result, - options(nostack), options(pure), options(readonly) - ); - result - } - - #[inline] - #[doc = "Load packed "] - #[doc = $element_description] - #[doc = " from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set)."] - $(#[doc = $additional_doc])+ - #[doc = ""] - #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name_zero_masked), ")")] - #[target_feature(enable = $feature)] - #[cfg_attr(test, assert_instr($instruction))] - pub unsafe fn $name_zero_masked(k: $mask_type, mem_addr: *const $lane_type) -> $simd_type { - let mut result: $simd_type; - asm!( - concat!(stringify!($instruction), " {r}{{{k}}} {{z}}, [{p}]"), - p = in(reg) mem_addr, - k = in(kreg) k, - r = out($reg_type) result, - options(nostack), options(pure), options(readonly) - ); - result - } - }; -} - -macro_rules! define_masked_store { - ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $($additional_doc:literal),+) => { - #[inline] - #[doc = "Store packed "] - #[doc = $element_description] - #[doc = " from from a into memory using writemask k."] - $(#[doc = $additional_doc])+ - #[doc = ""] - #[doc = concat!("[Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=", stringify!($name), ")")] - #[target_feature(enable = $feature)] - #[cfg_attr(test, assert_instr($instruction))] - pub unsafe fn $name(mem_addr: *mut $lane_type, mask: $mask_type, a: $simd_type) { - asm!( - concat!(stringify!($instruction), " [{p}]{{{k}}}, {a}"), - p = in(reg) mem_addr, - k = in(kreg) mask, - a = in($reg_type) a, - options(nostack) - ); - } - } -} - -macro_rules! 
define_masked_store_aligned { - ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path, $alignment_description:literal) => { - define_masked_store!($feature, $name, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr must be aligned on a ", $alignment_description, " boundary or a general-protection exception may be generated."); - } -} - -macro_rules! define_masked_store_unaligned { - ($feature:literal, $name:ident, $element_description:literal, $instruction:ident, $simd_type:path, $reg_type:ident, $mask_type:path, $lane_type:path) => { - define_masked_store!($feature, $name, $element_description, $instruction, $simd_type, $reg_type, $mask_type, $lane_type, "mem_addr does not need to be aligned on any particular boundary."); - }; -} - - #[cfg(test)] macro_rules! assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ From b9329bd62e19819d5fa386148c74df6e880798a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Tue, 23 Nov 2021 23:20:14 +0100 Subject: [PATCH 04/11] Tests for mm512 aligned stores --- crates/core_arch/src/x86/avx512f.rs | 69 +++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0fb46dc6a1..114ebe1674 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -45947,6 +45947,22 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: __m512i, + } + let mut r = Align { + data: _mm512_set1_epi32(42), + }; + let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let m = 0b11101000_11001010; + _mm512_mask_store_epi32(&mut r.data as *mut _ as *mut i32, m, a); + let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); + assert_eq_m512i(r.data, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_loadu_epi64() { let src = _mm512_set1_epi64(42); @@ -46011,6 +46027,22 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_epi64() { + #[repr(align(64))] + struct Align { + data: __m512i, + } + let mut r = Align { + data: _mm512_set1_epi32(42), + }; + let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm512_mask_store_epi64(&mut r.data as *mut _ as *mut i64, m, a); + let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m512i(r.data, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_loadu_ps() { let src = _mm512_set1_ps(42.0); @@ -46102,6 +46134,27 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_ps() { + #[repr(align(64))] + struct Align { + data: __m512, + } + let mut r = Align { + data: _mm512_set1_ps(42.0), + }; + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let m = 0b11101000_11001010; + _mm512_mask_store_ps(&mut r.data as *mut _ as *mut f32, m, a); + let e = _mm512_setr_ps( + 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, + 16.0, + ); + assert_eq_m512(r.data, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_loadu_pd() { let src = _mm512_set1_pd(42.0); @@ -46166,6 +46219,22 @@ mod tests { assert_eq_m512d(r, e); } + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_store_pd() { + #[repr(align(64))] + struct Align { + data: __m512d, + } + let mut r = Align { + data: _mm512_set1_pd(42.0), + }; + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm512_mask_store_pd(&mut r.data as *mut _ as *mut f64, m, a); + let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m512d(r.data, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr_pd() { let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); From ce1f051effb921ebcd07f37768514a4be38ada2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 27 Nov 2021 21:25:08 +0100 Subject: [PATCH 05/11] Tests for 256-bit variants --- crates/core_arch/src/x86/avx512f.rs | 320 ++++++++++++++++++++++++++++ 1 file changed, 320 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 114ebe1674..57d791f233 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -46235,6 +46235,326 @@ mod tests { assert_eq_m512d(r.data, e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi32() { + let src = _mm256_set1_epi32(42); + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let src = _mm256_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_epi32(src, m, black_box(p)); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi32() { + #[repr(align(32))] + struct Align { + data: [i32; 8], // 32 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4, 5, 6, 7, 8], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_epi32(m, black_box(p)); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi32() { + let mut r = _mm256_set1_epi32(42); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_storeu_epi32(&mut r as *mut _ as *mut i32, m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi32() { + #[repr(align(64))] + struct Align { + data: __m256i, + } + let mut r = Align { + data: _mm256_set1_epi32(42), + }; + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let m = 0b11001010; + _mm256_mask_store_epi32(&mut r.data as *mut _ as *mut i32, m, a); + let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); + assert_eq_m256i(r.data, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi64() { + 
let src = _mm256_set1_epi64x(42); + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi64() { + let a = &[1_i64, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let src = _mm256_set1_epi64x(42); + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_epi64(src, m, black_box(p)); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_epi64() { + #[repr(align(32))] + struct Align { + data: [i64; 4], // 32 bytes + } + let a = Align { + data: [1_i64, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_epi64(m, black_box(p)); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi64() { + let mut r = _mm256_set1_epi64x(42); + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_storeu_epi64(&mut r as *mut _ as *mut i64, m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_epi64() { + #[repr(align(32))] + struct Align { + data: __m256i, + } + let mut r = Align { + data: _mm256_set1_epi32(42), + }; + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let m = 0b1010; + _mm256_mask_store_epi64(&mut r.data as *mut _ as *mut i64, m, a); + let e = _mm256_setr_epi64x(42, 2, 42, 4); + assert_eq_m256i(r.data, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_ps() { + let src = _mm256_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_loadu_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_loadu_ps(m, black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let src = _mm256_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_mask_load_ps(src, m, black_box(p)); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_ps() { + #[repr(align(32))] + struct Align { + data: [f32; 8], // 32 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + }; + let p = a.data.as_ptr(); + let m = 0b11001010; + let r = _mm256_maskz_load_ps(m, 
black_box(p)); + let e = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_ps() { + let mut r = _mm256_set1_ps(42.0); + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_storeu_ps(&mut r as *mut _ as *mut f32, m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_ps() { + #[repr(align(32))] + struct Align { + data: __m256, + } + let mut r = Align { + data: _mm256_set1_ps(42.0), + }; + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let m = 0b11001010; + _mm256_mask_store_ps(&mut r.data as *mut _ as *mut f32, m, a); + let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); + assert_eq_m256(r.data, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_loadu_pd() { + let src = _mm256_set1_pd(42.0); + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_loadu_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_loadu_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let src = _mm256_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_mask_load_pd(src, m, black_box(p)); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_maskz_load_pd() { + #[repr(align(32))] + struct Align { + data: [f64; 4], // 32 bytes + } + let a = Align { + data: [1.0_f64, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm256_maskz_load_pd(m, black_box(p)); + let e = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_storeu_pd() { + let mut r = _mm256_set1_pd(42.0); + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_storeu_pd(&mut r as *mut _ as *mut f64, m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm256_mask_store_pd() { + #[repr(align(32))] + struct Align { + data: __m256d, + } + let mut r = Align { + data: _mm256_set1_pd(42.0), + }; + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm256_mask_store_pd(&mut r.data as *mut _ as *mut f64, m, a); + let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); + assert_eq_m256d(r.data, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr_pd() { let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); From 5f028ad9e2c61dae7a96fbf1755ee64ffe5bc020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 27 Nov 2021 21:51:21 +0100 Subject: [PATCH 06/11] Change tests to store into slices --- crates/core_arch/src/x86/avx512f.rs | 129 ++++++++++++---------------- 1 file changed, 57 insertions(+), 72 
deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 57d791f233..6410b5f386 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -45939,28 +45939,26 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_epi32() { - let mut r = _mm512_set1_epi32(42); + let mut r = [42_i32; 16]; let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let m = 0b11101000_11001010; - _mm512_mask_storeu_epi32(&mut r as *mut _ as *mut i32, m, a); + _mm512_mask_storeu_epi32(r.as_mut_ptr(), m, a); let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(r, e); + assert_eq_m512i(_mm512_loadu_epi32(r.as_ptr()), e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_store_epi32() { #[repr(align(64))] struct Align { - data: __m512i, + data: [i32; 16], } - let mut r = Align { - data: _mm512_set1_epi32(42), - }; + let mut r = Align { data: [42; 16] }; let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let m = 0b11101000_11001010; - _mm512_mask_store_epi32(&mut r.data as *mut _ as *mut i32, m, a); + _mm512_mask_store_epi32(r.data.as_mut_ptr(), m, a); let e = _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16); - assert_eq_m512i(r.data, e); + assert_eq_m512i(_mm512_load_epi32(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")] @@ -46019,28 +46017,27 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_epi64() { - let mut r = _mm512_set1_epi64(42); + let mut r = [42_i64; 8]; let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); let m = 0b11001010; - _mm512_mask_storeu_epi64(&mut r as *mut _ as *mut i64, m, a); + _mm512_mask_storeu_epi64(r.as_mut_ptr(), m, a); let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(r, e); + assert_eq_m512i(_mm512_loadu_epi64(r.as_ptr()), e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_store_epi64() { #[repr(align(64))] struct Align { - data: __m512i, + data: [i64; 8], } - let mut r = Align { - data: _mm512_set1_epi32(42), - }; + let mut r = Align { data: [42; 8] }; let a = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); let m = 0b11001010; - _mm512_mask_store_epi64(&mut r.data as *mut _ as *mut i64, m, a); + let p = r.data.as_mut_ptr(); + _mm512_mask_store_epi64(p, m, a); let e = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m512i(r.data, e); + assert_eq_m512i(_mm512_load_epi64(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")] @@ -46121,38 +46118,36 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_ps() { - let mut r = _mm512_set1_ps(42.0); + let mut r = [42_f32; 16]; let a = _mm512_setr_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, ); let m = 0b11101000_11001010; - _mm512_mask_storeu_ps(&mut r as *mut _ as *mut f32, m, a); + _mm512_mask_storeu_ps(r.as_mut_ptr(), m, a); let e = _mm512_setr_ps( 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, 16.0, ); - assert_eq_m512(r, e); + assert_eq_m512(_mm512_loadu_ps(r.as_ptr()), e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_store_ps() { #[repr(align(64))] struct Align { - data: __m512, + data: [f32; 16], } - let mut r = Align { - data: _mm512_set1_ps(42.0), - }; + let mut r = Align { data: [42.0; 16] }; let a = _mm512_setr_ps( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 
12.0, 13.0, 14.0, 15.0, 16.0, ); let m = 0b11101000_11001010; - _mm512_mask_store_ps(&mut r.data as *mut _ as *mut f32, m, a); + _mm512_mask_store_ps(r.data.as_mut_ptr(), m, a); let e = _mm512_setr_ps( 42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0, 16.0, ); - assert_eq_m512(r.data, e); + assert_eq_m512(_mm512_load_ps(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")] @@ -46211,28 +46206,26 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_storeu_pd() { - let mut r = _mm512_set1_pd(42.0); + let mut r = [42_f64; 8]; let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let m = 0b11001010; - _mm512_mask_storeu_pd(&mut r as *mut _ as *mut f64, m, a); + _mm512_mask_storeu_pd(r.as_mut_ptr(), m, a); let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m512d(r, e); + assert_eq_m512d(_mm512_loadu_pd(r.as_ptr()), e); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_store_pd() { #[repr(align(64))] struct Align { - data: __m512d, + data: [f64; 8], } - let mut r = Align { - data: _mm512_set1_pd(42.0), - }; + let mut r = Align { data: [42.0; 8] }; let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let m = 0b11001010; - _mm512_mask_store_pd(&mut r.data as *mut _ as *mut f64, m, a); + _mm512_mask_store_pd(r.data.as_mut_ptr(), m, a); let e = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m512d(r.data, e); + assert_eq_m512d(_mm512_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] @@ -46291,28 +46284,26 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_storeu_epi32() { - let mut r = _mm256_set1_epi32(42); + let mut r = [42_i32; 8]; let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); let m = 0b11001010; - _mm256_mask_storeu_epi32(&mut r as *mut _ as *mut i32, m, a); + _mm256_mask_storeu_epi32(r.as_mut_ptr(), m, a); let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m256i(r, e); + assert_eq_m256i(_mm256_loadu_epi32(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_store_epi32() { #[repr(align(64))] struct Align { - data: __m256i, + data: [i32; 8], } - let mut r = Align { - data: _mm256_set1_epi32(42), - }; + let mut r = Align { data: [42; 8] }; let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); let m = 0b11001010; - _mm256_mask_store_epi32(&mut r.data as *mut _ as *mut i32, m, a); + _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a); let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8); - assert_eq_m256i(r.data, e); + assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] @@ -46371,28 +46362,26 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_storeu_epi64() { - let mut r = _mm256_set1_epi64x(42); + let mut r = [42_i64; 4]; let a = _mm256_setr_epi64x(1, 2, 3, 4); let m = 0b1010; - _mm256_mask_storeu_epi64(&mut r as *mut _ as *mut i64, m, a); + _mm256_mask_storeu_epi64(r.as_mut_ptr(), m, a); let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(r, e); + assert_eq_m256i(_mm256_loadu_epi64(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_store_epi64() { #[repr(align(32))] struct Align { - data: __m256i, + data: [i64; 4], } - let mut r = Align { - data: _mm256_set1_epi32(42), - }; + let mut r = Align { data: [42; 4] }; let a = _mm256_setr_epi64x(1, 2, 3, 4); let m = 0b1010; - _mm256_mask_store_epi64(&mut r.data as *mut _ as *mut i64, 
m, a); + _mm256_mask_store_epi64(r.data.as_mut_ptr(), m, a); let e = _mm256_setr_epi64x(42, 2, 42, 4); - assert_eq_m256i(r.data, e); + assert_eq_m256i(_mm256_load_epi64(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] @@ -46451,28 +46440,26 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_storeu_ps() { - let mut r = _mm256_set1_ps(42.0); + let mut r = [42_f32; 8]; let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let m = 0b11001010; - _mm256_mask_storeu_ps(&mut r as *mut _ as *mut f32, m, a); + _mm256_mask_storeu_ps(r.as_mut_ptr(), m, a); let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(r, e); + assert_eq_m256(_mm256_loadu_ps(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_store_ps() { #[repr(align(32))] struct Align { - data: __m256, + data: [f32; 8], } - let mut r = Align { - data: _mm256_set1_ps(42.0), - }; + let mut r = Align { data: [42.0; 8] }; let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); let m = 0b11001010; - _mm256_mask_store_ps(&mut r.data as *mut _ as *mut f32, m, a); + _mm256_mask_store_ps(r.data.as_mut_ptr(), m, a); let e = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0); - assert_eq_m256(r.data, e); + assert_eq_m256(_mm256_load_ps(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] @@ -46531,28 +46518,26 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_storeu_pd() { - let mut r = _mm256_set1_pd(42.0); + let mut r = [42_f64; 4]; let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); let m = 0b1010; - _mm256_mask_storeu_pd(&mut r as *mut _ as *mut f64, m, a); + _mm256_mask_storeu_pd(r.as_mut_ptr(), m, a); let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(r, e); + assert_eq_m256d(_mm256_loadu_pd(r.as_ptr()), e); } #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_store_pd() { #[repr(align(32))] struct Align { - data: __m256d, + data: [f64; 4], } - let mut r = Align { - data: _mm256_set1_pd(42.0), - }; + let mut r = Align { data: [42.0; 4] }; let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); let m = 0b1010; - _mm256_mask_store_pd(&mut r.data as *mut _ as *mut f64, m, a); + _mm256_mask_store_pd(r.data.as_mut_ptr(), m, a); let e = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0); - assert_eq_m256d(r.data, e); + assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); } #[simd_test(enable = "avx512f")] From c3b7347ca930e8cd4e06d70311677bc867352b05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 27 Nov 2021 22:24:41 +0100 Subject: [PATCH 07/11] Tests for 128-bit variants --- crates/core_arch/src/x86/avx512f.rs | 308 ++++++++++++++++++++++++++++ 1 file changed, 308 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 6410b5f386..3c9bcbc2fa 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -46540,6 +46540,314 @@ mod tests { assert_eq_m256d(_mm256_load_pd(r.data.as_ptr()), e); } + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi32() { + let src = _mm_set1_epi32(42); + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi32() { + let a = &[1_i32, 2, 3, 4]; + let p = a.as_ptr(); + let m = 0b1010; + let r = 
_mm_maskz_loadu_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let src = _mm_set1_epi32(42); + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_epi32(src, m, black_box(p)); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let a = Align { + data: [1_i32, 2, 3, 4], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_epi32(m, black_box(p)); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi32() { + let mut r = [42_i32; 4]; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_storeu_epi32(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_loadu_epi32(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi32() { + #[repr(align(16))] + struct Align { + data: [i32; 4], // 16 bytes + } + let mut r = Align { data: [42; 4] }; + let a = _mm_setr_epi32(1, 2, 3, 4); + let m = 0b1010; + _mm_mask_store_epi32(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi32(42, 2, 42, 4); + assert_eq_m128i(_mm_load_epi32(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_epi64() { + let src = _mm_set1_epi64x(42); + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi64() { + let a = &[1_i64, 2]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let src = _mm_set1_epi64x(42); + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_epi64(src, m, black_box(p)); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let a = Align { data: [1_i64, 2] }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_epi64(m, black_box(p)); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_epi64() { + let mut r = [42_i64; 2]; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_storeu_epi64(r.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + assert_eq_m128i(_mm_loadu_epi64(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_epi64() { + #[repr(align(16))] + struct Align { + data: [i64; 2], // 16 bytes + } + let mut r = Align { data: [42; 2] }; + let a = _mm_setr_epi64x(1, 2); + let m = 0b10; + _mm_mask_store_epi64(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_epi64x(42, 2); + 
assert_eq_m128i(_mm_load_epi64(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_ps() { + let src = _mm_set1_ps(42.0); + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_mask_loadu_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_ps() { + let a = &[1.0_f32, 2.0, 3.0, 4.0]; + let p = a.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_loadu_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let src = _mm_set1_ps(42.0); + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_mask_load_ps(src, m, black_box(p)); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let a = Align { + data: [1.0_f32, 2.0, 3.0, 4.0], + }; + let p = a.data.as_ptr(); + let m = 0b1010; + let r = _mm_maskz_load_ps(m, black_box(p)); + let e = _mm_setr_ps(0.0, 2.0, 0.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_ps() { + let mut r = [42_f32; 4]; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_storeu_ps(r.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_loadu_ps(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_ps() { + #[repr(align(16))] + struct Align { + data: [f32; 4], // 16 bytes + } + let mut r = Align { data: [42.0; 4] }; + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let m = 0b1010; + _mm_mask_store_ps(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_ps(42.0, 2.0, 42.0, 4.0); + assert_eq_m128(_mm_load_ps(r.data.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_loadu_pd() { + let src = _mm_set1_pd(42.0); + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_mask_loadu_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_loadu_pd() { + let a = &[1.0_f64, 2.0]; + let p = a.as_ptr(); + let m = 0b10; + let r = _mm_maskz_loadu_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let src = _mm_set1_pd(42.0); + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_mask_load_pd(src, m, black_box(p)); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_maskz_load_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let a = Align { + data: [1.0_f64, 2.0], + }; + let p = a.data.as_ptr(); + let m = 0b10; + let r = _mm_maskz_load_pd(m, black_box(p)); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_storeu_pd() { + let mut r = 
[42_f64; 2]; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_storeu_pd(r.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_loadu_pd(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512vl")] + unsafe fn test_mm_mask_store_pd() { + #[repr(align(16))] + struct Align { + data: [f64; 2], // 16 bytes + } + let mut r = Align { data: [42.0; 2] }; + let a = _mm_setr_pd(1.0, 2.0); + let m = 0b10; + _mm_mask_store_pd(r.data.as_mut_ptr(), m, a); + let e = _mm_setr_pd(42.0, 2.0); + assert_eq_m128d(_mm_load_pd(r.data.as_ptr()), e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_setr_pd() { let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); From 2c22e09a66e0946e14328e511e12d12cfd93e53d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sat, 27 Nov 2021 22:31:11 +0100 Subject: [PATCH 08/11] Update avx512f checklist --- crates/core_arch/avx512f.md | 146 ++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 1ad80147cf..9d95f0c492 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1784,113 +1784,113 @@ * [x] [`_mm512_setzero_si512`] * [x] [`_mm512_setzero`] * [x] [`_mm512_load_epi32`] - * [ ] [`_mm512_mask_load_epi32`] //need i1 - * [ ] [`_mm512_maskz_load_epi32`] //need i1 + * [x] [`_mm512_mask_load_epi32`] //need i1 + * [x] [`_mm512_maskz_load_epi32`] //need i1 * [x] [`_mm_load_epi32`] - * [_] [`_mm_mask_load_epi32`] //need i1 - * [_] [`_mm_maskz_load_epi32`] //need i1 + * [x] [`_mm_mask_load_epi32`] //need i1 + * [x] [`_mm_maskz_load_epi32`] //need i1 * [x] [`_mm256_load_epi32`] - * [_] [`_mm256_mask_load_epi32`] //need i1 - * [_] [`_mm256_maskz_load_epi32`] //need i1 + * [x] [`_mm256_mask_load_epi32`] //need i1 + * [x] [`_mm256_maskz_load_epi32`] //need i1 * [x] [`_mm512_load_epi64`] - * [ ] [`_mm512_mask_load_epi64`] //need i1 - * [ ] [`_mm512_maskz_load_epi64`] //need i1 + * [x] [`_mm512_mask_load_epi64`] //need i1 + * [x] [`_mm512_maskz_load_epi64`] //need i1 * [x] [`_mm_load_epi64`] //need i1 - * [_] [`_mm_mask_load_epi64`] //need i1 - * [_] [`_mm_maskz_load_epi64`] //need i1 + * [x] [`_mm_mask_load_epi64`] //need i1 + * [x] [`_mm_maskz_load_epi64`] //need i1 * [x] [`_mm256_load_epi64`] //need i1 - * [_] [`_mm256_mask_load_epi64`] //need i1 - * [_] [`_mm256_maskz_load_epi64`] //need i1 + * [x] [`_mm256_mask_load_epi64`] //need i1 + * [x] [`_mm256_maskz_load_epi64`] //need i1 * [x] [`_mm512_load_ps`] - * [ ] [`_mm512_mask_load_ps`] //need i1 - * [ ] [`_mm512_maskz_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i - * [_] [`_mm_mask_load_ps`] //need i1 - * [_] [`_mm_maskz_load_ps`] //need i1 - * [_] [`_mm256_mask_load_ps`] //need i1 - * [_] [`_mm256_maskz_load_ps`] //need i1 + * [x] [`_mm512_mask_load_ps`] //need i1 + * [x] [`_mm512_maskz_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i + * [x] [`_mm_mask_load_ps`] //need i1 + * [x] [`_mm_maskz_load_ps`] //need i1 + * [x] [`_mm256_mask_load_ps`] //need i1 + * [x] [`_mm256_maskz_load_ps`] //need i1 * [x] [`_mm512_load_pd`] - * [ ] [`_mm512_mask_load_pd`] //need i1 - * [ ] [`_mm512_maskz_load_pd`] //need i1 - * [_] [`_mm_mask_load_pd`] //need i1 - * [_] [`_mm_maskz_load_pd`] //need i1 - * [_] [`_mm256_mask_load_pd`] //need i1 - * [_] [`_mm256_maskz_load_pd`] //need i1 + * [x] [`_mm512_mask_load_pd`] //need i1 + * [x] [`_mm512_maskz_load_pd`] //need i1 + * [x] [`_mm_mask_load_pd`] //need i1 + * [x] 
[`_mm_maskz_load_pd`] //need i1 + * [x] [`_mm256_mask_load_pd`] //need i1 + * [x] [`_mm256_maskz_load_pd`] //need i1 * [x] [`_mm512_load_si512`] * [x] [`_mm512_loadu_epi32`] - * [ ] [`_mm512_mask_loadu_epi32`] //need i1 + * [x] [`_mm512_mask_loadu_epi32`] //need i1 * [x] [`_mm_loadu_epi32`] - * [_] [`_mm_mask_loadu_epi32`] //need i1 - * [_] [`_mm_maskz_loadu_epi32`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi32`] //need i1 + * [x] [`_mm_mask_loadu_epi32`] //need i1 + * [x] [`_mm_maskz_loadu_epi32`] //need i1 + * [x] [`_mm512_maskz_loadu_epi32`] //need i1 * [x] [`_mm256_loadu_epi32`] - * [_] [`_mm256_mask_loadu_epi32`] //need i1 - * [_] [`_mm256_maskz_loadu_epi32`] //need i1 + * [x] [`_mm256_mask_loadu_epi32`] //need i1 + * [x] [`_mm256_maskz_loadu_epi32`] //need i1 * [x] [`_mm512_loadu_epi64`] - * [ ] [`_mm512_mask_loadu_epi64`] //need i1 - * [ ] [`_mm512_maskz_loadu_epi64`] //need i1 + * [x] [`_mm512_mask_loadu_epi64`] //need i1 + * [x] [`_mm512_maskz_loadu_epi64`] //need i1 * [x] [`_mm_loadu_epi64`] - * [_] [`_mm_mask_loadu_epi64`] //need i1 - * [_] [`_mm_maskz_loadu_epi64`] //need i1 + * [x] [`_mm_mask_loadu_epi64`] //need i1 + * [x] [`_mm_maskz_loadu_epi64`] //need i1 * [x] [`_mm256_loadu_epi64`] - * [_] [`_mm256_mask_loadu_epi64`] //need i1 - * [_] [`_mm256_maskz_loadu_epi64`] //need i1 + * [x] [`_mm256_mask_loadu_epi64`] //need i1 + * [x] [`_mm256_maskz_loadu_epi64`] //need i1 * [x] [`_mm512_loadu_ps`] - * [ ] [`_mm512_mask_loadu_ps`] //need i1 - * [ ] [`_mm512_maskz_loadu_ps`] //need i1 - * [_] [`_mm_mask_loadu_ps`] //need i1 - * [_] [`_mm_maskz_loadu_ps`] //need i1 - * [_] [`_mm256_mask_loadu_ps`] //need i1 - * [_] [`_mm256_maskz_loadu_ps`] //need i1 + * [x] [`_mm512_mask_loadu_ps`] //need i1 + * [x] [`_mm512_maskz_loadu_ps`] //need i1 + * [x] [`_mm_mask_loadu_ps`] //need i1 + * [x] [`_mm_maskz_loadu_ps`] //need i1 + * [x] [`_mm256_mask_loadu_ps`] //need i1 + * [x] [`_mm256_maskz_loadu_ps`] //need i1 * [x] [`_mm512_loadu_pd`] - * [ ] [`_mm512_mask_loadu_pd`] //need i1 - * [ ] [`_mm512_maskz_loadu_pd`] //need i1 - * [_] [`_mm_mask_loadu_pd`] //need i1 - * [_] [`_mm_maskz_loadu_pd`] //need i1 - * [_] [`_mm256_mask_loadu_pd`] //need i1 - * [_] [`_mm256_maskz_loadu_pd`] //need i1 + * [x] [`_mm512_mask_loadu_pd`] //need i1 + * [x] [`_mm512_maskz_loadu_pd`] //need i1 + * [x] [`_mm_mask_loadu_pd`] //need i1 + * [x] [`_mm_maskz_loadu_pd`] //need i1 + * [x] [`_mm256_mask_loadu_pd`] //need i1 + * [x] [`_mm256_maskz_loadu_pd`] //need i1 * [x] [`_mm512_loadu_si512`] * [x] [`_mm512_store_epi32`] - * [ ] [`_mm512_mask_store_epi32`] //need i1 - * [_] [`_mm_mask_store_epi32`] //need i1 + * [x] [`_mm512_mask_store_epi32`] //need i1 + * [x] [`_mm_mask_store_epi32`] //need i1 * [x] [`_mm_store_epi32`] - * [_] [`_mm256_mask_store_epi32`] //need i1 + * [x] [`_mm256_mask_store_epi32`] //need i1 * [x] [`_mm256_store_epi32`] * [x] [`_mm512_store_epi64`] - * [ ] [`_mm512_mask_store_epi64`] //need i1 - * [_] [`_mm_mask_store_epi64`] //need i1 + * [x] [`_mm512_mask_store_epi64`] //need i1 + * [x] [`_mm_mask_store_epi64`] //need i1 * [x] [`_mm_store_epi64`] - * [_] [`_mm256_mask_store_epi64`] //need i1 + * [x] [`_mm256_mask_store_epi64`] //need i1 * [x] [`_mm256_store_epi64`] * [x] [`_mm512_store_ps`] - * [ ] [`_mm512_mask_store_ps`] //need i1 - * [_] [`_mm_mask_store_ps`] //need i1 - * [_] [`_mm256_mask_store_ps`] //need i1 + * [x] [`_mm512_mask_store_ps`] //need i1 + * [x] [`_mm_mask_store_ps`] //need i1 + * [x] [`_mm256_mask_store_ps`] //need i1 * [x] [`_mm512_store_pd`] - * [ ] [`_mm512_mask_store_pd`] 
//need i1 - * [_] [`_mm_mask_store_pd`] //need i1 - * [_] [`_mm256_mask_store_pd`] //need i1 + * [x] [`_mm512_mask_store_pd`] //need i1 + * [x] [`_mm_mask_store_pd`] //need i1 + * [x] [`_mm256_mask_store_pd`] //need i1 * [x] [`_mm512_store_si512`] * [x] [`_mm512_storeu_epi32`] - * [ ] [`_mm512_mask_storeu_epi32`] //need i1 - * [_] [`_mm_mask_storeu_epi32`] //need i1 + * [x] [`_mm512_mask_storeu_epi32`] //need i1 + * [x] [`_mm_mask_storeu_epi32`] //need i1 * [x] [`_mm_storeu_epi32`] - * [_] [`_mm256_mask_storeu_epi32`] //need i1 + * [x] [`_mm256_mask_storeu_epi32`] //need i1 * [x] [`_mm256_storeu_epi32`] * [x] [`_mm512_storeu_epi64`] - * [ ] [`_mm512_mask_storeu_epi64`] //need i1 - * [_] [`_mm_mask_storeu_epi64`] //need i1 + * [x] [`_mm512_mask_storeu_epi64`] //need i1 + * [x] [`_mm_mask_storeu_epi64`] //need i1 * [x] [`_mm_storeu_epi64`] - * [_] [`_mm256_mask_storeu_epi64`] //need i1 + * [x] [`_mm256_mask_storeu_epi64`] //need i1 * [x] [`_mm256_storeu_epi64`] * [x] [`_mm512_storeu_ps`] - * [ ] [`_mm512_mask_storeu_ps`] //need i1 - * [_] [`_mm_mask_storeu_ps`] //need i1 - * [_] [`_mm256_mask_storeu_ps`] //need i1 + * [x] [`_mm512_mask_storeu_ps`] //need i1 + * [x] [`_mm_mask_storeu_ps`] //need i1 + * [x] [`_mm256_mask_storeu_ps`] //need i1 * [x] [`_mm512_storeu_pd`] - * [ ] [`_mm512_mask_storeu_pd`] //need i1 - * [_] [`_mm_mask_storeu_pd`] //need i1 - * [_] [`_mm256_mask_storeu_pd`] //need i1 + * [x] [`_mm512_mask_storeu_pd`] //need i1 + * [x] [`_mm_mask_storeu_pd`] //need i1 + * [x] [`_mm256_mask_storeu_pd`] //need i1 * [x] [`_mm512_storeu_si512`] * [ ] [`_mm512_stream_load_si512`] //stream_load_si256, ... not implment yet * [x] [`_mm512_stream_pd`] From 1f7d501f0a98db04f070f926695c85b21c82a8f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 28 Nov 2021 12:03:29 +0100 Subject: [PATCH 09/11] Add avx512bw masked load and stores --- crates/core_arch/src/x86/avx512bw.rs | 324 +++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 10e0096339..b871bc4297 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4227,6 +4227,330 @@ pub unsafe fn _mm_storeu_epi8(mem_addr: *mut i8, a: __m128i) { ptr::write_unaligned(mem_addr as *mut __m128i, a); } +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi16(src: __m512i, k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
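+///
+/// A minimal usage sketch (the buffer and mask below are illustrative
+/// assumptions, not taken from the Intel reference): bit `i` of `k` decides
+/// whether lane `i` is loaded from memory or zeroed.
+///
+/// ```ignore
+/// // Assumes avx512f and avx512bw were detected at runtime.
+/// let data = [7_i16; 32];
+/// // Only the two lowest lanes are loaded; the other 30 lanes become 0.
+/// let v = _mm512_maskz_loadu_epi16(0b11, data.as_ptr());
+/// ```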
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi16(k: __mmask32, mem_addr: *const i16) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_loadu_epi8(src: __m512i, k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_maskz_loadu_epi8(k: __mmask64, mem_addr: *const i8) -> __m512i { + let mut dst: __m512i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(zmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi16(src: __m256i, k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi16(k: __mmask16, mem_addr: *const i16) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
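+///
+/// A minimal writemask sketch (the values below are illustrative assumptions):
+/// lanes whose mask bit is clear keep the corresponding element of `src`.
+///
+/// ```ignore
+/// // Assumes avx512bw and avx512vl were detected at runtime.
+/// let src = _mm256_set1_epi8(-1);
+/// let bytes = [7_i8; 32];
+/// // Even lanes are loaded from `bytes`, odd lanes keep -1 from `src`.
+/// let v = _mm256_mask_loadu_epi8(src, 0x5555_5555, bytes.as_ptr());
+/// ```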
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_loadu_epi8(src: __m256i, k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m256i { + let mut dst: __m256i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(ymm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu16 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 16-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu16 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using writemask k +/// (elements are copied from src when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i = src; + asm!( + "vmovdqu8 {2}{{{1}}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + inout(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Load packed 8-bit integers from memory into dst using zeromask k +/// (elements are zeroed out when the corresponding mask bit is not set). +/// mem_addr does not need to be aligned on any particular boundary. 
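+///
+/// A small sketch of the zeromask behaviour (buffer and mask are illustrative
+/// assumptions): lanes with a cleared mask bit are set to 0.
+///
+/// ```ignore
+/// // Assumes avx512bw and avx512vl were detected at runtime.
+/// let bytes = [9_i8; 16];
+/// // The low 8 lanes are loaded, the high 8 lanes are zeroed.
+/// let v = _mm_maskz_loadu_epi8(0x00ff, bytes.as_ptr());
+/// ```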
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { + let mut dst: __m128i; + asm!( + "vmovdqu8 {2}{{{1}}} {{z}}, [{0}]", + in(reg) mem_addr, + in(kreg) k, + out(xmm_reg) dst, + options(pure, readonly, nostack) + ); + dst +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask32, a: __m512i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub unsafe fn _mm512_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask64, a: __m512i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(zmm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask16, a: __m256i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m256i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(ymm_reg) a, + options(nostack) + ); +} + +/// Store packed 16-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi16) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { + asm!( + "vmovdqu16 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + +/// Store packed 8-bit integers from a into memory using writemask k. +/// mem_addr does not need to be aligned on any particular boundary. 
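+///
+/// A minimal masked-store sketch (destination contents and mask are
+/// illustrative assumptions): only lanes whose mask bit is set are written.
+///
+/// ```ignore
+/// // Assumes avx512bw and avx512vl were detected at runtime.
+/// let mut out = [0_i8; 16];
+/// let a = _mm_set1_epi8(5);
+/// // Writes 5 into out[0] and out[1]; the remaining 14 bytes stay 0.
+/// _mm_mask_storeu_epi8(out.as_mut_ptr(), 0b11, a);
+/// ```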
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi8) +#[inline] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { + asm!( + "vmovdqu8 [{0}]{{{1}}}, {2}", + in(reg) mem_addr, + in(kreg) mask, + in(xmm_reg) a, + options(nostack) + ); +} + /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_madd_epi16&expand=3511) From 7d28014612584263776b59c22a17ecc6542d6990 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 28 Nov 2021 23:20:29 +0100 Subject: [PATCH 10/11] Tests for avx512bw masked loads and stores --- crates/core_arch/src/x86/avx512bw.rs | 278 +++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index b871bc4297..6d71e19f0a 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -14150,6 +14150,284 @@ mod tests { assert_eq_m128i(r, a); } + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi16() { + let src = _mm512_set1_epi16(42); + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi16() { + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi16(m, black_box(p)); + let e = &[ + 0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi16() { + let mut r = [42_i16; 32]; + let a = &[ + 1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm512_loadu_epi16(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm512_loadu_epi16(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_loadu_epi8() { + let src = _mm512_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 
64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let p = a.as_ptr(); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + let r = _mm512_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw")] + unsafe fn test_mm512_mask_storeu_epi8() { + let mut r = [42_i8; 64]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + ]; + let a = _mm512_loadu_epi8(a.as_ptr()); + let m = 0b00000000_11111111_11111111_00000000_10101010_11001100_11101000_11001010; + _mm512_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, 42, 42, 42, 42, 42, 42, 42, 42, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 42, 42, 42, 42, 42, 42, 42, 42, + ]; + let e = _mm512_loadu_epi8(e.as_ptr()); + assert_eq_m512i(_mm512_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi16() { + let src = _mm256_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_mask_loadu_epi16(src, m, black_box(p)); + let e = &[ + 42_i16, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm256_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi16() { + let mut r = [42_i16; 16]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm256_loadu_epi16(a.as_ptr()); + let m = 0b11101000_11001010; + _mm256_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[ + 42_i16, 2, 42, 4, 42, 
42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm256_loadu_epi16(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_loadu_epi8() { + let src = _mm256_set1_epi8(42); + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_maskz_loadu_epi8() { + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let p = a.as_ptr(); + let m = 0b10101010_11001100_11101000_11001010; + let r = _mm256_maskz_loadu_epi8(m, black_box(p)); + let e = &[ + 0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16, 0, 0, 19, 20, 0, 0, 23, 24, 0, + 26, 0, 28, 0, 30, 0, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm256_mask_storeu_epi8() { + let mut r = [42_i8; 32]; + let a = &[ + 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let a = _mm256_loadu_epi8(a.as_ptr()); + let m = 0b10101010_11001100_11101000_11001010; + _mm256_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, 42, 42, 19, 20, 42, 42, + 23, 24, 42, 26, 42, 28, 42, 30, 42, 32, + ]; + let e = _mm256_loadu_epi8(e.as_ptr()); + assert_eq_m256i(_mm256_loadu_epi8(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi16() { + let src = _mm_set1_epi16(42); + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_mask_loadu_epi16(src, m, black_box(p)); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi16() { + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let p = a.as_ptr(); + let m = 0b11001010; + let r = _mm_maskz_loadu_epi16(m, black_box(p)); + let e = &[0_i16, 2, 0, 4, 0, 0, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi16() { + let mut r = [42_i16; 8]; + let a = &[1_i16, 2, 3, 4, 5, 6, 7, 8]; + let a = _mm_loadu_epi16(a.as_ptr()); + let m = 0b11001010; + _mm_mask_storeu_epi16(r.as_mut_ptr(), m, a); + let e = &[42_i16, 2, 42, 4, 42, 42, 7, 8]; + let e = _mm_loadu_epi16(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi16(r.as_ptr()), e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_loadu_epi8() { + let src = _mm_set1_epi8(42); + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_mask_loadu_epi8(src, m, black_box(p)); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + 
assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_maskz_loadu_epi8() { + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let p = a.as_ptr(); + let m = 0b11101000_11001010; + let r = _mm_maskz_loadu_epi8(m, black_box(p)); + let e = &[0_i8, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx512f,avx512bw,avx512vl")] + unsafe fn test_mm_mask_storeu_epi8() { + let mut r = [42_i8; 16]; + let a = &[1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let a = _mm_loadu_epi8(a.as_ptr()); + let m = 0b11101000_11001010; + _mm_mask_storeu_epi8(r.as_mut_ptr(), m, a); + let e = &[ + 42_i8, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16, + ]; + let e = _mm_loadu_epi8(e.as_ptr()); + assert_eq_m128i(_mm_loadu_epi8(r.as_ptr()), e); + } + #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_madd_epi16() { let a = _mm512_set1_epi16(1); From 53f193bd8fb8395147f8f202fce1714435fbdc8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= Date: Sun, 28 Nov 2021 23:45:02 +0100 Subject: [PATCH 11/11] Using xmm registers seems to require sse target_feature on CI --- crates/core_arch/src/x86/avx512bw.rs | 12 +++---- crates/core_arch/src/x86/avx512f.rs | 48 ++++++++++++++-------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 6d71e19f0a..0363004674 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4385,7 +4385,7 @@ pub unsafe fn _mm256_maskz_loadu_epi8(k: __mmask32, mem_addr: *const i8) -> __m2 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i16) -> __m128i { let mut dst: __m128i = src; asm!( @@ -4404,7 +4404,7 @@ pub unsafe fn _mm_mask_loadu_epi16(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128i { let mut dst: __m128i; asm!( @@ -4423,7 +4423,7 @@ pub unsafe fn _mm_maskz_loadu_epi16(k: __mmask8, mem_addr: *const i16) -> __m128 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i8) -> __m128i { let mut dst: __m128i = src; asm!( @@ -4442,7 +4442,7 @@ pub unsafe fn _mm_mask_loadu_epi8(src: __m128i, k: __mmask16, mem_addr: *const i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_epi8(k: __mmask16, mem_addr: *const i8) -> __m128i { let mut dst: __m128i; 
asm!( @@ -4524,7 +4524,7 @@ pub unsafe fn _mm256_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask32, a: __m /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi16) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m128i) { asm!( "vmovdqu16 [{0}]{{{1}}}, {2}", @@ -4540,7 +4540,7 @@ pub unsafe fn _mm_mask_storeu_epi16(mem_addr: *mut i16, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi8) #[inline] -#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512bw,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128i) { asm!( "vmovdqu8 [{0}]{{{1}}}, {2}", diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 3c9bcbc2fa..7633442aa6 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -30633,7 +30633,7 @@ pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i = src; asm!( @@ -30652,7 +30652,7 @@ pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i; asm!( @@ -30671,7 +30671,7 @@ pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i = src; asm!( @@ -30690,7 +30690,7 @@ pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i; asm!( @@ -30709,7 +30709,7 @@ pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128 = src; asm!( @@ -30728,7 +30728,7 @@ pub unsafe fn 
_mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128; asm!( @@ -30747,7 +30747,7 @@ pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d = src; asm!( @@ -30766,7 +30766,7 @@ pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d; asm!( @@ -31089,7 +31089,7 @@ pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i = src; asm!( @@ -31108,7 +31108,7 @@ pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i3 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i { let mut dst: __m128i; asm!( @@ -31127,7 +31127,7 @@ pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i = src; asm!( @@ -31146,7 +31146,7 @@ pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i6 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i { let mut dst: __m128i; asm!( @@ -31165,7 +31165,7 @@ pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn 
_mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128 = src; asm!( @@ -31184,7 +31184,7 @@ pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) - /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { let mut dst: __m128; asm!( @@ -31203,7 +31203,7 @@ pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d = src; asm!( @@ -31222,7 +31222,7 @@ pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d { let mut dst: __m128d; asm!( @@ -31368,7 +31368,7 @@ pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m25 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { asm!( "vmovdqu32 [{0}]{{{1}}}, {2}", @@ -31384,7 +31384,7 @@ pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { asm!( "vmovdqu64 [{0}]{{{1}}}, {2}", @@ -31400,7 +31400,7 @@ pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m12 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { asm!( "vmovups [{0}]{{{1}}}, {2}", @@ -31416,7 +31416,7 @@ pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { asm!( "vmovupd [{0}]{{{1}}}, {2}", @@ -31560,7 +31560,7 @@ pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32) 
#[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) { asm!( "vmovdqa32 [{0}]{{{1}}}, {2}", @@ -31576,7 +31576,7 @@ pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) { asm!( "vmovdqa64 [{0}]{{{1}}}, {2}", @@ -31592,7 +31592,7 @@ pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128 /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { asm!( "vmovaps [{0}]{{{1}}}, {2}", @@ -31608,7 +31608,7 @@ pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd) #[inline] -#[target_feature(enable = "avx512f,avx512vl,avx")] +#[target_feature(enable = "avx512f,avx512vl,avx,sse")] pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) { asm!( "vmovapd [{0}]{{{1}}}, {2}",