From 1982d871475ffb8c2c160a96916a76fdcb6eb11c Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 18 Jan 2025 15:37:14 -0500
Subject: [PATCH 1/3] Merge commit '3383cfbd3572465febc7a8f816a46304373de46a'
 into sync-from-portable-simd-2025-01-18

---
 .github/workflows/ci.yml                   | 125 +++++-----
 .github/workflows/doc.yml                  |   2 +-
 .gitignore                                 |   1 +
 Cargo.toml                                 |   6 +
 Cross.toml                                 |   2 +
 crates/core_simd/Cargo.toml                |   3 +-
 crates/core_simd/src/lane_count.rs         |   8 +-
 crates/core_simd/src/lib.rs                |   4 +-
 crates/core_simd/src/masks.rs              |  42 ----
 crates/core_simd/src/masks/bitmask.rs      |  17 --
 crates/core_simd/src/masks/full_masks.rs   |  58 +----
 crates/core_simd/src/ops.rs                |  25 +-
 crates/core_simd/src/simd/cmp/eq.rs        |   2 +-
 crates/core_simd/src/simd/num/float.rs     |  30 ++-
 crates/core_simd/src/simd/num/int.rs       |  43 +++-
 crates/core_simd/src/simd/num/uint.rs      |  42 +++-
 crates/core_simd/src/simd/ptr/const_ptr.rs |  21 ++
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  21 ++
 crates/core_simd/src/swizzle.rs            | 257 ++++++++++++++++++++-
 crates/core_simd/src/swizzle_dyn.rs        |  59 ++++-
 crates/core_simd/src/vector.rs             |  43 +++-
 crates/core_simd/src/vendor.rs             |   3 +
 crates/core_simd/src/vendor/loongarch64.rs |  31 +++
 crates/core_simd/tests/layout.rs           |  35 +++
 crates/core_simd/tests/masks.rs            |  43 ----
 crates/core_simd/tests/ops_macros.rs       |  38 +++
 crates/core_simd/tests/swizzle.rs          |  18 ++
 crates/test_helpers/Cargo.toml             |   3 -
 crates/test_helpers/src/lib.rs             | 216 ++++++++---------
 rust-toolchain.toml                        |   3 +
 subtree-sync.sh                            |  52 +++++
 31 files changed, 865 insertions(+), 388 deletions(-)
 create mode 100644 Cross.toml
 create mode 100644 crates/core_simd/src/vendor/loongarch64.rs
 create mode 100644 crates/core_simd/tests/layout.rs
 create mode 100644 rust-toolchain.toml
 create mode 100755 subtree-sync.sh

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b292be2d6f9..3984d8f0d8d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,7 @@ on:
 env:
   CARGO_NET_RETRY: 10
   RUSTUP_MAX_RETRIES: 10
+  PROPTEST_CASES: 64
 
 jobs:
   rustfmt:
@@ -16,12 +17,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup component add rustfmt
+      - uses: actions/checkout@v4
       - name: Run rustfmt
         run: cargo fmt --all -- --check
 
@@ -37,7 +33,9 @@ jobs:
           - i686-unknown-linux-gnu
           - i586-unknown-linux-gnu
           - aarch64-unknown-linux-gnu
+          - arm64ec-pc-windows-msvc
           - armv7-unknown-linux-gnueabihf
+          - loongarch64-unknown-linux-gnu
           # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
           # - mips64-unknown-linux-gnuabi64
@@ -49,13 +47,9 @@ jobs:
           - wasm32-unknown-unknown
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
-          rustup component add clippy
+        run: rustup target add ${{ matrix.target }}
       - name: Run Clippy
         run: cargo clippy --all-targets --target ${{ matrix.target }}
 
@@ -65,26 +59,19 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        target: [x86_64-pc-windows-msvc, i686-pc-windows-msvc, i586-pc-windows-msvc, x86_64-unknown-linux-gnu, x86_64-apple-darwin]
+        target: [x86_64-pc-windows-msvc, i686-pc-windows-msvc, i586-pc-windows-msvc, x86_64-unknown-linux-gnu]
         # `default` means we use the default target config for the target,
         # `native` means we run with `-Ctarget-cpu=native`, and anything else is
         # an arg to `-Ctarget-feature`
         target_feature: [default, native, +sse3, +ssse3, +sse4.1, +sse4.2, +avx, +avx2]
 
         exclude:
-          # The macos runners seem to only reliably support up to `avx`.
-          - { target: x86_64-apple-darwin, target_feature: +avx2 }
-          # These features are statically known to be present for all 64 bit
-          # macs, and thus are covered by the `default` test
-          - { target: x86_64-apple-darwin, target_feature: +sse3 }
-          - { target: x86_64-apple-darwin, target_feature: +ssse3 }
           # -Ctarget-cpu=native sounds like bad-news if target != host
           - { target: i686-pc-windows-msvc, target_feature: native }
           - { target: i586-pc-windows-msvc, target_feature: native }
 
         include:
           # Populate the `matrix.os` field
-          - { target: x86_64-apple-darwin,      os: macos-latest }
           - { target: x86_64-unknown-linux-gnu, os: ubuntu-latest }
           - { target: x86_64-pc-windows-msvc,   os: windows-latest }
           - { target: i686-pc-windows-msvc,     os: windows-latest }
@@ -98,12 +85,9 @@ jobs:
           # avx512vl, but occasionally doesn't.  Maybe one day we can enable it.
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
+        run: rustup target add ${{ matrix.target }}
 
       - name: Configure RUSTFLAGS
         shell: bash
@@ -145,6 +129,35 @@ jobs:
         run: cargo doc --verbose --target=${{ matrix.target }}
         env:
           RUSTDOCFLAGS: -Dwarnings
+
+  macos-tests:
+    name: ${{ matrix.target }}
+    runs-on: macos-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - aarch64-apple-darwin
+          - x86_64-apple-darwin
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust
+        run: rustup target add ${{ matrix.target }}
+
+      - name: Configure RUSTFLAGS
+        shell: bash
+        run: echo "RUSTFLAGS=-Dwarnings" >> $GITHUB_ENV
+
+      - name: Test (debug)
+        run: cargo test --verbose --target=${{ matrix.target }}
+
+      - name: Test (release)
+        run: cargo test --verbose --target=${{ matrix.target }} --release
+
+      - name: Generate docs
+        run: cargo doc --verbose --target=${{ matrix.target }}
+        env:
+          RUSTDOCFLAGS: -Dwarnings
 
   wasm-tests:
     name: "wasm (firefox, ${{ matrix.name }})"
@@ -155,11 +168,7 @@ jobs:
           - { name: default, RUSTFLAGS: "" }
           - { name: simd128, RUSTFLAGS: "-C target-feature=+simd128" }
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
+      - uses: actions/checkout@v4
       - name: Install wasm-pack
         run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
       - name: Test (debug)
@@ -174,6 +183,8 @@ jobs:
   cross-tests:
     name: "${{ matrix.target_feature }} on ${{ matrix.target }} (via cross)"
     runs-on: ubuntu-latest
+    env:
+      PROPTEST_CASES: 16
     strategy:
       fail-fast: false
 
@@ -185,6 +196,7 @@ jobs:
           - powerpc-unknown-linux-gnu
           - powerpc64le-unknown-linux-gnu       # includes altivec by default
           - riscv64gc-unknown-linux-gnu
+          - loongarch64-unknown-linux-gnu
           # MIPS uses a nonstandard binary representation for NaNs which makes it worth testing
           # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
@@ -201,24 +213,14 @@ jobs:
           # - { target: riscv64gc-unknown-linux-gnu, target_feature: "+v,+zvl128b" }
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
-          rustup component add rust-src
+        run: rustup target add ${{ matrix.target }}
 
       - name: Install Cross
-        # Equivalent to `cargo install cross`, but downloading a prebuilt
-        # binary. Ideally we wouldn't hardcode a version, but the version number
-        # being part of the tarball means we can't just use the download/latest
-        # URL :(
+        # Install a pinned git revision (rather than a tagged release) to get support for newer targets.
         run: |
-          CROSS_URL=https://github.com/cross-rs/cross/releases/download/v0.2.5/cross-x86_64-unknown-linux-gnu.tar.gz
-          mkdir -p "$HOME/.bin"
-          curl -sfSL --retry-delay 10 --retry 5 "${CROSS_URL}" | tar zxf - -C "$HOME/.bin"
-          echo "$HOME/.bin" >> $GITHUB_PATH
+          cargo install cross --git https://github.com/cross-rs/cross --rev 4090beca3cfffa44371a5bba524de3a578aa46c3
 
       - name: Configure Emulated CPUs
         run: |
@@ -242,34 +244,11 @@ jobs:
       - name: Test (release)
         run: cross test --verbose --target=${{ matrix.target }} --release
 
-  features:
-    name: "Test cargo features (${{ matrix.simd }} × ${{ matrix.features }})"
+  miri:
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        simd:
-          - ""
-          - "avx512"
-        features:
-          - ""
-          - "--features std"
-          - "--features all_lane_counts"
-          - "--all-features"
-
+    env:
+      PROPTEST_CASES: 16
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-      - name: Detect AVX512
-        run: echo "CPU_FEATURE=$(lscpu | grep -o avx512[a-z]* | sed s/avx/+avx/ | tr '\n' ',' )" >> $GITHUB_ENV
-      - name: Check build
-        if: ${{ matrix.simd == '' }}
-        run: RUSTFLAGS="-Dwarnings" cargo test --all-targets --no-default-features ${{ matrix.features }}
-      - name: Check AVX
-        if: ${{ matrix.simd == 'avx512' && contains(env.CPU_FEATURE, 'avx512') }}
-        run: |
-          echo "Found AVX features: $CPU_FEATURE"
-          RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo test --all-targets --no-default-features ${{ matrix.features }}
+      - uses: actions/checkout@v4
+      - name: Test (Miri)
+        run: cargo miri test
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
index 9d1fa66ccb5..22c2cb3f67f 100644
--- a/.github/workflows/doc.yml
+++ b/.github/workflows/doc.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
       - name: Checkout Repository
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
 
       - name: Setup Rust
         run: |
diff --git a/.gitignore b/.gitignore
index ea8c4bf7f35..9673e52dcad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+git-subtree.sh
diff --git a/Cargo.toml b/Cargo.toml
index d1732aaec2f..21d4584a9f4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,3 +5,9 @@ members = [
     "crates/std_float",
     "crates/test_helpers",
 ]
+
+[profile.test.package."*"]
+opt-level = 2
+
+[profile.test.package.test_helpers]
+opt-level = 2
diff --git a/Cross.toml b/Cross.toml
new file mode 100644
index 00000000000..d21e76b92dd
--- /dev/null
+++ b/Cross.toml
@@ -0,0 +1,2 @@
+[build.env]
+passthrough = ["PROPTEST_CASES"]
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index b4a8fd70f4c..a7a6d43b11d 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -9,10 +9,9 @@ categories = ["hardware-support", "no-std"]
 license = "MIT OR Apache-2.0"
 
 [features]
-default = ["as_crate"]
+default = ["as_crate", "std"]
 as_crate = []
 std = []
-all_lane_counts = []
 
 [target.'cfg(target_arch = "wasm32")'.dev-dependencies]
 wasm-bindgen = "0.2"
diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index 4cd7265ed67..280b27bc9bc 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -33,10 +33,8 @@ macro_rules! supported_lane_count {
     };
 }
 
-supported_lane_count!(1, 2, 4, 8, 16, 32, 64);
-#[cfg(feature = "all_lane_counts")]
 supported_lane_count!(
-    3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-    31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-    56, 57, 58, 59, 60, 61, 62, 63
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
 );
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 992a7705e3c..7f57847c9c2 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,7 +1,6 @@
 #![no_std]
 #![feature(
-    const_refs_to_cell,
-    const_mut_refs,
+    const_eval_select,
     convert_float_to_int,
     core_intrinsics,
     decl_macro,
@@ -26,6 +25,7 @@
     all(target_arch = "arm", target_feature = "v7"),
     feature(stdarch_arm_neon_intrinsics)
 )]
+#![cfg_attr(target_arch = "loongarch64", feature(stdarch_loongarch))]
 #![cfg_attr(
     any(target_arch = "powerpc", target_arch = "powerpc64"),
     feature(stdarch_powerpc)
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index 04de3a96827..b763a7c75a5 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -308,48 +308,6 @@ where
         Self(mask_impl::Mask::from_bitmask_integer(bitmask))
     }
 
-    /// Creates a bitmask vector from a mask.
-    ///
-    /// Each bit is set if the corresponding element in the mask is `true`.
-    /// The remaining bits are unset.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::mask32x8;
-    /// let mask = mask32x8::from_array([true, false, true, false, false, false, true, false]);
-    /// assert_eq!(mask.to_bitmask_vector()[0], 0b01000101);
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new integer and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        self.0.to_bitmask_vector()
-    }
-
-    /// Creates a mask from a bitmask vector.
-    ///
-    /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{mask32x8, u8x8};
-    /// let bitmask = u8x8::from_array([0b01000101, 0, 0, 0, 0, 0, 0, 0]);
-    /// assert_eq!(
-    ///     mask32x8::from_bitmask_vector(bitmask),
-    ///     mask32x8::from_array([true, false, true, false, false, false, true, false]),
-    /// );
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        Self(mask_impl::Mask::from_bitmask_vector(bitmask))
-    }
-
     /// Finds the index of the first set element.
     ///
     /// ```
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 96c553426ee..db4312d5bf8 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -122,23 +122,6 @@ where
         unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
     }
 
-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-        bitmask.as_mut_array()[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-        let len = bytes.as_ref().len();
-        bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-        Self(bytes, PhantomData)
-    }
-
     #[inline]
     pub fn to_bitmask_integer(self) -> u64 {
         let mut bitmask = [0u8; 8];
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 87f031a9f36..2d01946b574 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -140,62 +140,6 @@ where
         unsafe { Mask(core::intrinsics::simd::simd_cast(self.0)) }
     }
 
-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            // Compute the bitmask
-            let mut bytes: <LaneCount<N> as SupportedLaneCount>::BitMask =
-                core::intrinsics::simd::simd_bitmask(self.0);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits()
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            bitmask.as_mut_array()[..bytes.as_ref().len()].copy_from_slice(bytes.as_ref());
-        }
-
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            let len = bytes.as_ref().len();
-            bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits();
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            // Compute the regular mask
-            Self::from_int_unchecked(core::intrinsics::simd::simd_select_bitmask(
-                bytes,
-                Self::splat(true).to_int(),
-                Self::splat(false).to_int(),
-            ))
-        }
-    }
-
     #[inline]
     unsafe fn to_bitmask_impl<U: ReverseBits, const M: usize>(self) -> U
     where
@@ -283,7 +227,7 @@ where
     }
 
     #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn all(self) -> bool {
         // Safety: use `self` as an integer vector
         unsafe { core::intrinsics::simd::simd_reduce_all(self.to_int()) }
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index dd7303a97b1..d3bd14a3402 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -77,7 +77,7 @@ macro_rules! int_divrem_guard {
     (   $lhs:ident,
         $rhs:ident,
         {   const PANIC_ZERO: &'static str = $zero:literal;
-            $simd_call:ident
+            $simd_call:ident, $op:tt
         },
         $int:ident ) => {
         if $rhs.simd_eq(Simd::splat(0 as _)).any() {
@@ -96,8 +96,23 @@ macro_rules! int_divrem_guard {
                 // Nice base case to make it easy to const-fold away the other branch.
                 $rhs
             };
-            // Safety: $lhs and rhs are vectors
-            unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+
+            // aarch64 div fails for arbitrary `v % 0`, mod fails when rhs is MIN, for non-powers-of-two
+            // these operations aren't vectorized on aarch64 anyway
+            #[cfg(target_arch = "aarch64")]
+            {
+                let mut out = Simd::splat(0 as _);
+                for i in 0..Self::LEN {
+                    out[i] = $lhs[i] $op rhs[i];
+                }
+                out
+            }
+
+            #[cfg(not(target_arch = "aarch64"))]
+            {
+                // Safety: $lhs and rhs are vectors
+                unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+            }
         }
     };
 }
@@ -205,14 +220,14 @@ for_base_ops! {
     impl Div::div {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to divide by zero";
-            simd_div
+            simd_div, /
         }
     }
 
     impl Rem::rem {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
-            simd_rem
+            simd_rem, %
         }
     }
 
diff --git a/crates/core_simd/src/simd/cmp/eq.rs b/crates/core_simd/src/simd/cmp/eq.rs
index 5b4615ce51d..93989ce91b8 100644
--- a/crates/core_simd/src/simd/cmp/eq.rs
+++ b/crates/core_simd/src/simd/cmp/eq.rs
@@ -12,7 +12,7 @@ pub trait SimdPartialEq {
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_eq(self, other: Self) -> Self::Mask;
 
-    /// Test if each element is equal to the corresponding element in `other`.
+    /// Test if each element is not equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_ne(self, other: Self) -> Self::Mask;
 }
diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index 59e43851ea8..79954b937b3 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -255,6 +255,7 @@ macro_rules! impl_trait {
             type Bits = Simd<$bits_ty, N>;
             type Cast<T: SimdElement> = Simd<T, N>;
 
+            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T>
             {
@@ -262,6 +263,33 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
+            // https://github.com/llvm/llvm-project/issues/94694
+            #[cfg(target_arch = "aarch64")]
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T>
+            {
+                const { assert!(N <= 64) };
+                if N <= 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64 {
+                    // Safety: supported types are guaranteed by SimdCast
+                    unsafe { core::intrinsics::simd::simd_as(self) }
+                } else if N < 4 {
+                    let x = self.resize::<4>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 8 {
+                    let x = self.resize::<8>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 16 {
+                    let x = self.resize::<16>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 32 {
+                    let x = self.resize::<32>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else {
+                    let x = self.resize::<64>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                }
+            }
+
             #[inline]
             #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
             unsafe fn to_int_unchecked<I: SimdCast>(self) -> Self::Cast<I>
@@ -391,7 +419,7 @@ macro_rules! impl_trait {
                     self.as_array().iter().sum()
                 } else {
                     // Safety: `self` is a float vector
-                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0.) }
+                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, -0.) }
                 }
             }
 
diff --git a/crates/core_simd/src/simd/num/int.rs b/crates/core_simd/src/simd/num/int.rs
index d7598d9ceaf..3a51235ff95 100644
--- a/crates/core_simd/src/simd/num/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
+    cmp::SimdOrd, cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
     SupportedLaneCount,
 };
 
@@ -70,11 +70,27 @@ pub trait SimdInt: Copy + Sealed {
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
     /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
-    /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]);
+    /// let xs = Simd::from_array([MIN, MIN + 1, -5, 0]);
     /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0]));
     /// ```
     fn abs(self) -> Self;
 
+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::i32::{MIN, MAX};
+    /// let a = Simd::from_array([MIN, MAX, 100, -100]);
+    /// let b = Simd::from_array([MAX, MIN, -80, -120]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([u32::MAX, u32::MAX, 180, 20]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self::Unsigned;
+
     /// Lanewise saturating absolute value, implemented in Rust.
     /// As abs(), except the MIN value becomes MAX instead of itself.
     ///
@@ -203,6 +219,12 @@ pub trait SimdInt: Copy + Sealed {
     /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
     fn reverse_bits(self) -> Self;
 
+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self::Unsigned;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self::Unsigned;
+
     /// Returns the number of leading zeros in the binary representation of each element.
     fn leading_zeros(self) -> Self::Unsigned;
 
@@ -259,6 +281,13 @@ macro_rules! impl_trait {
                 (self^m) - m
             }
 
+            #[inline]
+            fn abs_diff(self, second: Self) -> Self::Unsigned {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                (max - min).cast()
+            }
+
             #[inline]
             fn saturating_abs(self) -> Self {
                 // arith shift for -1 or 0 mask based on sign bit, giving 2s complement
@@ -344,6 +373,16 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
+            #[inline]
+            fn count_ones(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_ones()
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_zeros()
+            }
+
             #[inline]
             fn leading_zeros(self) -> Self::Unsigned {
                 self.cast::<$unsigned>().leading_zeros()
diff --git a/crates/core_simd/src/simd/num/uint.rs b/crates/core_simd/src/simd/num/uint.rs
index 53dd97f501c..1ab2d8c7b73 100644
--- a/crates/core_simd/src/simd/num/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
+use crate::simd::{cmp::SimdOrd, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
 
 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
@@ -57,6 +57,22 @@ pub trait SimdUint: Copy + Sealed {
     /// assert_eq!(sat, Simd::splat(0));
     fn saturating_sub(self, second: Self) -> Self;
 
+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::u32::MAX;
+    /// let a = Simd::from_array([0, MAX, 100, 20]);
+    /// let b = Simd::from_array([MAX, 0, 80, 200]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([MAX, MAX, 20, 180]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self;
+
     /// Returns the sum of the elements of the vector, with wrapping addition.
     fn reduce_sum(self) -> Self::Scalar;
 
@@ -85,6 +101,12 @@ pub trait SimdUint: Copy + Sealed {
     /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
     fn reverse_bits(self) -> Self;
 
+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self;
+
     /// Returns the number of leading zeros in the binary representation of each element.
     fn leading_zeros(self) -> Self;
 
@@ -138,6 +160,13 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_saturating_sub(self, second) }
             }
 
+            #[inline]
+            fn abs_diff(self, second: Self) -> Self {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                max - min
+            }
+
             #[inline]
             fn reduce_sum(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
@@ -192,6 +221,17 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
+            #[inline]
+            fn count_ones(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { core::intrinsics::simd::simd_ctpop(self) }
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self {
+                (!self).count_ones()
+            }
+
             #[inline]
             fn leading_zeros(self) -> Self {
                 // Safety: `self` is an integer vector
diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index be635ea640b..47383809ffb 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -42,6 +42,19 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
+    /// Converts an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
     /// Creates a new pointer with the given address.
     ///
     /// This performs the same operation as a cast, but copies the *address-space* and
@@ -118,6 +131,14 @@ where
         unsafe { core::mem::transmute_copy(&self) }
     }
 
+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
     #[inline]
     fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index f6823a949e3..3f20eef21a3 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -39,6 +39,19 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
+    /// Converts an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
     /// Creates a new pointer with the given address.
     ///
     /// This performs the same operation as a cast, but copies the *address-space* and
@@ -115,6 +128,14 @@ where
         unsafe { core::mem::transmute_copy(&self) }
     }
 
+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
     #[inline]
     fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index d62642fb906..42425ef37e5 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -155,8 +155,7 @@ pub trait Swizzle<const N: usize> {
 
     /// Creates a new mask from the elements of `mask`.
     ///
-    /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
-    /// `first` and `second`.
+    /// Element `i` of the output is `mask[Self::INDEX[i]]`.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original inputs"]
     fn swizzle_mask<T, const M: usize>(mask: Mask<T, M>) -> Mask<T, N>
@@ -260,6 +259,50 @@ where
         Rotate::<OFFSET>::swizzle(self)
     }
 
+    /// Shifts the vector elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = 0;
+                while i + OFFSET < N {
+                    index[i] = i + OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
+    }
+
+    /// Shifts the vector elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = OFFSET;
+                while i < N {
+                    index[i] = i - OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
+    }
+
     /// Interleave two vectors.
     ///
     /// The resulting vectors contain elements taken alternatively from `self` and `other`, first
@@ -320,7 +363,9 @@ where
     ///
     /// ```
     /// # #![feature(portable_simd)]
-    /// # use core::simd::Simd;
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::Simd;
     /// let a = Simd::from_array([0, 4, 1, 5]);
     /// let b = Simd::from_array([2, 6, 3, 7]);
     /// let (x, y) = a.deinterleave(b);
@@ -391,4 +436,210 @@ where
         }
         Resize::<N>::concat_swizzle(self, Simd::splat(value))
     }
+
+    /// Extract a vector from another vector.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
+    /// let x = u32x4::from_array([0, 1, 2, 3]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [1, 2]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Simd<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        struct Extract<const N: usize, const START: usize>;
+        impl<const N: usize, const START: usize, const LEN: usize> Swizzle<LEN> for Extract<N, START> {
+            const INDEX: [usize; LEN] = const {
+                assert!(START + LEN <= N, "index out of bounds");
+                let mut index = [0; LEN];
+                let mut i = 0;
+                while i < LEN {
+                    index[i] = START + i;
+                    i += 1;
+                }
+                index
+            };
+        }
+        Extract::<N, START>::swizzle(self)
+    }
+}
+
+impl<T, const N: usize> Mask<T, N>
+where
+    T: MaskElement,
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Reverse the order of the elements in the mask.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn reverse(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().reverse()) }
+    }
+
+    /// Rotates the mask such that the first `OFFSET` elements of the mask move to the end
+    /// while the last `self.len() - OFFSET` elements move to the front. After calling `rotate_elements_left`,
+    /// the element previously at index `OFFSET` will become the first element in the mask.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn rotate_elements_left<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_left::<OFFSET>()) }
+    }
+
+    /// Rotates the mask such that the first `self.len() - OFFSET` elements of the mask move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_elements_right`,
+    /// the element previously at index `self.len() - OFFSET` will become the first element in the mask.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn rotate_elements_right<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_right::<OFFSET>()) }
+    }
+
+    /// Shifts the mask elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: bool) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Shifts the mask elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: bool) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Interleave two masks.
+    ///
+    /// The resulting masks contain elements taken alternately from `self` and `other`, first
+    /// filling the first result, and then the second.
+    ///
+    /// The reverse of this operation is [`Mask::deinterleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.interleave(b);
+    /// assert_eq!(x.to_array(), [false, false, true, false]);
+    /// assert_eq!(y.to_array(), [false, true, true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn interleave(self, other: Self) -> (Self, Self) {
+        let (lo, hi) = self.to_int().interleave(other.to_int());
+        // Safety: swizzles are safe for masks
+        unsafe { (Self::from_int_unchecked(lo), Self::from_int_unchecked(hi)) }
+    }
+
+    /// Deinterleave two masks.
+    ///
+    /// The first result takes every other element of `self` and then `other`, starting with
+    /// the first element.
+    ///
+    /// The second result takes every other element of `self` and then `other`, starting with
+    /// the second element.
+    ///
+    /// The reverse of this operation is [`Mask::interleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.deinterleave(b);
+    /// assert_eq!(x.to_array(), [false, false, false, true]);
+    /// assert_eq!(y.to_array(), [true, true, false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn deinterleave(self, other: Self) -> (Self, Self) {
+        let (even, odd) = self.to_int().deinterleave(other.to_int());
+        // Safety: swizzles are safe for masks
+        unsafe {
+            (
+                Self::from_int_unchecked(even),
+                Self::from_int_unchecked(odd),
+            )
+        }
+    }
+
+    /// Resize a mask.
+    ///
+    /// If `M` > `N`, extends the length of a mask, setting the new elements to `value`.
+    /// If `M` < `N`, truncates the mask to the first `M` elements.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.resize::<8>(true).to_array(), [false, true, true, false, true, true, true, true]);
+    /// assert_eq!(x.resize::<2>(true).to_array(), [false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn resize<const M: usize>(self, value: bool) -> Mask<T, M>
+    where
+        LaneCount<M>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Mask::<T, M>::from_int_unchecked(self.to_int().resize::<M>(if value {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Extract a mask from another mask.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Mask<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe { Mask::<T, LEN>::from_int_unchecked(self.to_int().extract::<START, LEN>()) }
+    }
 }
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 3b6388d0f27..773bd028bae 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -59,15 +59,40 @@ where
                     target_endian = "little"
                 ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(
+                    target_arch = "arm",
+                    target_feature = "v7",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
+                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
-                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
-                // Notable absence: avx512bw shuffle
-                // If avx512bw is available, odds of avx512vbmi are good
-                // FIXME: initial AVX512VBMI variant didn't actually pass muster
-                // #[cfg(target_feature = "avx512vbmi")]
-                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                32 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 32>::splat(N as u8).into(),
+                        );
+                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
+                // Notable absence: avx512bw pshufb shuffle
+                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
+                64 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 64>::splat(N as u8).into(),
+                        );
+                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
                 _ => {
                     let mut array = [0; N];
                     for (i, k) in idxs.to_array().into_iter().enumerate() {
@@ -82,6 +107,28 @@ where
     }
 }
 
+/// armv7 neon supports swizzling `u8x16` by swizzling two u8x8 blocks
+/// with a u8x8x2 lookup table.
+///
+/// # Safety
+/// This requires armv7 neon to work
+#[cfg(all(
+    target_arch = "arm",
+    target_feature = "v7",
+    target_feature = "neon",
+    target_endian = "little"
+))]
+unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
+    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
+    // SAFETY: Caller promised arm neon support
+    unsafe {
+        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
+        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
+        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
+        vcombine_u8(lo, hi).into()
+    }
+}
+
 /// "vpshufb like it was meant to be" on AVX2
 ///
 /// # Safety
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 3e239169149..9c4dd36c24f 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -99,7 +99,7 @@ use crate::simd::{
 // directly constructing an instance of the type (i.e. `let vector = Simd(array)`) should be
 // avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also
 // causes rustc to emit illegal LLVM IR in some cases.
-#[repr(simd)]
+#[repr(simd, packed)]
 pub struct Simd<T, const N: usize>([T; N])
 where
     LaneCount<N>: SupportedLaneCount,
@@ -144,14 +144,32 @@ where
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
     #[inline]
-    pub fn splat(value: T) -> Self {
-        // This is preferred over `[value; N]`, since it's explicitly a splat:
-        // https://github.com/rust-lang/rust/issues/97804
-        struct Splat;
-        impl<const N: usize> Swizzle<N> for Splat {
-            const INDEX: [usize; N] = [0; N];
+    #[rustc_const_unstable(feature = "portable_simd", issue = "86656")]
+    pub const fn splat(value: T) -> Self {
+        const fn splat_const<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            Simd::from_array([value; N])
         }
-        Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+
+        fn splat_rt<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            // This is preferred over `[value; N]`, since it's explicitly a splat:
+            // https://github.com/rust-lang/rust/issues/97804
+            struct Splat;
+            impl<const N: usize> Swizzle<N> for Splat {
+                const INDEX: [usize; N] = [0; N];
+            }
+
+            Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+        }
+
+        core::intrinsics::const_eval_select((value,), splat_const, splat_rt)
     }
 
     /// Returns an array reference containing the entire SIMD vector.
@@ -425,6 +443,9 @@ where
     ///
     /// When the element is disabled, that memory location is not accessed and the corresponding
     /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled loads must not exceed the length of `slice`.
     #[must_use]
     #[inline]
     pub unsafe fn load_select_unchecked(
@@ -442,6 +463,9 @@ where
     ///
     /// When the element is disabled, that memory location is not accessed and the corresponding
     /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled `ptr` elements must be safe to read as if by `std::ptr::read`.
     #[must_use]
     #[inline]
     pub unsafe fn load_select_ptr(
@@ -924,6 +948,7 @@ where
     }
 }
 
+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use `simd_min` and `simd_max` instead.
 impl<T, const N: usize> PartialOrd for Simd<T, N>
 where
     LaneCount<N>: SupportedLaneCount,
@@ -943,6 +968,7 @@ where
 {
 }
 
+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use `simd_min` and `simd_max` instead.
 impl<T, const N: usize> Ord for Simd<T, N>
 where
     LaneCount<N>: SupportedLaneCount,
@@ -1195,6 +1221,7 @@ fn lane_indices<const N: usize>() -> Simd<usize, N>
 where
     LaneCount<N>: SupportedLaneCount,
 {
+    #![allow(clippy::needless_range_loop)]
     let mut index = [0; N];
     for i in 0..N {
         index[i] = i;
diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs
index 1a34a3a8de5..57536e4fc77 100644
--- a/crates/core_simd/src/vendor.rs
+++ b/crates/core_simd/src/vendor.rs
@@ -29,3 +29,6 @@ mod arm;
 
 #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
 mod powerpc;
+
+#[cfg(target_arch = "loongarch64")]
+mod loongarch64;
diff --git a/crates/core_simd/src/vendor/loongarch64.rs b/crates/core_simd/src/vendor/loongarch64.rs
new file mode 100644
index 00000000000..1290bc166b2
--- /dev/null
+++ b/crates/core_simd/src/vendor/loongarch64.rs
@@ -0,0 +1,31 @@
+use crate::simd::*;
+use core::arch::loongarch64::*;
+
+from_transmute! { unsafe u8x16 => v16u8 }
+from_transmute! { unsafe u8x32 => v32u8 }
+from_transmute! { unsafe i8x16 => v16i8 }
+from_transmute! { unsafe i8x32 => v32i8 }
+
+from_transmute! { unsafe u16x8 => v8u16 }
+from_transmute! { unsafe u16x16 => v16u16 }
+from_transmute! { unsafe i16x8 => v8i16 }
+from_transmute! { unsafe i16x16 => v16i16 }
+
+from_transmute! { unsafe u32x4 => v4u32 }
+from_transmute! { unsafe u32x8 => v8u32 }
+from_transmute! { unsafe i32x4 => v4i32 }
+from_transmute! { unsafe i32x8 => v8i32 }
+from_transmute! { unsafe f32x4 => v4f32 }
+from_transmute! { unsafe f32x8 => v8f32 }
+
+from_transmute! { unsafe u64x2 => v2u64 }
+from_transmute! { unsafe u64x4 => v4u64 }
+from_transmute! { unsafe i64x2 => v2i64 }
+from_transmute! { unsafe i64x4 => v4i64 }
+from_transmute! { unsafe f64x2 => v2f64 }
+from_transmute! { unsafe f64x4 => v4f64 }
+
+from_transmute! { unsafe usizex2 => v2u64 }
+from_transmute! { unsafe usizex4 => v4u64 }
+from_transmute! { unsafe isizex2 => v2i64 }
+from_transmute! { unsafe isizex4 => v4i64 }
diff --git a/crates/core_simd/tests/layout.rs b/crates/core_simd/tests/layout.rs
new file mode 100644
index 00000000000..24114c2d261
--- /dev/null
+++ b/crates/core_simd/tests/layout.rs
@@ -0,0 +1,35 @@
+#![feature(portable_simd)]
+
+macro_rules! layout_tests {
+    { $($mod:ident, $ty:ty,)* } => {
+        $(
+        mod $mod {
+            test_helpers::test_lanes! {
+                fn no_padding<const LANES: usize>() {
+                    assert_eq!(
+                        core::mem::size_of::<core_simd::simd::Simd::<$ty, LANES>>(),
+                        core::mem::size_of::<[$ty; LANES]>(),
+                    );
+                }
+            }
+        }
+        )*
+    }
+}
+
+layout_tests! {
+    i8, i8,
+    i16, i16,
+    i32, i32,
+    i64, i64,
+    isize, isize,
+    u8, u8,
+    u16, u16,
+    u32, u32,
+    u64, u64,
+    usize, usize,
+    f32, f32,
+    f64, f64,
+    mut_ptr, *mut (),
+    const_ptr, *const (),
+}
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index fc6a3476b7c..48786d02440 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -99,7 +99,6 @@ macro_rules! test_mask_api {
                 assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask);
             }
 
-            #[cfg(feature = "all_lane_counts")]
             #[test]
             fn roundtrip_bitmask_conversion_odd() {
                 let values = [
@@ -134,48 +133,6 @@ macro_rules! test_mask_api {
                 cast_impl::<i64>();
                 cast_impl::<isize>();
             }
-
-            #[test]
-            fn roundtrip_bitmask_vector_conversion() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, false, true, false, false, true, false,
-                    true, true, false, false, false, false, false, true,
-                ];
-                let mask = Mask::<$type, 16>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b01001001, 0b10000011]);
-                assert_eq!(Mask::<$type, 16>::from_bitmask_vector(bitmask), mask);
-            }
-
-            // rust-lang/portable-simd#379
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_small() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, true
-                ];
-                let mask = Mask::<$type, 4>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<1>(0).to_ne_bytes()[0], 0b00001101);
-                assert_eq!(Mask::<$type, 4>::from_bitmask_vector(bitmask), mask);
-            }
-
-            /* FIXME doesn't work with non-powers-of-two, yet
-            // rust-lang/portable-simd#379
-            #[cfg(feature = "all_lane_counts")]
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_odd() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, false, true, true, false, false, false, true, true,
-                ];
-                let mask = Mask::<$type, 11>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b00110101, 0b00000110]);
-                assert_eq!(Mask::<$type, 11>::from_bitmask_vector(bitmask), mask);
-            }
-            */
         }
     }
 }
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index aa565a13752..6de78f51e59 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -216,6 +216,22 @@ macro_rules! impl_common_integer_tests {
                 )
             }
 
+            fn count_ones<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_ones,
+                    &|x| x.count_ones() as _,
+                    &|_| true,
+                )
+            }
+
+            fn count_zeros<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_zeros,
+                    &|x| x.count_zeros() as _,
+                    &|_| true,
+                )
+            }
+
             fn leading_zeros<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
                     &$vector::<LANES>::leading_zeros,
@@ -307,6 +323,14 @@ macro_rules! impl_signed_tests {
                     assert_eq!(a % b, Vector::<LANES>::splat(0));
                 }
 
+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
+
                 fn simd_min<const LANES: usize>() {
                     use core_simd::simd::cmp::SimdOrd;
                     let a = Vector::<LANES>::splat(Scalar::MIN);
@@ -419,6 +443,14 @@ macro_rules! impl_unsigned_tests {
                         &|_| true,
                     );
                 }
+
+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
             }
 
             impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);
@@ -495,6 +527,9 @@ macro_rules! impl_float_tests {
                 }
 
                 fn is_normal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                     test_helpers::test_unary_mask_elementwise(
                         &Vector::<LANES>::is_normal,
                         &Scalar::is_normal,
@@ -503,6 +538,9 @@ macro_rules! impl_float_tests {
                 }
 
                 fn is_subnormal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                     test_helpers::test_unary_mask_elementwise(
                         &Vector::<LANES>::is_subnormal,
                         &Scalar::is_subnormal,
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 522d71439b7..7001e5f6bf8 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -48,6 +48,24 @@ fn rotate() {
     assert_eq!(a.rotate_elements_right::<5>().to_array(), [4, 1, 2, 3]);
 }
 
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn shift() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<1>(0).to_array(), [2, 3, 4, 0]);
+    assert_eq!(a.shift_elements_left::<2>(9).to_array(), [3, 4, 9, 9]);
+    assert_eq!(a.shift_elements_left::<3>(8).to_array(), [4, 8, 8, 8]);
+    assert_eq!(a.shift_elements_left::<4>(7).to_array(), [7, 7, 7, 7]);
+    assert_eq!(a.shift_elements_left::<5>(6).to_array(), [6, 6, 6, 6]);
+    assert_eq!(a.shift_elements_right::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_right::<1>(0).to_array(), [0, 1, 2, 3]);
+    assert_eq!(a.shift_elements_right::<2>(-1).to_array(), [-1, -1, 1, 2]);
+    assert_eq!(a.shift_elements_right::<3>(-2).to_array(), [-2, -2, -2, 1]);
+    assert_eq!(a.shift_elements_right::<4>(-3).to_array(), [-3, -3, -3, -3]);
+    assert_eq!(a.shift_elements_right::<5>(-4).to_array(), [-4, -4, -4, -4]);
+}
+
 #[test]
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn interleave() {
diff --git a/crates/test_helpers/Cargo.toml b/crates/test_helpers/Cargo.toml
index 23dae7c9338..a5359b9abc8 100644
--- a/crates/test_helpers/Cargo.toml
+++ b/crates/test_helpers/Cargo.toml
@@ -6,6 +6,3 @@ publish = false
 
 [dependencies]
 proptest = { version = "0.10", default-features = false, features = ["alloc"] }
-
-[features]
-all_lane_counts = []
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 51b860a8635..197c920e11e 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -539,32 +539,22 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_1 1;
                     lanes_2 2;
-                    lanes_4 4;
-                );
-
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    // Cover an odd and an even non-power-of-2 length in Miri.
+                    // (Even non-power-of-2 vectors have alignment between element
+                    // and vector size, so we want to cover that case as well.)
                     lanes_3 3;
-                    lanes_5 5;
+
                     lanes_6 6;
                 );
 
-                #[cfg(feature = "all_lane_counts")]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_4 4;
+                    lanes_5 5;
+
                     lanes_7 7;
+                    lanes_8 8;
                     lanes_9 9;
                     lanes_10 10;
                     lanes_11 11;
@@ -572,52 +562,55 @@ macro_rules! test_lanes {
                     lanes_13 13;
                     lanes_14 14;
                     lanes_15 15;
+                    lanes_16 16;
                     lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                     lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
+                    lanes_32 32;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                     lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                     lanes_56 56;
                     lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                     lanes_63 63;
+                    lanes_64 64;
                 );
             }
         )*
@@ -639,36 +632,24 @@ macro_rules! test_lanes_panic {
                     core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount,
                 $body
 
+                // test some odd and even non-power-of-2 lengths on miri
                 $crate::test_lanes_helper!(
                     #[should_panic];
                     lanes_1 1;
                     lanes_2 2;
-                    lanes_4 4;
-                );
-
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[should_panic];
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
-                    #[should_panic];
                     lanes_3 3;
-                    lanes_5 5;
+
                     lanes_6 6;
                 );
 
-                #[cfg(feature = "all_lane_counts")]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[should_panic];
+                    lanes_4 4;
+                    lanes_5 5;
+
                     lanes_7 7;
+                    lanes_8 8;
                     lanes_9 9;
                     lanes_10 10;
                     lanes_11 11;
@@ -676,52 +657,55 @@ macro_rules! test_lanes_panic {
                     lanes_13 13;
                     lanes_14 14;
                     lanes_15 15;
+                    lanes_16 16;
                     lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                     lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
+                    lanes_32 32;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                     lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                     lanes_56 56;
                     lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                     lanes_63 63;
+                    lanes_64 64;
                 );
             }
         )*
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
new file mode 100644
index 00000000000..d17c6d2e889
--- /dev/null
+++ b/rust-toolchain.toml
@@ -0,0 +1,3 @@
+[toolchain]
+channel = "nightly-2025-01-16"
+components = ["rustfmt", "clippy", "miri", "rust-src"]
diff --git a/subtree-sync.sh b/subtree-sync.sh
new file mode 100755
index 00000000000..18360077623
--- /dev/null
+++ b/subtree-sync.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Syncs the portable-simd subtree between this repository and a local checkout
+# of the portable-simd repository.
+#
+# Usage: subtree-sync.sh <push|pull> <path-to-portable-simd-checkout>
+#
+#   push  Pushes library/portable-simd to a `rust-upstream-<date>` branch of the
+#         checkout, then merges it into a fresh `sync-from-rust-<date>` branch
+#         created there from origin/master.
+#   pull  Creates a `sync-from-portable-simd-<date>` branch here and pulls the
+#         checkout's origin/master into library/portable-simd.
+#
+# Run it from the git root of this repository with HEAD at origin/master.
+
+set -euo pipefail
+
+usage() {
+    echo "Usage: $0 <push|pull> <path-to-portable-simd-checkout>" >&2
+    exit 1
+}
+
+# Validate the arguments before touching the network or either repository.
+if [ "$#" -ne 2 ]; then
+    usage
+fi
+
+action=$1
+simd_repo=$2
+
+case $action in
+    push | pull) ;;
+    *) usage ;;
+esac
+
+if [ "$(git rev-parse --show-prefix)" != "" ]; then
+    echo "Run this script from the git root" >&2
+    exit 1
+fi
+
+git fetch origin
+pushd "$simd_repo"
+git fetch origin
+popd
+
+if [ "$(git rev-parse HEAD)" != "$(git rev-parse origin/master)" ]; then
+    echo "$(pwd) is not at origin/master" >&2
+    exit 1
+fi
+
+subtree_script=library/portable-simd/git-subtree.sh
+
+if [ ! -f "$subtree_script" ]; then
+    # Download under a temporary name so that a failed or partial transfer is
+    # never mistaken for the real script on a later run; -f turns HTTP errors
+    # into a non-zero exit status instead of saving the error page.
+    curl -fsS https://raw.githubusercontent.com/bjorn3/git/tqc-subtree-portable/contrib/subtree/git-subtree.sh -o "$subtree_script.tmp"
+    chmod +x "$subtree_script.tmp"
+    mv "$subtree_script.tmp" "$subtree_script"
+fi
+
+today=$(date +%Y-%m-%d)
+
+case $action in
+    push)
+        upstream=rust-upstream-$today
+        merge=sync-from-rust-$today
+
+        pushd "$simd_repo"
+        git checkout master
+        git pull
+        popd
+
+        "$subtree_script" push -P library/portable-simd "$simd_repo" "$upstream"
+
+        pushd "$simd_repo"
+        git checkout -B "$merge" origin/master
+        git merge "$upstream"
+        popd
+        echo "Branch \`$merge\` created in \`$simd_repo\`. You may need to resolve merge conflicts."
+        ;;
+    pull)
+        branch=sync-from-portable-simd-$today
+
+        git checkout -B "$branch"
+        echo "Creating branch \`$branch\`... You may need to resolve merge conflicts."
+        "$subtree_script" pull -P library/portable-simd "$simd_repo" origin/master
+        ;;
+esac

From 02a28b29e2d1ab50b8d80c18018c33402cefc054 Mon Sep 17 00:00:00 2001
From: Samuel Tardieu <sam@rfc1149.net>
Date: Wed, 12 Feb 2025 10:35:32 +0100
Subject: [PATCH 2/3] Remove ignored `#[must_use]` attributes from
 portable-simd

The `#[must_use]` attribute has no effect when applied to methods in
trait implementations.
---
 crates/core_simd/src/masks.rs            | 13 -------------
 crates/core_simd/src/masks/full_masks.rs |  5 -----
 crates/core_simd/src/ops.rs              |  1 -
 crates/core_simd/src/ops/deref.rs        |  3 ---
 crates/core_simd/src/ops/unary.rs        |  2 --
 crates/core_simd/src/simd/num/float.rs   |  1 -
 6 files changed, 25 deletions(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index b763a7c75a5..19d45f4d3b3 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -401,7 +401,6 @@ where
     LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    #[must_use = "method returns a defaulted mask with all elements set to false (0)"]
     fn default() -> Self {
         Self::splat(false)
     }
@@ -413,7 +412,6 @@ where
     LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    #[must_use = "method returns a new bool and does not mutate the original value"]
     fn eq(&self, other: &Self) -> bool {
         self.0 == other.0
     }
@@ -425,7 +423,6 @@ where
     LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    #[must_use = "method returns a new Ordering and does not mutate the original value"]
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         self.0.partial_cmp(&other.0)
     }
@@ -451,7 +448,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
         Self(self.0 & rhs.0)
     }
@@ -464,7 +460,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: bool) -> Self {
         self & Self::splat(rhs)
     }
@@ -477,7 +472,6 @@ where
 {
     type Output = Mask<T, N>;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Mask<T, N>) -> Mask<T, N> {
         Mask::splat(self) & rhs
     }
@@ -490,7 +484,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
         Self(self.0 | rhs.0)
     }
@@ -503,7 +496,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: bool) -> Self {
         self | Self::splat(rhs)
     }
@@ -516,7 +508,6 @@ where
 {
     type Output = Mask<T, N>;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Mask<T, N>) -> Mask<T, N> {
         Mask::splat(self) | rhs
     }
@@ -529,7 +520,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self::Output {
         Self(self.0 ^ rhs.0)
     }
@@ -542,7 +532,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: bool) -> Self::Output {
         self ^ Self::splat(rhs)
     }
@@ -555,7 +544,6 @@ where
 {
     type Output = Mask<T, N>;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Mask<T, N>) -> Self::Output {
         Mask::splat(self) ^ rhs
     }
@@ -568,7 +556,6 @@ where
 {
     type Output = Mask<T, N>;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(self) -> Self::Output {
         Self(!self.0)
     }
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 2d01946b574..387b508c4b4 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -21,7 +21,6 @@ where
     LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn clone(&self) -> Self {
         *self
     }
@@ -252,7 +251,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
         unsafe { Self(core::intrinsics::simd::simd_and(self.0, rhs.0)) }
@@ -266,7 +264,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
         unsafe { Self(core::intrinsics::simd::simd_or(self.0, rhs.0)) }
@@ -280,7 +277,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
         unsafe { Self(core::intrinsics::simd::simd_xor(self.0, rhs.0)) }
@@ -294,7 +290,6 @@ where
 {
     type Output = Self;
     #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(self) -> Self::Output {
         Self::splat(true) ^ self
     }
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index d3bd14a3402..4ac64a253a3 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -135,7 +135,6 @@ macro_rules! for_base_types {
                     type Output = $out;
 
                     #[inline]
-                    #[must_use = "operator returns a new vector without mutating the inputs"]
                     // TODO: only useful for int Div::div, but we hope that this
                     // will essentially always get inlined anyway.
                     #[track_caller]
diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs
index 0ff76cfba39..913cbbe977c 100644
--- a/crates/core_simd/src/ops/deref.rs
+++ b/crates/core_simd/src/ops/deref.rs
@@ -18,7 +18,6 @@ macro_rules! deref_lhs {
             type Output = Simd<T, N>;
 
             #[inline]
-            #[must_use = "operator returns a new vector without mutating the inputs"]
             fn $call(self, rhs: $simd) -> Self::Output {
                 (*self).$call(rhs)
             }
@@ -39,7 +38,6 @@ macro_rules! deref_rhs {
             type Output = Simd<T, N>;
 
             #[inline]
-            #[must_use = "operator returns a new vector without mutating the inputs"]
             fn $call(self, rhs: &$simd) -> Self::Output {
                 self.$call(*rhs)
             }
@@ -71,7 +69,6 @@ macro_rules! deref_ops {
                 type Output = $simd;
 
                 #[inline]
-                #[must_use = "operator returns a new vector without mutating the inputs"]
                 fn $call(self, rhs: &'rhs $simd) -> Self::Output {
                     (*self).$call(*rhs)
                 }
diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs
index bdae96332a3..412a5b80117 100644
--- a/crates/core_simd/src/ops/unary.rs
+++ b/crates/core_simd/src/ops/unary.rs
@@ -11,7 +11,6 @@ macro_rules! neg {
             type Output = Self;
 
             #[inline]
-            #[must_use = "operator returns a new vector without mutating the input"]
             fn neg(self) -> Self::Output {
                 // Safety: `self` is a signed vector
                 unsafe { core::intrinsics::simd::simd_neg(self) }
@@ -46,7 +45,6 @@ macro_rules! not {
             type Output = Self;
 
             #[inline]
-            #[must_use = "operator returns a new vector without mutating the input"]
             fn not(self) -> Self::Output {
                 self ^ (Simd::splat(!(0 as $scalar)))
             }
diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index 79954b937b3..db705dfe202 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -371,7 +371,6 @@ macro_rules! impl_trait {
             }
 
             #[inline]
-            #[must_use = "method returns a new mask and does not mutate the original value"]
             fn is_normal(self) -> Self::Mask {
                 !(self.abs().simd_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite())
             }

From aaf8ff1f9ec9b7049a7896a3c278f9e990306cb8 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 19 Mar 2025 00:58:47 -0400
Subject: [PATCH 3/3] Merge commit 'c14f2fc3eb69c164d8bf8d36d91ebd60bd5261e6'
 into sync-from-portable-simd-2025-03-19

---
 beginners-guide.md                         |  4 +-
 crates/core_simd/Cargo.toml                |  2 +-
 crates/core_simd/src/lib.rs                |  6 ++-
 crates/core_simd/src/masks/bitmask.rs      | 22 +++++------
 crates/core_simd/src/masks/full_masks.rs   | 20 +++++-----
 crates/core_simd/src/ops.rs                |  2 +-
 crates/core_simd/src/simd/cmp/eq.rs        |  2 +-
 crates/core_simd/src/simd/cmp/ord.rs       |  2 +-
 crates/core_simd/src/simd/num/float.rs     |  9 +++--
 crates/core_simd/src/simd/num/int.rs       |  4 +-
 crates/core_simd/src/simd/num/uint.rs      |  2 +-
 crates/core_simd/src/simd/prelude.rs       |  3 +-
 crates/core_simd/src/simd/ptr/const_ptr.rs |  2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  2 +-
 crates/core_simd/src/swizzle.rs            | 44 ++++++++++++++++++++++
 crates/core_simd/src/to_bytes.rs           |  2 +-
 crates/core_simd/src/vector.rs             |  4 +-
 crates/core_simd/tests/layout.rs           |  4 +-
 crates/core_simd/tests/pointers.rs         |  2 +-
 crates/core_simd/tests/round.rs            |  2 +-
 crates/test_helpers/src/subnormals.rs      |  2 +-
 21 files changed, 96 insertions(+), 46 deletions(-)

diff --git a/beginners-guide.md b/beginners-guide.md
index 17ade06ae80..dc08d847ced 100644
--- a/beginners-guide.md
+++ b/beginners-guide.md
@@ -80,12 +80,12 @@ Most of the portable SIMD API is designed to allow the user to gloss over the de
 
 Fortunately, most SIMD types have a fairly predictable size. `i32x4` is bit-equivalent to `[i32; 4]` and so can be bitcast to it, e.g. using [`mem::transmute`], though the API usually offers a safe cast you can use instead.
 
-However, this is not the same as alignment. Computer architectures generally prefer aligned accesses, especially when moving data between memory and vector registers, and while some support specialized operations that can bend the rules to help with this, unaligned access is still typically slow, or even undefined behavior. In addition, different architectures can require different alignments when interacting with their native SIMD types. For this reason, any `#[repr(simd)]` type has a non-portable alignment. If it is necessary to directly interact with the alignment of these types, it should be via [`mem::align_of`].
+However, this is not the same as alignment. Computer architectures generally prefer aligned accesses, especially when moving data between memory and vector registers, and while some support specialized operations that can bend the rules to help with this, unaligned access is still typically slow, or even undefined behavior. In addition, different architectures can require different alignments when interacting with their native SIMD types. For this reason, any `#[repr(simd)]` type has a non-portable alignment. If it is necessary to directly interact with the alignment of these types, it should be via [`align_of`].
 
 When working with slices, data correctly aligned for SIMD can be acquired using the [`as_simd`] and [`as_simd_mut`] methods of the slice primitive.
 
 [`mem::transmute`]: https://doc.rust-lang.org/core/mem/fn.transmute.html
-[`mem::align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html
+[`align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html
 [`as_simd`]: https://doc.rust-lang.org/nightly/std/primitive.slice.html#method.as_simd
 [`as_simd_mut`]: https://doc.rust-lang.org/nightly/std/primitive.slice.html#method.as_simd_mut
 
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index a7a6d43b11d..537ce459c07 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "core_simd"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 homepage = "https://github.com/rust-lang/portable-simd"
 repository = "https://github.com/rust-lang/portable-simd"
 keywords = ["core", "simd", "intrinsics"]
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 7f57847c9c2..717b882b64b 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -35,7 +35,11 @@
     feature(stdarch_x86_avx512)
 )]
 #![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really
-#![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)]
+#![deny(
+    unsafe_op_in_unsafe_fn,
+    unreachable_pub,
+    clippy::undocumented_unsafe_blocks
+)]
 #![doc(test(attr(deny(warnings))))]
 #![allow(internal_features)]
 #![unstable(feature = "portable_simd", issue = "86656")]
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index db4312d5bf8..8221d8f17e9 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -5,7 +5,7 @@ use core::marker::PhantomData;
 
 /// A mask where each lane is represented by a single bit.
 #[repr(transparent)]
-pub struct Mask<T, const N: usize>(
+pub(crate) struct Mask<T, const N: usize>(
     <LaneCount<N> as SupportedLaneCount>::BitMask,
     PhantomData<T>,
 )
@@ -78,7 +78,7 @@ where
 {
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn splat(value: bool) -> Self {
+    pub(crate) fn splat(value: bool) -> Self {
         let mut mask = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
         if value {
             mask.as_mut().fill(u8::MAX)
@@ -93,12 +93,12 @@ where
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+    pub(crate) unsafe fn test_unchecked(&self, lane: usize) -> bool {
         (self.0.as_ref()[lane / 8] >> (lane % 8)) & 0x1 > 0
     }
 
     #[inline]
-    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+    pub(crate) unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
         unsafe {
             self.0.as_mut()[lane / 8] ^= ((value ^ self.test_unchecked(lane)) as u8) << (lane % 8)
         }
@@ -106,7 +106,7 @@ where
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_int(self) -> Simd<T, N> {
+    pub(crate) fn to_int(self) -> Simd<T, N> {
         unsafe {
             core::intrinsics::simd::simd_select_bitmask(
                 self.0,
@@ -118,19 +118,19 @@ where
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
+    pub(crate) unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
     }
 
     #[inline]
-    pub fn to_bitmask_integer(self) -> u64 {
+    pub(crate) fn to_bitmask_integer(self) -> u64 {
         let mut bitmask = [0u8; 8];
         bitmask[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
         u64::from_ne_bytes(bitmask)
     }
 
     #[inline]
-    pub fn from_bitmask_integer(bitmask: u64) -> Self {
+    pub(crate) fn from_bitmask_integer(bitmask: u64) -> Self {
         let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
         let len = bytes.as_mut().len();
         bytes
@@ -141,7 +141,7 @@ where
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn convert<U>(self) -> Mask<U, N>
+    pub(crate) fn convert<U>(self) -> Mask<U, N>
     where
         U: MaskElement,
     {
@@ -151,13 +151,13 @@ where
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub fn any(self) -> bool {
+    pub(crate) fn any(self) -> bool {
         self != Self::splat(false)
     }
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub fn all(self) -> bool {
+    pub(crate) fn all(self) -> bool {
         self == Self::splat(true)
     }
 }
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 387b508c4b4..4e98db4070a 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -3,7 +3,7 @@
 use crate::simd::{LaneCount, MaskElement, Simd, SupportedLaneCount};
 
 #[repr(transparent)]
-pub struct Mask<T, const N: usize>(Simd<T, N>)
+pub(crate) struct Mask<T, const N: usize>(Simd<T, N>)
 where
     T: MaskElement,
     LaneCount<N>: SupportedLaneCount;
@@ -80,7 +80,7 @@ macro_rules! impl_reverse_bits {
             #[inline(always)]
             fn reverse_bits(self, n: usize) -> Self {
                 let rev = <$int>::reverse_bits(self);
-                let bitsize = core::mem::size_of::<$int>() * 8;
+                let bitsize = size_of::<$int>() * 8;
                 if n < bitsize {
                     // Shift things back to the right
                     rev >> (bitsize - n)
@@ -102,36 +102,36 @@ where
 {
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn splat(value: bool) -> Self {
+    pub(crate) fn splat(value: bool) -> Self {
         Self(Simd::splat(if value { T::TRUE } else { T::FALSE }))
     }
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+    pub(crate) unsafe fn test_unchecked(&self, lane: usize) -> bool {
         T::eq(self.0[lane], T::TRUE)
     }
 
     #[inline]
-    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+    pub(crate) unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
         self.0[lane] = if value { T::TRUE } else { T::FALSE }
     }
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_int(self) -> Simd<T, N> {
+    pub(crate) fn to_int(self) -> Simd<T, N> {
         self.0
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
+    pub(crate) unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         Self(value)
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn convert<U>(self) -> Mask<U, N>
+    pub(crate) fn convert<U>(self) -> Mask<U, N>
     where
         U: MaskElement,
     {
@@ -220,14 +220,14 @@ where
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub fn any(self) -> bool {
+    pub(crate) fn any(self) -> bool {
         // Safety: use `self` as an integer vector
         unsafe { core::intrinsics::simd::simd_reduce_any(self.to_int()) }
     }
 
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub fn all(self) -> bool {
+    pub(crate) fn all(self) -> bool {
         // Safety: use `self` as an integer vector
         unsafe { core::intrinsics::simd::simd_reduce_all(self.to_int()) }
     }
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 4ac64a253a3..f36e8d01a73 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -1,4 +1,4 @@
-use crate::simd::{cmp::SimdPartialEq, LaneCount, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount, cmp::SimdPartialEq};
 use core::ops::{Add, Mul};
 use core::ops::{BitAnd, BitOr, BitXor};
 use core::ops::{Div, Rem, Sub};
diff --git a/crates/core_simd/src/simd/cmp/eq.rs b/crates/core_simd/src/simd/cmp/eq.rs
index 93989ce91b8..2312ba401fa 100644
--- a/crates/core_simd/src/simd/cmp/eq.rs
+++ b/crates/core_simd/src/simd/cmp/eq.rs
@@ -1,6 +1,6 @@
 use crate::simd::{
-    ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, Simd, SimdElement, SupportedLaneCount,
+    ptr::{SimdConstPtr, SimdMutPtr},
 };
 
 /// Parallel `PartialEq`.
diff --git a/crates/core_simd/src/simd/cmp/ord.rs b/crates/core_simd/src/simd/cmp/ord.rs
index 899f00a8316..e813e761303 100644
--- a/crates/core_simd/src/simd/cmp/ord.rs
+++ b/crates/core_simd/src/simd/cmp/ord.rs
@@ -1,7 +1,7 @@
 use crate::simd::{
+    LaneCount, Mask, Simd, SupportedLaneCount,
     cmp::SimdPartialEq,
     ptr::{SimdConstPtr, SimdMutPtr},
-    LaneCount, Mask, Simd, SupportedLaneCount,
 };
 
 /// Parallel `PartialOrd`.
diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index db705dfe202..b5972c47373 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -1,7 +1,7 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::{SimdPartialEq, SimdPartialOrd},
     LaneCount, Mask, Simd, SimdCast, SimdElement, SupportedLaneCount,
+    cmp::{SimdPartialEq, SimdPartialOrd},
 };
 
 /// Operations on SIMD vectors of floats.
@@ -263,7 +263,8 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
-            // https://github.com/llvm/llvm-project/issues/94694
+            // workaround for https://github.com/llvm/llvm-project/issues/94694 (fixed in LLVM 20)
+            // tracked in: https://github.com/rust-lang/rust/issues/135982
             #[cfg(target_arch = "aarch64")]
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T>
@@ -302,14 +303,14 @@ macro_rules! impl_trait {
 
             #[inline]
             fn to_bits(self) -> Simd<$bits_ty, N> {
-                assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Self::Bits>());
+                assert_eq!(size_of::<Self>(), size_of::<Self::Bits>());
                 // Safety: transmuting between vector types is safe
                 unsafe { core::mem::transmute_copy(&self) }
             }
 
             #[inline]
             fn from_bits(bits: Simd<$bits_ty, N>) -> Self {
-                assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Self::Bits>());
+                assert_eq!(size_of::<Self>(), size_of::<Self::Bits>());
                 // Safety: transmuting between vector types is safe
                 unsafe { core::mem::transmute_copy(&bits) }
             }
diff --git a/crates/core_simd/src/simd/num/int.rs b/crates/core_simd/src/simd/num/int.rs
index 3a51235ff95..d25050c3e4b 100644
--- a/crates/core_simd/src/simd/num/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -1,7 +1,7 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::SimdOrd, cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
-    SupportedLaneCount,
+    LaneCount, Mask, Simd, SimdCast, SimdElement, SupportedLaneCount, cmp::SimdOrd,
+    cmp::SimdPartialOrd, num::SimdUint,
 };
 
 /// Operations on SIMD vectors of signed integers.
diff --git a/crates/core_simd/src/simd/num/uint.rs b/crates/core_simd/src/simd/num/uint.rs
index 1ab2d8c7b73..45d978068b6 100644
--- a/crates/core_simd/src/simd/num/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{cmp::SimdOrd, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount, cmp::SimdOrd};
 
 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs
index 4b7c744c013..e5d7a2aeb73 100644
--- a/crates/core_simd/src/simd/prelude.rs
+++ b/crates/core_simd/src/simd/prelude.rs
@@ -7,10 +7,11 @@
 
 #[doc(no_inline)]
 pub use super::{
+    Mask, Simd,
     cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd},
     num::{SimdFloat, SimdInt, SimdUint},
     ptr::{SimdConstPtr, SimdMutPtr},
-    simd_swizzle, Mask, Simd,
+    simd_swizzle,
 };
 
 #[rustfmt::skip]
diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 47383809ffb..36452e7ae92 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{cmp::SimdPartialEq, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount, cmp::SimdPartialEq, num::SimdUint};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 3f20eef21a3..c644f390c20 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{cmp::SimdPartialEq, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount, cmp::SimdPartialEq, num::SimdUint};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 42425ef37e5..dbdd6ef40eb 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -214,6 +214,17 @@ where
     /// Rotates the vector such that the first `OFFSET` elements of the slice move to the end
     /// while the last `self.len() - OFFSET` elements move to the front. After calling `rotate_elements_left`,
     /// the element previously at index `OFFSET` will become the first element in the slice.
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let x = a.rotate_elements_left::<3>();
+    /// assert_eq!(x.to_array(), [3, 0, 1, 2]);
+    /// // With 4 lanes, an offset of 7 gives the same result as an offset of 3 (7 % 4).
+    /// let y = a.rotate_elements_left::<7>();
+    /// assert_eq!(y.to_array(), [3, 0, 1, 2]);
+    /// ```
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn rotate_elements_left<const OFFSET: usize>(self) -> Self {
@@ -238,6 +249,17 @@ where
     /// Rotates the vector such that the first `self.len() - OFFSET` elements of the vector move to
     /// the end while the last `OFFSET` elements move to the front. After calling `rotate_elements_right`,
     /// the element previously at index `self.len() - OFFSET` will become the first element in the slice.
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let x = a.rotate_elements_right::<3>();
+    /// assert_eq!(x.to_array(), [1, 2, 3, 0]);
+    /// // With 4 lanes, an offset of 7 gives the same result as an offset of 3 (7 % 4).
+    /// let y = a.rotate_elements_right::<7>();
+    /// assert_eq!(y.to_array(), [1, 2, 3, 0]);
+    /// ```
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn rotate_elements_right<const OFFSET: usize>(self) -> Self {
@@ -261,6 +283,17 @@ where
 
     /// Shifts the vector elements to the left by `OFFSET`, filling in with
     /// `padding` from the right.
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let x = a.shift_elements_left::<3>(255);
+    /// assert_eq!(x.to_array(), [3, 255, 255, 255]);
+    /// // With 4 lanes, an offset of 7 shifts every element out, leaving only padding.
+    /// let y = a.shift_elements_left::<7>(255);
+    /// assert_eq!(y.to_array(), [255, 255, 255, 255]);
+    /// ```
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
@@ -283,6 +316,17 @@ where
 
     /// Shifts the vector elements to the right by `OFFSET`, filling in with
     /// `padding` from the left.
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let x = a.shift_elements_right::<3>(255);
+    /// assert_eq!(x.to_array(), [255, 255, 255, 0]);
+    /// // With 4 lanes, an offset of 7 shifts every element out, leaving only padding.
+    /// let y = a.shift_elements_right::<7>(255);
+    /// assert_eq!(y.to_array(), [255, 255, 255, 255]);
+    /// ```
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
diff --git a/crates/core_simd/src/to_bytes.rs b/crates/core_simd/src/to_bytes.rs
index 4833ea9e113..fee2cc06c5b 100644
--- a/crates/core_simd/src/to_bytes.rs
+++ b/crates/core_simd/src/to_bytes.rs
@@ -1,6 +1,6 @@
 use crate::simd::{
-    num::{SimdFloat, SimdInt, SimdUint},
     LaneCount, Simd, SimdElement, SupportedLaneCount,
+    num::{SimdFloat, SimdInt, SimdUint},
 };
 
 mod sealed {
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 9c4dd36c24f..d76a6cd52bf 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,8 +1,8 @@
 use crate::simd::{
+    LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
     cmp::SimdPartialOrd,
     num::SimdUint,
     ptr::{SimdConstPtr, SimdMutPtr},
-    LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
 
 /// A SIMD vector with the shape of `[T; N]` but the operations of `T`.
@@ -83,7 +83,7 @@ use crate::simd::{
 /// converting `[T]` to `[Simd<T, N>]`, and allows soundly operating on an aligned SIMD body,
 /// but it may cost more time when handling the scalar head and tail.
 /// If these are not enough, it is most ideal to design data structures to be already aligned
-/// to `mem::align_of::<Simd<T, N>>()` before using `unsafe` Rust to read or write.
+/// to `align_of::<Simd<T, N>>()` before using `unsafe` Rust to read or write.
 /// Other ways to compensate for these facts, like materializing `Simd` to or from an array first,
 /// are handled by safe methods like [`Simd::from_array`] and [`Simd::from_slice`].
 ///
diff --git a/crates/core_simd/tests/layout.rs b/crates/core_simd/tests/layout.rs
index 24114c2d261..3b4666249b0 100644
--- a/crates/core_simd/tests/layout.rs
+++ b/crates/core_simd/tests/layout.rs
@@ -7,8 +7,8 @@ macro_rules! layout_tests {
             test_helpers::test_lanes! {
                 fn no_padding<const LANES: usize>() {
                     assert_eq!(
-                        core::mem::size_of::<core_simd::simd::Simd::<$ty, LANES>>(),
-                        core::mem::size_of::<[$ty; LANES]>(),
+                        size_of::<core_simd::simd::Simd::<$ty, LANES>>(),
+                        size_of::<[$ty; LANES]>(),
                     );
                 }
             }
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index d7db4e82b3c..6e74c2d18b1 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -1,8 +1,8 @@
 #![feature(portable_simd)]
 
 use core_simd::simd::{
-    ptr::{SimdConstPtr, SimdMutPtr},
     Simd,
+    ptr::{SimdConstPtr, SimdMutPtr},
 };
 
 macro_rules! common_tests {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 847766ec41e..4c1ac3c36f8 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -58,7 +58,7 @@ macro_rules! float_rounding_test {
                     // all of the mantissa digits set to 1, pushed up to the MSB.
                     const ALL_MANTISSA_BITS: IntScalar = ((1 << <Scalar>::MANTISSA_DIGITS) - 1);
                     const MAX_REPRESENTABLE_VALUE: Scalar =
-                        (ALL_MANTISSA_BITS << (core::mem::size_of::<Scalar>() * 8 - <Scalar>::MANTISSA_DIGITS as usize - 1)) as Scalar;
+                        (ALL_MANTISSA_BITS << (size_of::<Scalar>() * 8 - <Scalar>::MANTISSA_DIGITS as usize - 1)) as Scalar;
 
                     let mut runner = test_helpers::make_runner();
                     runner.run(
diff --git a/crates/test_helpers/src/subnormals.rs b/crates/test_helpers/src/subnormals.rs
index ec0f1fb24b9..b5f19ba47b8 100644
--- a/crates/test_helpers/src/subnormals.rs
+++ b/crates/test_helpers/src/subnormals.rs
@@ -12,7 +12,7 @@ macro_rules! impl_float {
         $(
         impl FlushSubnormals for $ty {
             fn flush(self) -> Self {
-                let is_f32 = core::mem::size_of::<Self>() == 4;
+                let is_f32 = size_of::<Self>() == 4;
                 let ppc_flush = is_f32 && cfg!(all(
                     any(target_arch = "powerpc", all(target_arch = "powerpc64", target_endian = "big")),
                     target_feature = "altivec",