From f759a7862be30f8e334a9b529ecab5169ba56c1a Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 26 May 2021 08:13:09 +0200
Subject: [PATCH 01/69] refactor: check_remainder()

---
 src/implementation/algorithm.rs | 34 ++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 85d881c4..c68ddd75 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -200,6 +200,21 @@ macro_rules! algorithm_simd {
                     panic!("Unsupported number of chunks");
                 }
             }
+
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(unconditional_panic)] // does not panic because len is checked
+            #[allow(const_err)] // the same, but for Rust 1.38.0
+            unsafe fn check_remainder(&mut self, input: *const u8, len: usize) {
+                let mut tmpbuf = TempSimdChunk::new();
+                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
+                    input,
+                    tmpbuf.0.as_mut_ptr(),
+                    len,
+                );
+                let simd_input = SimdInput::new(&tmpbuf.0);
+                self.check_utf8(simd_input);
+            }
         }
 
         /// Validation implementation for CPUs supporting the SIMD extension (see module).
@@ -241,14 +256,7 @@ macro_rules! algorithm_simd {
             }
 
             if idx < len {
-                let mut tmpbuf = TempSimdChunk::new();
-                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
-                    input.as_ptr().add(idx),
-                    tmpbuf.0.as_mut_ptr(),
-                    len - idx,
-                );
-                let simd_input = SimdInput::new(&tmpbuf.0);
-                algorithm.check_utf8(simd_input);
+                algorithm.check_remainder(input.as_ptr().add(idx), len - idx);
             }
             algorithm.check_incomplete_pending();
             if algorithm.has_error() {
@@ -331,15 +339,7 @@ macro_rules! algorithm_simd {
                 }
             }
             if idx < len {
-                let mut tmpbuf = TempSimdChunk::new();
-                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
-                    input.as_ptr().add(idx),
-                    tmpbuf.0.as_mut_ptr(),
-                    len - idx,
-                );
-                let simd_input = SimdInput::new(&tmpbuf.0);
-
-                algorithm.check_utf8(simd_input);
+                algorithm.check_remainder(input.as_ptr().add(idx), len - idx)
             }
             algorithm.check_incomplete_pending();
             if algorithm.has_error() {

From e562d487e3d12d406d555161b6562fd723fca7f7 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 26 May 2021 08:29:34 +0200
Subject: [PATCH 02/69] bit mor idiomatic

---
 src/implementation/algorithm.rs | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index c68ddd75..3969e349 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -186,18 +186,22 @@ macro_rules! algorithm_simd {
             unsafe fn check_block(&mut self, input: SimdInput) {
                 // WORKAROUND
                 // necessary because the for loop is not unrolled on ARM64
-                if input.vals.len() == 2 {
-                    self.check_bytes(input.vals[0]);
-                    self.check_bytes(input.vals[1]);
-                    self.incomplete = Self::is_incomplete(input.vals[1]);
-                } else if input.vals.len() == 4 {
-                    self.check_bytes(input.vals[0]);
-                    self.check_bytes(input.vals[1]);
-                    self.check_bytes(input.vals[2]);
-                    self.check_bytes(input.vals[3]);
-                    self.incomplete = Self::is_incomplete(input.vals[3]);
-                } else {
-                    panic!("Unsupported number of chunks");
+                match input.vals.len() {
+                    2 => {
+                        self.check_bytes(input.vals[0]);
+                        self.check_bytes(input.vals[1]);
+                        self.incomplete = Self::is_incomplete(input.vals[1]);
+                    }
+                    4 => {
+                        self.check_bytes(input.vals[0]);
+                        self.check_bytes(input.vals[1]);
+                        self.check_bytes(input.vals[2]);
+                        self.check_bytes(input.vals[3]);
+                        self.incomplete = Self::is_incomplete(input.vals[3]);
+                    }
+                    _ => {
+                        panic!("Unsupported number of chunks");
+                    }
                 }
             }
 

From 1edb3b27786e765844d64aba0e75caabaa23a214 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 26 May 2021 12:02:49 +0200
Subject: [PATCH 03/69] WIP: remainder optimization

---
 src/implementation/aarch64/neon.rs | 12 +++++++
 src/implementation/algorithm.rs    | 55 ++++++++++++++++++++++--------
 src/implementation/helpers.rs      | 31 +++++++++++++++++
 3 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 8fb05057..b932b596 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -100,6 +100,17 @@ impl SimdU8Value {
         Self::from(dst.assume_init())
     }
 
+    #[inline]
+    unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        let mut tmpbuf = [0u8; 16];
+        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
+            ptr,
+            tmpbuf.as_mut_ptr(),
+            len,
+        );
+        Self::load_from(tmpbuf.as_ptr())
+    }
+
     #[inline]
     #[allow(clippy::too_many_arguments)]
     unsafe fn lookup_16(
@@ -233,6 +244,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = false;
+#[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("not_used");
 algorithm_simd!("not_used");
diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 3969e349..f0eab573 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -209,15 +209,36 @@ macro_rules! algorithm_simd {
             #[inline]
             #[allow(unconditional_panic)] // does not panic because len is checked
             #[allow(const_err)] // the same, but for Rust 1.38.0
-            unsafe fn check_remainder(&mut self, input: *const u8, len: usize) {
-                let mut tmpbuf = TempSimdChunk::new();
-                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
-                    input,
-                    tmpbuf.0.as_mut_ptr(),
-                    len,
-                );
-                let simd_input = SimdInput::new(&tmpbuf.0);
-                self.check_utf8(simd_input);
+            unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) {
+                let orig_len = len;
+                let mut len = len;
+                const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
+                while len > SIMD_SIZE {
+                    let simd_val = SimdU8Value::load_from(input);
+                    input = input.add(SIMD_SIZE);
+                    if simd_val.is_ascii() {
+                        if orig_len == len {
+                            // first after last block, check if previous block is incomplete
+                            self.check_incomplete_pending();
+                        }
+                    } else {
+                        self.check_bytes(simd_val);
+                        self.incomplete = Self::is_incomplete(simd_val);
+                    }
+                    len -= SIMD_SIZE;
+                }
+                if len > 0 {
+                    let simd_val = SimdU8Value::load_partial(input, len);
+                    if simd_val.is_ascii() {
+                        if orig_len < SIMD_SIZE {
+                            // first after last block, check if previous block is incomplete
+                            self.check_incomplete_pending();
+                        }
+                    } else {
+                        self.check_bytes(simd_val);
+                        self.incomplete = Self::is_incomplete(simd_val);
+                    }
+                }
             }
         }
 
@@ -241,13 +262,17 @@ macro_rules! algorithm_simd {
             let mut idx: usize = 0;
             let iter_lim = len - (len % SIMD_CHUNK_SIZE);
 
-            while idx < iter_lim {
-                let simd_input = SimdInput::new(input.get_unchecked(idx as usize..));
-                idx += SIMD_CHUNK_SIZE;
-                if !simd_input.is_ascii() {
-                    algorithm.check_block(simd_input);
-                    break;
+            'outer: loop {
+                while idx < iter_lim {
+                    let simd_input = SimdInput::new(input.get_unchecked(idx as usize..));
+                    idx += SIMD_CHUNK_SIZE;
+                    if !simd_input.is_ascii() {
+                        algorithm.check_block(simd_input);
+                        break 'outer;
+                    }
                 }
+                // TODO: check remainder ASCII
+                break;
             }
 
             while idx < iter_lim {
diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs
index a6bd693a..0a8bb36d 100644
--- a/src/implementation/helpers.rs
+++ b/src/implementation/helpers.rs
@@ -37,6 +37,37 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er
     validate_utf8_at_offset(input, offset).unwrap_err()
 }
 
+#[allow(dead_code)]
+#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const
+pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
+    mut src: *const u8,
+    mut dest: *mut u8,
+    mut len: usize,
+) {
+    #[inline]
+    unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) {
+        #[allow(clippy::cast_ptr_alignment)]
+        dest.cast::<u32>()
+            .write_unaligned(src.cast::<u32>().read_unaligned());
+        *src = src.offset(4);
+        *dest = dest.offset(4);
+    }
+    if len >= 8 {
+        memcpy_u32(&mut src, &mut dest);
+        memcpy_u32(&mut src, &mut dest);
+        len -= 8;
+    }
+    if len >= 4 {
+        memcpy_u32(&mut src, &mut dest);
+        len -= 4;
+    }
+    while len > 0 {
+        *dest = *src;
+        src = src.offset(1);
+        dest = dest.offset(1);
+        len -= 1;
+    }
+}
 #[allow(dead_code)]
 #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const
 pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64(

From a7efda182145d8da039a16f372a7f3755e805f85 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 29 May 2021 21:29:32 +0200
Subject: [PATCH 04/69] aarch64 remainder processing: use lane loads

---
 src/implementation/aarch64/neon.rs | 69 +++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index b932b596..a215012c 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -102,13 +102,68 @@ impl SimdU8Value {
 
     #[inline]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
-        let mut tmpbuf = [0u8; 16];
-        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
-            ptr,
-            tmpbuf.as_mut_ptr(),
-            len,
-        );
-        Self::load_from(tmpbuf.as_ptr())
+        let mut res = Self::splat0();
+        if len == 0 {
+        } else if len < 4 {
+            res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0);
+            if len > 1 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(1), res.0, 1);
+            }
+            if len > 2 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2);
+            }
+        } else if len < 8 {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            if len > 4 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4);
+            }
+            if len > 5 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(5), res.0, 5);
+            }
+            if len > 6 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6);
+            }
+        } else if len < 12 {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            if len > 8 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8);
+            }
+            if len > 9 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(9), res.0, 9);
+            }
+            if len > 10 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10);
+            }
+        } else {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                ptr.add(8).cast(),
+                core::mem::transmute(res.0),
+                2,
+            ));
+            if len > 12 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12);
+            }
+            if len > 13 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(13), res.0, 13);
+            }
+            if len > 14 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14);
+            }
+        }
+        res
     }
 
     #[inline]

From b3c763680b736d2bff0a415ad5e6355dc03ba5a0 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 29 May 2021 21:30:02 +0200
Subject: [PATCH 05/69] fix

---
 src/implementation/algorithm.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index f0eab573..e601ab56 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -213,7 +213,7 @@ macro_rules! algorithm_simd {
                 let orig_len = len;
                 let mut len = len;
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
-                while len > SIMD_SIZE {
+                while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);
                     if simd_val.is_ascii() {

From defd4d28f5bf4c47c3205952b49f4ace0fa409db Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 29 May 2021 21:34:26 +0200
Subject: [PATCH 06/69] SimdInput::new_partial() implementation

---
 src/implementation/algorithm.rs | 43 +++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index e601ab56..5b5dc516 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -567,6 +567,49 @@ macro_rules! simd_input_128_bit {
                 }
             }
 
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                if len < 16 {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_partial(ptr, len),
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else if len < 32 {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_partial(ptr.add(16), len - 16),
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else if len < 48 {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_from(ptr.add(16)),
+                            SimdU8Value::load_partial(ptr.add(32), len - 32),
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_from(ptr.add(16)),
+                            SimdU8Value::load_from(ptr.add(32)),
+                            SimdU8Value::load_partial(ptr.add(48), len - 48),
+                        ],
+                    }
+                }
+            }
+
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             unsafe fn is_ascii(&self) -> bool {

From 5df7b381840bf9a0f63da6c57e6e2e31dac3c520 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 29 May 2021 21:35:18 +0200
Subject: [PATCH 07/69] ascii special-casing

---
 src/implementation/algorithm.rs | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 5b5dc516..2a2f5814 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -271,8 +271,13 @@ macro_rules! algorithm_simd {
                         break 'outer;
                     }
                 }
-                // TODO: check remainder ASCII
-                break;
+                if idx < len {
+                    let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx);
+                    if !simd_input.is_ascii() {
+                        break;
+                    }
+                }
+                return Ok(());
             }
 
             while idx < iter_lim {
@@ -286,6 +291,8 @@ macro_rules! algorithm_simd {
 
             if idx < len {
                 algorithm.check_remainder(input.as_ptr().add(idx), len - idx);
+                // let input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx);
+                // algorithm.check_utf8(input);
             }
             algorithm.check_incomplete_pending();
             if algorithm.has_error() {
@@ -339,7 +346,13 @@ macro_rules! algorithm_simd {
                         }
                         idx += SIMD_CHUNK_SIZE;
                     }
-                    break;
+                    if idx < len {
+                        let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx);
+                        if !simd_input.is_ascii() {
+                            break;
+                        }
+                    }
+                    return Ok(());
                 } else {
                     while idx < iter_lim {
                         if PREFETCH {

From 7f0cf3b5e3556ac067508039d976ccf17108df52 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 12:37:28 +0200
Subject: [PATCH 08/69] ascii-optimized remainder checking

---
 src/implementation/algorithm.rs | 36 ++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 2a2f5814..28ada5c9 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -240,6 +240,30 @@ macro_rules! algorithm_simd {
                     }
                 }
             }
+
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(unconditional_panic)] // does not panic because len is checked
+            #[allow(const_err)] // the same, but for Rust 1.38.0
+            unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) {
+                const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
+                while len >= SIMD_SIZE {
+                    let simd_val = SimdU8Value::load_from(input);
+                    input = input.add(SIMD_SIZE);
+                    if !simd_val.is_ascii() {
+                        self.check_bytes(simd_val);
+                        self.incomplete = Self::is_incomplete(simd_val);
+                    }
+                    len -= SIMD_SIZE;
+                }
+                if len > 0 {
+                    let simd_val = SimdU8Value::load_partial(input, len);
+                    if !simd_val.is_ascii() {
+                        self.check_bytes(simd_val);
+                        self.incomplete = Self::is_incomplete(simd_val);
+                    }
+                }
+            }
         }
 
         /// Validation implementation for CPUs supporting the SIMD extension (see module).
@@ -347,12 +371,14 @@ macro_rules! algorithm_simd {
                         idx += SIMD_CHUNK_SIZE;
                     }
                     if idx < len {
-                        let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx);
-                        if !simd_input.is_ascii() {
-                            break;
-                        }
+                        algorithm.check_remainder_ascii(input.as_ptr().add(idx), len - idx);
+                        algorithm.check_incomplete_pending();
                     }
-                    return Ok(());
+                    return if algorithm.has_error() {
+                        Err(idx)
+                    } else {
+                        Ok(())
+                    };
                 } else {
                     while idx < iter_lim {
                         if PREFETCH {

From 2cbf8a9c9d4b11b7bf1cf6819c4859dbf8f2b616 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 12:37:53 +0200
Subject: [PATCH 09/69] test-bed for different partial load implementations

---
 src/implementation/aarch64/neon.rs | 269 ++++++++++++++++++++++++++++-
 1 file changed, 268 insertions(+), 1 deletion(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index a215012c..8e6534ae 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -101,7 +101,23 @@ impl SimdU8Value {
     }
 
     #[inline]
-    unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+    pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        Self::load_partial_direct_16(ptr, len)
+    }
+
+    #[inline]
+    unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self {
+        let mut tmpbuf = [0u8; 16];
+        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
+            ptr,
+            tmpbuf.as_mut_ptr(),
+            len,
+        );
+        Self::load_from(tmpbuf.as_ptr())
+    }
+
+    #[inline]
+    unsafe fn load_partial_direct(ptr: *const u8, len: usize) -> Self {
         let mut res = Self::splat0();
         if len == 0 {
         } else if len < 4 {
@@ -166,6 +182,257 @@ impl SimdU8Value {
         res
     }
 
+    #[inline]
+    unsafe fn load_partial_direct_16(ptr: *const u8, len: usize) -> Self {
+        let mut res = Self::splat0();
+        if len == 0 {
+        } else if len < 4 {
+            if len == 1 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0);
+            } else {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                if len > 2 {
+                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2);
+                }
+            }
+        } else if len < 8 {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            if len == 5 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4);
+            } else if len > 5 {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(4).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+                if len > 6 {
+                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6);
+                }
+            }
+        } else if len < 12 {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            if len == 9 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8);
+            } else if len > 9 {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    4,
+                ));
+                if len > 10 {
+                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10);
+                }
+            }
+        } else {
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                ptr.cast(),
+                core::mem::transmute(res.0),
+                0,
+            ));
+            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                ptr.add(8).cast(),
+                core::mem::transmute(res.0),
+                2,
+            ));
+            if len == 13 {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12);
+            } else if len > 13 {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(12).cast(),
+                    core::mem::transmute(res.0),
+                    6,
+                ));
+                if len > 14 {
+                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14);
+                }
+            }
+        }
+        res
+    }
+
+    #[inline]
+    unsafe fn load_partial_direct_match(ptr: *const u8, len: usize) -> Self {
+        let mut res = Self::splat0();
+        match len {
+            0 => {}
+            1 => {
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0);
+            }
+            2 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+            }
+            3 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2);
+            }
+            4 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+            }
+            5 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4);
+            }
+            6 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(4).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+            }
+            7 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(4).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6);
+            }
+            8 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+            }
+            9 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8);
+            }
+            10 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    4,
+                ));
+            }
+            11 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    4,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10);
+            }
+            12 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+            }
+            13 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12);
+            }
+            14 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(12).cast(),
+                    core::mem::transmute(res.0),
+                    6,
+                ));
+            }
+            15 => {
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
+                    ptr.cast(),
+                    core::mem::transmute(res.0),
+                    0,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
+                    ptr.add(8).cast(),
+                    core::mem::transmute(res.0),
+                    2,
+                ));
+                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
+                    ptr.add(12).cast(),
+                    core::mem::transmute(res.0),
+                    6,
+                ));
+                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14);
+            }
+            _ => {
+                // not allowed
+                debug_assert!(false);
+            }
+        }
+        res
+    }
+
     #[inline]
     #[allow(clippy::too_many_arguments)]
     unsafe fn lookup_16(

From 42f1acde033abab919a9448aed7c00da84d7f7e9 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 13:44:08 +0200
Subject: [PATCH 10/69] aarch64: select partial load implementation

---
 src/implementation/aarch64/neon.rs | 162 -----------------------------
 1 file changed, 162 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 8e6534ae..80724ab8 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -102,168 +102,6 @@ impl SimdU8Value {
 
     #[inline]
     pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
-        Self::load_partial_direct_16(ptr, len)
-    }
-
-    #[inline]
-    unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self {
-        let mut tmpbuf = [0u8; 16];
-        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
-            ptr,
-            tmpbuf.as_mut_ptr(),
-            len,
-        );
-        Self::load_from(tmpbuf.as_ptr())
-    }
-
-    #[inline]
-    unsafe fn load_partial_direct(ptr: *const u8, len: usize) -> Self {
-        let mut res = Self::splat0();
-        if len == 0 {
-        } else if len < 4 {
-            res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0);
-            if len > 1 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(1), res.0, 1);
-            }
-            if len > 2 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2);
-            }
-        } else if len < 8 {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            if len > 4 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4);
-            }
-            if len > 5 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(5), res.0, 5);
-            }
-            if len > 6 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6);
-            }
-        } else if len < 12 {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            if len > 8 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8);
-            }
-            if len > 9 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(9), res.0, 9);
-            }
-            if len > 10 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10);
-            }
-        } else {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
-                ptr.add(8).cast(),
-                core::mem::transmute(res.0),
-                2,
-            ));
-            if len > 12 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12);
-            }
-            if len > 13 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(13), res.0, 13);
-            }
-            if len > 14 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14);
-            }
-        }
-        res
-    }
-
-    #[inline]
-    unsafe fn load_partial_direct_16(ptr: *const u8, len: usize) -> Self {
-        let mut res = Self::splat0();
-        if len == 0 {
-        } else if len < 4 {
-            if len == 1 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0);
-            } else {
-                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
-                    ptr.cast(),
-                    core::mem::transmute(res.0),
-                    0,
-                ));
-                if len > 2 {
-                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2);
-                }
-            }
-        } else if len < 8 {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            if len == 5 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4);
-            } else if len > 5 {
-                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
-                    ptr.add(4).cast(),
-                    core::mem::transmute(res.0),
-                    2,
-                ));
-                if len > 6 {
-                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6);
-                }
-            }
-        } else if len < 12 {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            if len == 9 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8);
-            } else if len > 9 {
-                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
-                    ptr.add(8).cast(),
-                    core::mem::transmute(res.0),
-                    4,
-                ));
-                if len > 10 {
-                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10);
-                }
-            }
-        } else {
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64(
-                ptr.cast(),
-                core::mem::transmute(res.0),
-                0,
-            ));
-            res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32(
-                ptr.add(8).cast(),
-                core::mem::transmute(res.0),
-                2,
-            ));
-            if len == 13 {
-                res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12);
-            } else if len > 13 {
-                res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16(
-                    ptr.add(12).cast(),
-                    core::mem::transmute(res.0),
-                    6,
-                ));
-                if len > 14 {
-                    res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14);
-                }
-            }
-        }
-        res
-    }
-
-    #[inline]
-    unsafe fn load_partial_direct_match(ptr: *const u8, len: usize) -> Self {
         let mut res = Self::splat0();
         match len {
             0 => {}

From 8e2706f516216a9f647b0a0d29f475da2955ff01 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 15:18:49 +0200
Subject: [PATCH 11/69] cleanup

---
 src/implementation/algorithm.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 28ada5c9..e0eb8c63 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -315,8 +315,6 @@ macro_rules! algorithm_simd {
 
             if idx < len {
                 algorithm.check_remainder(input.as_ptr().add(idx), len - idx);
-                // let input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx);
-                // algorithm.check_utf8(input);
             }
             algorithm.check_incomplete_pending();
             if algorithm.has_error() {

From 695c4f8357db3613de2ca8629c8380cf8c492489 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 17:54:44 +0200
Subject: [PATCH 12/69] remove unnecessary pub

---
 src/implementation/aarch64/neon.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 80724ab8..17966445 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -101,7 +101,7 @@ impl SimdU8Value {
     }
 
     #[inline]
-    pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+    unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
         let mut res = Self::splat0();
         match len {
             0 => {}

From fa25b4676ef3dc3ae9b303dc3e10f0b8caecf710 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 17:55:57 +0200
Subject: [PATCH 13/69] basic x86 partial load impl.

---
 src/implementation/algorithm.rs | 18 ++++++++++++++++++
 src/implementation/x86/avx2.rs  | 12 ++++++++++++
 src/implementation/x86/sse42.rs | 12 ++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index e0eb8c63..8cc197cb 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -679,6 +679,24 @@ macro_rules! simd_input_256_bit {
                 }
             }
 
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                if len < 32 {
+                    Self {
+                        vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()],
+                    }
+                } else {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_partial(ptr.add(32), len - 32),
+                        ],
+                    }
+                }
+            }
+
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             unsafe fn is_ascii(&self) -> bool {
diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 8232f571..9bdcb0c0 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -103,6 +103,18 @@ impl SimdU8Value {
         Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>()))
     }
 
+    #[target_feature(enable = "avx2")]
+    #[inline]
+    unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        let mut tmpbuf = [0_u8; 32];
+        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32(
+            ptr,
+            tmpbuf.as_mut_ptr(),
+            len,
+        );
+        Self::load_from(tmpbuf.as_ptr())
+    }
+
     #[target_feature(enable = "avx2")]
     #[inline]
     unsafe fn lookup_16(
diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index f9140d50..496361fe 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -99,6 +99,18 @@ impl SimdU8Value {
         Self::from(_mm_loadu_si128(ptr.cast::<__m128i>()))
     }
 
+    #[target_feature(enable = "sse4.2")]
+    #[inline]
+    unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        let mut tmpbuf = [0_u8; 16];
+        crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
+            ptr,
+            tmpbuf.as_mut_ptr(),
+            len,
+        );
+        Self::load_from(tmpbuf.as_ptr())
+    }
+
     #[target_feature(enable = "sse4.2")]
     #[inline]
     unsafe fn lookup_16(

From 3d718220c22e0564e5fd6dc19f280d2f657bdfbe Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 18:56:12 +0200
Subject: [PATCH 14/69] clippy

---
 src/implementation/algorithm.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 8cc197cb..aa65766b 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -210,9 +210,9 @@ macro_rules! algorithm_simd {
             #[allow(unconditional_panic)] // does not panic because len is checked
             #[allow(const_err)] // the same, but for Rust 1.38.0
             unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) {
+                const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
                 let orig_len = len;
                 let mut len = len;
-                const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
                 while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);

From d4d3ccd97d7dd6191431acd35da07cf93bb42560 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 18:56:27 +0200
Subject: [PATCH 15/69] add missing helper fns

---
 src/implementation/helpers.rs | 68 ++++++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs
index 0a8bb36d..bbef8733 100644
--- a/src/implementation/helpers.rs
+++ b/src/implementation/helpers.rs
@@ -37,6 +37,24 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er
     validate_utf8_at_offset(input, offset).unwrap_err()
 }
 
+#[inline]
+unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) {
+    #[allow(clippy::cast_ptr_alignment)]
+    dest.cast::<u32>()
+        .write_unaligned(src.cast::<u32>().read_unaligned());
+    *src = src.offset(4);
+    *dest = dest.offset(4);
+}
+
+#[inline]
+unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) {
+    #[allow(clippy::cast_ptr_alignment)]
+    dest.cast::<u64>()
+        .write_unaligned(src.cast::<u64>().read_unaligned());
+    *src = src.offset(8);
+    *dest = dest.offset(8);
+}
+
 #[allow(dead_code)]
 #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const
 pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
@@ -44,14 +62,6 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
     mut dest: *mut u8,
     mut len: usize,
 ) {
-    #[inline]
-    unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) {
-        #[allow(clippy::cast_ptr_alignment)]
-        dest.cast::<u32>()
-            .write_unaligned(src.cast::<u32>().read_unaligned());
-        *src = src.offset(4);
-        *dest = dest.offset(4);
-    }
     if len >= 8 {
         memcpy_u32(&mut src, &mut dest);
         memcpy_u32(&mut src, &mut dest);
@@ -68,22 +78,42 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
         len -= 1;
     }
 }
+
 #[allow(dead_code)]
 #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const
-pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
+pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_32(
     mut src: *const u8,
     mut dest: *mut u8,
     mut len: usize,
 ) {
-    // This gets properly auto-vectorized on AVX 2 and SSE 4.2
-    #[inline]
-    unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) {
-        #[allow(clippy::cast_ptr_alignment)]
-        dest.cast::<u64>()
-            .write_unaligned(src.cast::<u64>().read_unaligned());
-        *src = src.offset(8);
-        *dest = dest.offset(8);
+    if len >= 16 {
+        memcpy_u64(&mut src, &mut dest);
+        memcpy_u64(&mut src, &mut dest);
+        len -= 16;
+    }
+    if len >= 8 {
+        memcpy_u64(&mut src, &mut dest);
+        len -= 8;
+    }
+    if len >= 4 {
+        memcpy_u32(&mut src, &mut dest);
+        len -= 4;
     }
+    while len > 0 {
+        *dest = *src;
+        src = src.offset(1);
+        dest = dest.offset(1);
+        len -= 1;
+    }
+}
+
+#[allow(dead_code)]
+#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const
+pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
+    mut src: *const u8,
+    mut dest: *mut u8,
+    mut len: usize,
+) {
     if len >= 32 {
         memcpy_u64(&mut src, &mut dest);
         memcpy_u64(&mut src, &mut dest);
@@ -100,6 +130,10 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
         memcpy_u64(&mut src, &mut dest);
         len -= 8;
     }
+    if len >= 4 {
+        memcpy_u32(&mut src, &mut dest);
+        len -= 4;
+    }
     while len > 0 {
         *dest = *src;
         src = src.offset(1);

From d08c88b3870dd438866b7934bbf6160e619f1a4b Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 30 May 2021 19:12:08 +0200
Subject: [PATCH 16/69] clippy

---
 src/implementation/x86/avx2.rs  | 1 +
 src/implementation/x86/sse42.rs | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 9bdcb0c0..e402e145 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -268,6 +268,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = true;
+#[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
 simd_input_256_bit!("avx2");
 algorithm_simd!("avx2");
diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index 496361fe..e0d08675 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -252,6 +252,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = false;
+#[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("sse4.2");
 algorithm_simd!("sse4.2");

From 0c391490e7f6b99cacc3e6494f05acc1dc4d3f83 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 13:59:37 +0200
Subject: [PATCH 17/69] add unit test for AVX 2 load_partial()

---
 src/implementation/x86/avx2.rs | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index e402e145..bc93f3a4 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -267,6 +267,31 @@ unsafe fn simd_prefetch(ptr: *const u8) {
     _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
 }
 
+mod test {
+    #[allow(unused_imports)]
+    use super::*;
+
+    #[test]
+    pub fn masked_load() {
+        let arr = [
+            1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31, 32,
+        ];
+        unsafe {
+            for len in 0..32 {
+                let loaded_arr: [u8; 32] =
+                    core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len));
+                for i in 0..len {
+                    assert_eq!(arr[i], loaded_arr[i]);
+                }
+                for x in &loaded_arr[len..arr.len()] {
+                    assert_eq!(*x, 0);
+                }
+            }
+        }
+    }
+}
+
 const PREFETCH: bool = true;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;

From 198e11ceb646ca926681d17926f4c61be5023bfa Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 14:01:11 +0200
Subject: [PATCH 18/69] Implement AVX 2 simd value Display and LowerHex traits
 for debugging

---
 src/implementation/x86/avx2.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index bc93f3a4..3e536bfd 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -248,6 +248,24 @@ impl From<__m256i> for SimdU8Value {
     }
 }
 
+impl core::fmt::Display for SimdU8Value {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        unsafe {
+            let arr: [u8; 32] = core::mem::transmute(self.0);
+            write!(f, "{:?}", arr)
+        }
+    }
+}
+
+impl core::fmt::LowerHex for SimdU8Value {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        unsafe {
+            let arr: [u8; 32] = core::mem::transmute(self.0);
+            write!(f, "{:x?}", arr)
+        }
+    }
+}
+
 impl Utf8CheckAlgorithm<SimdU8Value> {
     #[target_feature(enable = "avx2")]
     #[inline]

From 57b0790c778855ea3cb695d47090d3cc61135863 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 14:01:47 +0200
Subject: [PATCH 19/69] add load_partial_direct() method

---
 src/implementation/x86/avx2.rs | 64 +++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 8 deletions(-)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 3e536bfd..bc107f09 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -4,17 +4,21 @@
 
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
-    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
-    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
-    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
+    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256,
+    _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32,
+    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32,
+    _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256,
+    _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8,
+    _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
 };
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
-    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
-    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
-    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
+    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256,
+    _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32,
+    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32,
+    _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256,
+    _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8,
+    _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
 };
 
 use crate::implementation::helpers::Utf8CheckAlgorithm;
@@ -103,9 +107,53 @@ impl SimdU8Value {
         Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>()))
     }
 
+    #[target_feature(enable = "avx2")]
+    #[inline]
+    unsafe fn vecmask_from_bitmask(mask: u8) -> Self {
+        let vshift_count = _mm256_set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
+        let bcast = _mm256_set1_epi32(i32::from(mask));
+        let shifted = _mm256_sllv_epi32(bcast, vshift_count); // high bit of each element = corresponding bit of the mask
+        Self::from(shifted)
+    }
+
     #[target_feature(enable = "avx2")]
     #[inline]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        Self::load_partial_direct(ptr, len)
+    }
+
+    #[target_feature(enable = "avx2")]
+    #[inline]
+    unsafe fn load_partial_direct(mut ptr: *const u8, len: usize) -> Self {
+        if len == 0 {
+            return Self::splat0();
+        }
+        let sel_mask = 1 << (len / 4);
+        let mask = (sel_mask - 1) as u8;
+        let mut res = _mm256_maskload_epi32(ptr.cast(), Self::vecmask_from_bitmask(mask).0);
+        let remainder = len % 4;
+        if remainder != 0 {
+            ptr = ptr.add((len - len % 4) as usize);
+            let remaining_bytes = match remainder {
+                1 => u32::from(*ptr),
+                2 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8,
+                3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16,
+                _ => 0,
+            };
+            #[allow(clippy::cast_possible_wrap)]
+            let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32);
+            res = _mm256_castps_si256(_mm256_blendv_ps(
+                _mm256_castsi256_ps(res),
+                _mm256_castsi256_ps(remaining_vec),
+                _mm256_castsi256_ps(Self::vecmask_from_bitmask(sel_mask).0),
+            ));
+        }
+        Self::from(res)
+    }
+
+    #[target_feature(enable = "avx2")]
+    #[inline]
+    unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self {
         let mut tmpbuf = [0_u8; 32];
         crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32(
             ptr,

From 38b58de365e5be84885635bb2bb2e06ece7bc04e Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 14:10:43 +0200
Subject: [PATCH 20/69] only run avx2 masked load test if AVX 2 is available

---
 src/implementation/x86/avx2.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index bc107f09..93ed101c 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -334,11 +334,18 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 mod test {
+    #[cfg(not(features = "std"))]
+    extern crate std;
+
     #[allow(unused_imports)]
     use super::*;
 
     #[test]
     pub fn masked_load() {
+        if std::is_x86_feature_detected!("avx2") {
+            return;
+        }
+
         let arr = [
             1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
             24, 25, 26, 27, 28, 29, 30, 31, 32,

From b6b23882b2f5be93cdc9098e59901a8db3329d01 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 14:21:31 +0200
Subject: [PATCH 21/69] fix avx2 detection

---
 src/implementation/x86/avx2.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 93ed101c..e0efeaa1 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -342,7 +342,7 @@ mod test {
 
     #[test]
     pub fn masked_load() {
-        if std::is_x86_feature_detected!("avx2") {
+        if !std::is_x86_feature_detected!("avx2") {
             return;
         }
 

From 44f8fd101b3d3e01ff794110d8da2da347a19d5e Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 18:17:55 +0200
Subject: [PATCH 22/69] prevent remainder loop unrolling in sse 4.2 which
 caused the methods not to be inlined.

---
 src/implementation/aarch64/neon.rs |  1 +
 src/implementation/algorithm.rs    | 11 +++++++++++
 src/implementation/x86/avx2.rs     |  1 +
 src/implementation/x86/sse42.rs    |  1 +
 4 files changed, 14 insertions(+)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 17966445..325386a0 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -404,6 +404,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = false;
+const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("not_used");
diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index aa65766b..7e91d8c7 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -213,6 +213,12 @@ macro_rules! algorithm_simd {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
                 let orig_len = len;
                 let mut len = len;
+
+                // necessary, otherwise the compiler needlessly unrolls the loop,
+                // the function becomes to big and is no longer inlined for SSE 4.2
+                if PREVENT_REMAINDER_LOOP_UNROLLING {
+                    assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
+                }
                 while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);
@@ -247,6 +253,11 @@ macro_rules! algorithm_simd {
             #[allow(const_err)] // the same, but for Rust 1.38.0
             unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
+
+                // prevent loop unrolling which can cause the function to be too big for inlining
+                if PREVENT_REMAINDER_LOOP_UNROLLING {
+                    assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
+                }
                 while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);
diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 93ed101c..c54db356 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -366,6 +366,7 @@ mod test {
 }
 
 const PREFETCH: bool = true;
+const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
 simd_input_256_bit!("avx2");
diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index e0d08675..99f11597 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -252,6 +252,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = false;
+const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("sse4.2");

From cb7ed197bd91db6d40a50dbb1a5aa5ef079a2cad Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 1 Jun 2021 18:18:11 +0200
Subject: [PATCH 23/69] only implement Debug/LowerHex for tests

---
 src/implementation/x86/avx2.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index c54db356..f158069e 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -296,6 +296,7 @@ impl From<__m256i> for SimdU8Value {
     }
 }
 
+#[cfg(test)]
 impl core::fmt::Display for SimdU8Value {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         unsafe {
@@ -305,6 +306,7 @@ impl core::fmt::Display for SimdU8Value {
     }
 }
 
+#[cfg(test)]
 impl core::fmt::LowerHex for SimdU8Value {
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         unsafe {
@@ -333,6 +335,7 @@ unsafe fn simd_prefetch(ptr: *const u8) {
     _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
 }
 
+#[cfg(test)]
 mod test {
     #[cfg(not(features = "std"))]
     extern crate std;

From 10df3366190d4a927cfd9c13e7e5a9ffba79549c Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 07:27:48 +0200
Subject: [PATCH 24/69] expand benchmarks

---
 bench/src/lib.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/bench/src/lib.rs b/bench/src/lib.rs
index 218055b3..9aade97b 100644
--- a/bench/src/lib.rs
+++ b/bench/src/lib.rs
@@ -2,6 +2,7 @@ use criterion::{measurement::Measurement, BenchmarkGroup, BenchmarkId, Criterion
 use simdutf8::basic::from_utf8 as basic_from_utf8;
 use simdutf8::compat::from_utf8 as compat_from_utf8;
 
+use std::collections::HashSet;
 use std::str::from_utf8 as std_from_utf8;
 
 #[cfg(feature = "simdjson")]
@@ -113,7 +114,24 @@ fn get_valid_slice_of_len_or_more_aligned(
 
 fn bench<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], bench_fn: BenchFn) {
     let mut group = c.benchmark_group(name);
-    for i in [1, 8, 64, 512, 4096, 65536, 131072].iter() {
+    let mut sizes = HashSet::new();
+    for i in 1..129 {
+        let alignment = Alignment {
+            boundary: 64,
+            offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case
+        };
+        let (vec, offset) = get_valid_slice_of_len_or_more_aligned(bytes, i, alignment);
+        let slice = &vec[offset..];
+        assert_eq!(
+            (slice.as_ptr() as usize) % alignment.boundary,
+            alignment.offset
+        );
+        if !sizes.contains(&slice.len()) {
+            bench_input(&mut group, slice, true, true, bench_fn);
+            sizes.insert(slice.len());
+        }
+    }
+    for i in [512, 4096, 65536, 131072].iter() {
         let alignment = Alignment {
             boundary: 64,
             offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case

From bab2fbffde7f2c4bc743b4a8be3abbdf19df307e Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 07:28:27 +0200
Subject: [PATCH 25/69] x86: make delegation to std for small inputs in-code
 configurable

---
 src/implementation/x86/mod.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index 19954956..36108f09 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -9,6 +9,8 @@ use super::helpers::SIMD_CHUNK_SIZE;
 
 // validate_utf8_basic() std: implementation auto-selection
 
+const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = false;
+
 #[cfg(all(feature = "std", not(target_feature = "avx2")))]
 #[inline]
 pub(crate) unsafe fn validate_utf8_basic(
@@ -27,7 +29,7 @@ pub(crate) unsafe fn validate_utf8_basic(
         (fun)(input)
     }
 
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -53,7 +55,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn {
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -76,7 +78,7 @@ unsafe fn validate_utf8_basic_avx2(
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -123,7 +125,7 @@ pub(crate) unsafe fn validate_utf8_compat(
         (fun)(input)
     }
 
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_compat_fallback(input);
     }
 
@@ -149,7 +151,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_compat_fallback(input);
     }
 
@@ -172,7 +174,7 @@ unsafe fn validate_utf8_compat_avx2(
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
         return super::validate_utf8_compat_fallback(input);
     }
 

From 0bb570dc8439e0b571b551ec4283c843e4698096 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 11:20:07 +0200
Subject: [PATCH 26/69] comment

---
 src/implementation/x86/avx2.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index a4c928d7..88e769e6 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -140,6 +140,7 @@ impl SimdU8Value {
                 3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16,
                 _ => 0,
             };
+            // blend vec with 4-byte chunks and last incomplete chunk together
             #[allow(clippy::cast_possible_wrap)]
             let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32);
             res = _mm256_castps_si256(_mm256_blendv_ps(

From cbae8480df7f63eb6e523017a9b0a69bf5567fd1 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 11:20:29 +0200
Subject: [PATCH 27/69] x86: delegate inputs < 9 bytes to std

---
 src/implementation/x86/mod.rs | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index 36108f09..dfa41ddd 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -4,12 +4,10 @@ pub(crate) mod avx2;
 #[allow(dead_code)]
 pub(crate) mod sse42;
 
-#[allow(unused_imports)]
-use super::helpers::SIMD_CHUNK_SIZE;
-
 // validate_utf8_basic() std: implementation auto-selection
 
-const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = false;
+const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true;
+const SMALL_STRING_LIMIT: usize = 9;
 
 #[cfg(all(feature = "std", not(target_feature = "avx2")))]
 #[inline]
@@ -29,7 +27,7 @@ pub(crate) unsafe fn validate_utf8_basic(
         (fun)(input)
     }
 
-    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -55,7 +53,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn {
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -78,7 +76,7 @@ unsafe fn validate_utf8_basic_avx2(
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -125,7 +123,7 @@ pub(crate) unsafe fn validate_utf8_compat(
         (fun)(input)
     }
 
-    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_compat_fallback(input);
     }
 
@@ -151,7 +149,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_compat_fallback(input);
     }
 
@@ -174,7 +172,7 @@ unsafe fn validate_utf8_compat_avx2(
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE {
+    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_compat_fallback(input);
     }
 

From 522949283565c0a9537b3bb044a15219e036dd3d Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 14:50:53 +0200
Subject: [PATCH 28/69] Dsiplay for SimdU8Value

---
 src/implementation/helpers.rs  | 34 ++++++++++++++++++++++++++++++++++
 src/implementation/x86/avx2.rs | 20 --------------------
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs
index bbef8733..9aedb81a 100644
--- a/src/implementation/helpers.rs
+++ b/src/implementation/helpers.rs
@@ -180,3 +180,37 @@ impl TempSimdChunkA32 {
 pub(crate) struct SimdU8Value<T>(pub(crate) T)
 where
     T: Copy;
+
+#[cfg(test)]
+impl<T: Copy> core::fmt::Display for SimdU8Value<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        unsafe {
+            if core::mem::size_of::<T>() == 16 {
+                let arr: [u8; 16] = core::mem::transmute_copy(&self.0);
+                write!(f, "{:?}", arr)
+            } else if core::mem::size_of::<T>() == 32 {
+                let arr: [u8; 32] = core::mem::transmute_copy(&self.0);
+                write!(f, "{:?}", arr)
+            } else {
+                Err(core::fmt::Error)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T: Copy> core::fmt::LowerHex for SimdU8Value<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        unsafe {
+            if core::mem::size_of::<T>() == 16 {
+                let arr: [u8; 16] = core::mem::transmute_copy(&self.0);
+                write!(f, "{:x?}", arr)
+            } else if core::mem::size_of::<T>() == 32 {
+                let arr: [u8; 32] = core::mem::transmute_copy(&self.0);
+                write!(f, "{:x?}", arr)
+            } else {
+                Err(core::fmt::Error)
+            }
+        }
+    }
+}
diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index 88e769e6..b42e230c 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -297,26 +297,6 @@ impl From<__m256i> for SimdU8Value {
     }
 }
 
-#[cfg(test)]
-impl core::fmt::Display for SimdU8Value {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        unsafe {
-            let arr: [u8; 32] = core::mem::transmute(self.0);
-            write!(f, "{:?}", arr)
-        }
-    }
-}
-
-#[cfg(test)]
-impl core::fmt::LowerHex for SimdU8Value {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        unsafe {
-            let arr: [u8; 32] = core::mem::transmute(self.0);
-            write!(f, "{:x?}", arr)
-        }
-    }
-}
-
 impl Utf8CheckAlgorithm<SimdU8Value> {
     #[target_feature(enable = "avx2")]
     #[inline]

From 2952db3a4c11ef2325f6b1a5191378d5c6e93f6c Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 2 Jun 2021 14:51:21 +0200
Subject: [PATCH 29/69] SSE 4.2 load_partial()

---
 src/implementation/x86/sse42.rs | 187 +++++++++++++++++++++++++++++++-
 1 file changed, 181 insertions(+), 6 deletions(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index 99f11597..e60deb3d 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -4,15 +4,17 @@
 
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8,
-    _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
-    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
+    _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128,
+    _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128,
+    _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8,
-    _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
-    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
+    _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128,
+    _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128,
+    _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 
 use crate::implementation::helpers::Utf8CheckAlgorithm;
@@ -101,7 +103,149 @@ impl SimdU8Value {
 
     #[target_feature(enable = "sse4.2")]
     #[inline]
+    #[allow(clippy::too_many_lines)]
+    #[allow(clippy::cast_ptr_alignment)]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        Self::from(match len {
+            1 => _mm_setr_epi8(
+                ptr.cast::<i8>().read_unaligned(),
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            ),
+            2 => _mm_setr_epi16(ptr.cast::<i16>().read_unaligned(), 0, 0, 0, 0, 0, 0, 0),
+            3 => _mm_setr_epi8(
+                ptr.cast::<i8>().read_unaligned(),
+                ptr.add(1).cast::<i8>().read_unaligned(),
+                ptr.add(2).cast::<i8>().read_unaligned(),
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            ),
+            4 => _mm_setr_epi32(ptr.cast::<i32>().read_unaligned(), 0, 0, 0), // assembly???
+            5 => {
+                let val = _mm_setr_epi32(ptr.cast::<i32>().read_unaligned(), 0, 0, 0);
+                _mm_insert_epi8(val, i32::from(ptr.add(4).cast::<i8>().read_unaligned()), 4)
+            }
+            6 => _mm_setr_epi16(
+                ptr.cast::<i16>().read_unaligned(),
+                ptr.add(2).cast::<i16>().read_unaligned(),
+                ptr.add(4).cast::<i16>().read_unaligned(),
+                0,
+                0,
+                0,
+                0,
+                0,
+            ),
+            7 => {
+                let val = _mm_setr_epi16(
+                    ptr.cast::<i16>().read_unaligned(),
+                    ptr.add(2).cast::<i16>().read_unaligned(),
+                    ptr.add(4).cast::<i16>().read_unaligned(),
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                );
+                _mm_insert_epi8(val, i32::from(ptr.add(6).cast::<i8>().read_unaligned()), 6)
+            }
+            8 => _mm_bsrli_si128(_mm_loadu_si64(ptr), 8),
+            9 => {
+                let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                _mm_insert_epi8(val, i32::from(ptr.add(8).cast::<i8>().read_unaligned()), 8)
+            }
+            10 => {
+                let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4)
+            }
+            11 => {
+                let mut val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                val =
+                    _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4);
+                _mm_insert_epi8(
+                    val,
+                    i32::from(ptr.add(10).cast::<i8>().read_unaligned()),
+                    10,
+                )
+            }
+            12 => _mm_setr_epi32(
+                ptr.cast::<i32>().read_unaligned(),
+                ptr.add(4).cast::<i32>().read_unaligned(),
+                ptr.add(8).cast::<i32>().read_unaligned(),
+                0,
+            ),
+            13 => {
+                let val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    ptr.add(8).cast::<i32>().read_unaligned(),
+                    0,
+                );
+                _mm_insert_epi8(
+                    val,
+                    i32::from(ptr.add(12).cast::<i8>().read_unaligned()),
+                    12,
+                )
+            }
+            14 => {
+                let val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    ptr.add(8).cast::<i32>().read_unaligned(),
+                    0,
+                );
+                _mm_insert_epi16(
+                    val,
+                    i32::from(ptr.add(12).cast::<i16>().read_unaligned()),
+                    6,
+                )
+            }
+            15 => {
+                let mut val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    ptr.add(8).cast::<i32>().read_unaligned(),
+                    0,
+                );
+                val = _mm_insert_epi16(
+                    val,
+                    i32::from(ptr.add(12).cast::<i16>().read_unaligned()),
+                    6,
+                );
+                _mm_insert_epi8(
+                    val,
+                    i32::from(ptr.add(14).cast::<i8>().read_unaligned()),
+                    14,
+                )
+            }
+            _ => Self::splat0().0, // _ => res = Self::load_partial_copy(ptr, len),
+        })
+    }
+
+    unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self {
         let mut tmpbuf = [0_u8; 16];
         crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16(
             ptr,
@@ -251,6 +395,37 @@ unsafe fn simd_prefetch(ptr: *const u8) {
     _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
 }
 
+#[cfg(test)]
+mod test {
+    #[cfg(not(features = "std"))]
+    extern crate std;
+
+    #[allow(unused_imports)]
+    use super::*;
+
+    #[test]
+    pub fn masked_load() {
+        if !std::is_x86_feature_detected!("sse4.2") {
+            return;
+        }
+
+        let arr = [1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        unsafe {
+            for len in 0..16 {
+                let loaded_arr: [u8; 16] =
+                    core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len));
+                println!("{:?}", loaded_arr);
+                for i in 0..len {
+                    assert_eq!(arr[i], loaded_arr[i]);
+                }
+                for x in &loaded_arr[len..arr.len()] {
+                    assert_eq!(*x, 0);
+                }
+            }
+        }
+    }
+}
+
 const PREFETCH: bool = false;
 const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true;
 #[allow(unused_imports)]

From 2333e0ec19eb78f5ef6c0a38795c7a5c1dbe5d8b Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 3 Jun 2021 09:40:09 +0200
Subject: [PATCH 30/69] cleanup

---
 src/implementation/x86/sse42.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index e60deb3d..1ba07b59 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -241,7 +241,7 @@ impl SimdU8Value {
                     14,
                 )
             }
-            _ => Self::splat0().0, // _ => res = Self::load_partial_copy(ptr, len),
+            _ => Self::splat0().0,
         })
     }
 

From 54e774ebb30281885c1b60fd2c2dce168f4745a5 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 3 Jun 2021 09:40:34 +0200
Subject: [PATCH 31/69] testbed for partial load

---
 src/implementation/algorithm.rs | 74 +++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 7e91d8c7..75b73d98 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -619,6 +619,13 @@ macro_rules! simd_input_128_bit {
             #[inline]
             #[allow(clippy::cast_ptr_alignment)]
             unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                Self::new_partial_ordered(ptr, len)
+            }
+
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self {
                 if len < 16 {
                     Self {
                         vals: [
@@ -658,6 +665,50 @@ macro_rules! simd_input_128_bit {
                 }
             }
 
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self {
+                let partial = SimdU8Value::load_partial(ptr.add(len / 16 * 16), len % 16);
+                if len < 16 {
+                    Self {
+                        vals: [
+                            partial,
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else if len < 32 {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            partial,
+                            SimdU8Value::splat0(),
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else if len < 48 {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_from(ptr.add(16)),
+                            partial,
+                            SimdU8Value::splat0(),
+                        ],
+                    }
+                } else {
+                    Self {
+                        vals: [
+                            SimdU8Value::load_from(ptr),
+                            SimdU8Value::load_from(ptr.add(16)),
+                            SimdU8Value::load_from(ptr.add(32)),
+                            partial,
+                        ],
+                    }
+                }
+            }
+
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             unsafe fn is_ascii(&self) -> bool {
@@ -694,6 +745,13 @@ macro_rules! simd_input_256_bit {
             #[inline]
             #[allow(clippy::cast_ptr_alignment)]
             unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                Self::new_partial_ordered(ptr, len)
+            }
+
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self {
                 if len < 32 {
                     Self {
                         vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()],
@@ -708,6 +766,22 @@ macro_rules! simd_input_256_bit {
                 }
             }
 
+            #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
+            #[inline]
+            #[allow(clippy::cast_ptr_alignment)]
+            unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self {
+                let partial = SimdU8Value::load_partial(ptr.add(len / 32 * 32), len % 32);
+                if len < 32 {
+                    Self {
+                        vals: [partial, SimdU8Value::splat0()],
+                    }
+                } else {
+                    Self {
+                        vals: [SimdU8Value::load_from(ptr), partial],
+                    }
+                }
+            }
+
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             unsafe fn is_ascii(&self) -> bool {

From 1969057d496d78a067c053b124264a96e20b8d6c Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 3 Jun 2021 09:46:48 +0200
Subject: [PATCH 32/69] AVX2 in-code config var

---
 src/implementation/x86/mod.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index dfa41ddd..b960f96d 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -6,6 +6,7 @@ pub(crate) mod sse42;
 
 // validate_utf8_basic() std: implementation auto-selection
 
+const ENABLE_AVX2: bool = true;
 const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true;
 const SMALL_STRING_LIMIT: usize = 9;
 
@@ -38,7 +39,7 @@ pub(crate) unsafe fn validate_utf8_basic(
 #[cfg(all(feature = "std", not(target_feature = "avx2")))]
 #[inline]
 fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn {
-    if std::is_x86_feature_detected!("avx2") {
+    if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") {
         avx2::validate_utf8_basic
     } else if std::is_x86_feature_detected!("sse4.2") {
         sse42::validate_utf8_basic
@@ -134,7 +135,7 @@ pub(crate) unsafe fn validate_utf8_compat(
 #[cfg(all(feature = "std", not(target_feature = "avx2")))]
 #[inline]
 fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn {
-    if std::is_x86_feature_detected!("avx2") {
+    if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") {
         avx2::validate_utf8_compat
     } else if std::is_x86_feature_detected!("sse4.2") {
         sse42::validate_utf8_compat

From 7e346a8831183dd28d905cf14a8f9ea29e4defc8 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 3 Jun 2021 12:51:50 +0200
Subject: [PATCH 33/69] Faster than std impl on Apple Silicon

---
 src/implementation/aarch64/mod.rs | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/implementation/aarch64/mod.rs b/src/implementation/aarch64/mod.rs
index 6b1bfb2f..a483ecbc 100644
--- a/src/implementation/aarch64/mod.rs
+++ b/src/implementation/aarch64/mod.rs
@@ -5,10 +5,6 @@ pub(crate) mod neon;
 #[inline]
 #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))]
 pub(crate) unsafe fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> {
-    if input.len() < super::helpers::SIMD_CHUNK_SIZE {
-        return super::validate_utf8_basic_fallback(input);
-    }
-
     validate_utf8_basic_neon(input)
 }
 
@@ -24,10 +20,6 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic;
 #[inline]
 #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))]
 pub(crate) unsafe fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> {
-    if input.len() < super::helpers::SIMD_CHUNK_SIZE {
-        return super::validate_utf8_compat_fallback(input);
-    }
-
     validate_utf8_compat_neon(input)
 }
 

From fcbb14c13c53a8b5c0882b0a2506414dceee431d Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 3 Jun 2021 12:57:29 +0200
Subject: [PATCH 34/69] comment wording

---
 src/implementation/algorithm.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 75b73d98..79ebb393 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -214,7 +214,7 @@ macro_rules! algorithm_simd {
                 let orig_len = len;
                 let mut len = len;
 
-                // necessary, otherwise the compiler needlessly unrolls the loop,
+                // necessary, otherwise the compiler excessively unrolls the loop,
                 // the function becomes to big and is no longer inlined for SSE 4.2
                 if PREVENT_REMAINDER_LOOP_UNROLLING {
                     assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
@@ -254,7 +254,7 @@ macro_rules! algorithm_simd {
             unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
 
-                // prevent loop unrolling which can cause the function to be too big for inlining
+                // prevent excessive loop unrolling which can cause the function to be too big for inlining
                 if PREVENT_REMAINDER_LOOP_UNROLLING {
                     assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
                 }

From 74ecc4e3e685f563ede476e374c57f7ed241c372 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 17 Jun 2021 12:11:37 +0200
Subject: [PATCH 35/69] Update algorithm.rs

Help the compiler to assert that partial_len < 16
---
 src/implementation/algorithm.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 75b73d98..b1ddbde4 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -626,10 +626,11 @@ macro_rules! simd_input_128_bit {
             #[inline]
             #[allow(clippy::cast_ptr_alignment)]
             unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self {
+                let partial_len = len % 16;
                 if len < 16 {
                     Self {
                         vals: [
-                            SimdU8Value::load_partial(ptr, len),
+                            SimdU8Value::load_partial(ptr, partial_len),
                             SimdU8Value::splat0(),
                             SimdU8Value::splat0(),
                             SimdU8Value::splat0(),
@@ -639,7 +640,7 @@ macro_rules! simd_input_128_bit {
                     Self {
                         vals: [
                             SimdU8Value::load_from(ptr),
-                            SimdU8Value::load_partial(ptr.add(16), len - 16),
+                            SimdU8Value::load_partial(ptr.add(16), partial_len),
                             SimdU8Value::splat0(),
                             SimdU8Value::splat0(),
                         ],
@@ -649,7 +650,7 @@ macro_rules! simd_input_128_bit {
                         vals: [
                             SimdU8Value::load_from(ptr),
                             SimdU8Value::load_from(ptr.add(16)),
-                            SimdU8Value::load_partial(ptr.add(32), len - 32),
+                            SimdU8Value::load_partial(ptr.add(32), partial_len),
                             SimdU8Value::splat0(),
                         ],
                     }
@@ -659,7 +660,7 @@ macro_rules! simd_input_128_bit {
                             SimdU8Value::load_from(ptr),
                             SimdU8Value::load_from(ptr.add(16)),
                             SimdU8Value::load_from(ptr.add(32)),
-                            SimdU8Value::load_partial(ptr.add(48), len - 48),
+                            SimdU8Value::load_partial(ptr.add(48), partial_len),
                         ],
                     }
                 }

From 7df6fe7f9442d3e49873ab11ef381a2d3191a811 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 17 Jun 2021 13:31:29 +0200
Subject: [PATCH 36/69] Update mod.rs

fix x86 build
---
 src/implementation/x86/mod.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index b960f96d..c175e542 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -54,7 +54,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn {
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -77,7 +77,7 @@ unsafe fn validate_utf8_basic_avx2(
 pub(crate) unsafe fn validate_utf8_basic(
     input: &[u8],
 ) -> core::result::Result<(), crate::basic::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_basic_fallback(input);
     }
 
@@ -150,7 +150,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_compat_fallback(input);
     }
 
@@ -173,7 +173,7 @@ unsafe fn validate_utf8_compat_avx2(
 pub(crate) unsafe fn validate_utf8_compat(
     input: &[u8],
 ) -> core::result::Result<(), crate::compat::Utf8Error> {
-    if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT {
+    if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT {
         return super::validate_utf8_compat_fallback(input);
     }
 

From 1458d7c999a3e269074acf82789ca1cc35e630ed Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 17 Jun 2021 15:45:07 +0200
Subject: [PATCH 37/69] Update mod.rs

clippy
---
 src/implementation/x86/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index c175e542..65734d30 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -6,7 +6,9 @@ pub(crate) mod sse42;
 
 // validate_utf8_basic() std: implementation auto-selection
 
+#[allow(dead_code)]
 const ENABLE_AVX2: bool = true;
+
 const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true;
 const SMALL_STRING_LIMIT: usize = 9;
 

From 439dc90b3ee702e015215aa41c3a4cf108f3d20d Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 17 Jun 2021 15:59:58 +0200
Subject: [PATCH 38/69] Update mod.rs

clippy
---
 src/implementation/x86/mod.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs
index 65734d30..09fb53b7 100644
--- a/src/implementation/x86/mod.rs
+++ b/src/implementation/x86/mod.rs
@@ -9,7 +9,10 @@ pub(crate) mod sse42;
 #[allow(dead_code)]
 const ENABLE_AVX2: bool = true;
 
+#[allow(dead_code)]
 const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true;
+
+#[allow(dead_code)]
 const SMALL_STRING_LIMIT: usize = 9;
 
 #[cfg(all(feature = "std", not(target_feature = "avx2")))]

From d650782270af13aceb054ec4218b4b96e3d429bd Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 06:56:16 +0200
Subject: [PATCH 39/69] Rust impl for _mm_loadu_si64 intrinsic was wrong, see
 https://github.com/rust-lang/stdarch/issues/1166 has landed in stable

---
 src/implementation/x86/sse42.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index 1ba07b59..34774ba2 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -23,6 +23,8 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 
 type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m128i>;
 
+// _mm_loadu_si64
+
 impl SimdU8Value {
     #[target_feature(enable = "sse4.2")]
     #[inline]
@@ -172,17 +174,17 @@ impl SimdU8Value {
                 );
                 _mm_insert_epi8(val, i32::from(ptr.add(6).cast::<i8>().read_unaligned()), 6)
             }
-            8 => _mm_bsrli_si128(_mm_loadu_si64(ptr), 8),
+            8 => _mm_loadu_si64(ptr),
             9 => {
-                let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                let val = _mm_loadu_si64(ptr);
                 _mm_insert_epi8(val, i32::from(ptr.add(8).cast::<i8>().read_unaligned()), 8)
             }
             10 => {
-                let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                let val = _mm_loadu_si64(ptr);
                 _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4)
             }
             11 => {
-                let mut val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8);
+                let mut val = _mm_loadu_si64(ptr);
                 val =
                     _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4);
                 _mm_insert_epi8(

From 25b4272efa8663bd91a8caf0bb40b66d61707d0c Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 07:02:10 +0200
Subject: [PATCH 40/69] Rust 1.38.0 compat: _mm_loadu_si64() not available ->
 replace. Asm is the same.

---
 src/implementation/x86/sse42.rs | 40 ++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index 34774ba2..c76c67a3 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -5,16 +5,16 @@
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
     __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
-    _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128,
-    _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128,
-    _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8,
+    _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
+    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
     __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
-    _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128,
-    _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128,
-    _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8,
+    _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
+    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 
 use crate::implementation::helpers::Utf8CheckAlgorithm;
@@ -174,17 +174,37 @@ impl SimdU8Value {
                 );
                 _mm_insert_epi8(val, i32::from(ptr.add(6).cast::<i8>().read_unaligned()), 6)
             }
-            8 => _mm_loadu_si64(ptr),
+            8 => _mm_setr_epi32(
+                ptr.cast::<i32>().read_unaligned(),
+                ptr.add(4).cast::<i32>().read_unaligned(),
+                0,
+                0,
+            ),
             9 => {
-                let val = _mm_loadu_si64(ptr);
+                let val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    0,
+                    0,
+                );
                 _mm_insert_epi8(val, i32::from(ptr.add(8).cast::<i8>().read_unaligned()), 8)
             }
             10 => {
-                let val = _mm_loadu_si64(ptr);
+                let val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    0,
+                    0,
+                );
                 _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4)
             }
             11 => {
-                let mut val = _mm_loadu_si64(ptr);
+                let mut val = _mm_setr_epi32(
+                    ptr.cast::<i32>().read_unaligned(),
+                    ptr.add(4).cast::<i32>().read_unaligned(),
+                    0,
+                    0,
+                );
                 val =
                     _mm_insert_epi16(val, i32::from(ptr.add(8).cast::<i16>().read_unaligned()), 4);
                 _mm_insert_epi8(

From 8f5b7584cdfc91c29a6f1f55962cac6871e9f301 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 07:04:50 +0200
Subject: [PATCH 41/69] clippy

---
 src/implementation/x86/sse42.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index c76c67a3..f8fa4be8 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -4,17 +4,17 @@
 
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
-    _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8,
-    _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
-    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8,
+    _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16,
+    _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16,
+    _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16,
-    _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8,
-    _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
-    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
+    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8,
+    _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16,
+    _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16,
+    _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
 };
 
 use crate::implementation::helpers::Utf8CheckAlgorithm;

From d1753c2fc2d4909f7f6ac1da128aa88af455f513 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 07:11:20 +0200
Subject: [PATCH 42/69] remove extra println!()

---
 src/implementation/x86/sse42.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index f8fa4be8..cdc4b1b4 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -436,7 +436,6 @@ mod test {
             for len in 0..16 {
                 let loaded_arr: [u8; 16] =
                     core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len));
-                println!("{:?}", loaded_arr);
                 for i in 0..len {
                     assert_eq!(arr[i], loaded_arr[i]);
                 }

From 1b2d9aa73800ea79c988f700c2ba5fb6d57837ac Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 07:39:29 +0200
Subject: [PATCH 43/69] remove stray comment

---
 src/implementation/x86/sse42.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index cdc4b1b4..7f39ef49 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -23,8 +23,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 
 type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m128i>;
 
-// _mm_loadu_si64
-
 impl SimdU8Value {
     #[target_feature(enable = "sse4.2")]
     #[inline]

From 150103b5250de0a94cd3cbc176539e3737be7b76 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 08:52:38 +0200
Subject: [PATCH 44/69] loop unrolling prevention no longer needed

---
 src/implementation/aarch64/neon.rs | 1 -
 src/implementation/algorithm.rs    | 9 ---------
 src/implementation/x86/avx2.rs     | 1 -
 src/implementation/x86/sse42.rs    | 1 -
 4 files changed, 12 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 325386a0..17966445 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -404,7 +404,6 @@ unsafe fn simd_prefetch(ptr: *const u8) {
 }
 
 const PREFETCH: bool = false;
-const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("not_used");
diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index da34e783..44125128 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -214,11 +214,6 @@ macro_rules! algorithm_simd {
                 let orig_len = len;
                 let mut len = len;
 
-                // necessary, otherwise the compiler excessively unrolls the loop,
-                // the function becomes to big and is no longer inlined for SSE 4.2
-                if PREVENT_REMAINDER_LOOP_UNROLLING {
-                    assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
-                }
                 while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);
@@ -254,10 +249,6 @@ macro_rules! algorithm_simd {
             unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
 
-                // prevent excessive loop unrolling which can cause the function to be too big for inlining
-                if PREVENT_REMAINDER_LOOP_UNROLLING {
-                    assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE);
-                }
                 while len >= SIMD_SIZE {
                     let simd_val = SimdU8Value::load_from(input);
                     input = input.add(SIMD_SIZE);
diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs
index b42e230c..f20ea798 100644
--- a/src/implementation/x86/avx2.rs
+++ b/src/implementation/x86/avx2.rs
@@ -350,7 +350,6 @@ mod test {
 }
 
 const PREFETCH: bool = true;
-const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
 simd_input_256_bit!("avx2");
diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs
index 7f39ef49..af6de1db 100644
--- a/src/implementation/x86/sse42.rs
+++ b/src/implementation/x86/sse42.rs
@@ -446,7 +446,6 @@ mod test {
 }
 
 const PREFETCH: bool = false;
-const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true;
 #[allow(unused_imports)]
 use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
 simd_input_128_bit!("sse4.2");

From 44cd43c42f305525127e681718e1e3e6e571c626 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 12:45:01 +0200
Subject: [PATCH 45/69] benchmark for small inputs of a random length in ranges

---
 bench/benches/small_compat.rs |  3 ++
 bench/benches/small_std.rs    |  3 ++
 bench/src/lib.rs              | 92 +++++++++++++++++++++++++++++++++++
 bench/src/macros.rs           | 28 ++++++++++-
 4 files changed, 124 insertions(+), 2 deletions(-)
 create mode 100644 bench/benches/small_compat.rs
 create mode 100644 bench/benches/small_std.rs

diff --git a/bench/benches/small_compat.rs b/bench/benches/small_compat.rs
new file mode 100644
index 00000000..0ae2af45
--- /dev/null
+++ b/bench/benches/small_compat.rs
@@ -0,0 +1,3 @@
+use simdutf8_bench::define_small_benchmark;
+
+define_small_benchmark!(BenchFn::Compat);
diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs
new file mode 100644
index 00000000..5a6bacc7
--- /dev/null
+++ b/bench/benches/small_std.rs
@@ -0,0 +1,3 @@
+use simdutf8_bench::define_throughput_benchmark;
+
+define_small_benchmark!(BenchFn::Std);
diff --git a/bench/src/lib.rs b/bench/src/lib.rs
index 9aade97b..2aad990c 100644
--- a/bench/src/lib.rs
+++ b/bench/src/lib.rs
@@ -63,6 +63,37 @@ pub fn criterion_benchmark<M: Measurement>(c: &mut Criterion<M>, bench_fn: Bench
     bench_late_error(c, bench_fn);
 }
 
+pub fn criterion_benchmark_small<M: Measurement>(c: &mut Criterion<M>, bench_fn: BenchFn) {
+    let core_ids = core_affinity::get_core_ids().unwrap();
+    core_affinity::set_for_current(*core_ids.get(2).unwrap_or(&core_ids[0]));
+
+    bench_small(
+        c,
+        "1-latin",
+        &scale_to_one_mib(include_bytes!("../data/Latin-Lipsum.txt")),
+        bench_fn,
+    );
+
+    bench_small(
+        c,
+        "2-cyrillic",
+        &scale_to_one_mib(include_bytes!("../data/Russian-Lipsum.txt")),
+        bench_fn,
+    );
+    bench_small(
+        c,
+        "3-chinese",
+        &scale_to_one_mib(include_bytes!("../data/Chinese-Lipsum.txt")),
+        bench_fn,
+    );
+    bench_small(
+        c,
+        "4-emoji",
+        &scale_to_one_mib(include_bytes!("../data/Emoji-Lipsum.txt")),
+        bench_fn,
+    );
+}
+
 fn bench_empty<M: Measurement>(c: &mut Criterion<M>, bench_fn: BenchFn) {
     let mut group = c.benchmark_group("0-empty");
     bench_input(&mut group, b"", false, true, bench_fn);
@@ -147,6 +178,67 @@ fn bench<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], bench_f
     group.finish();
 }
 
+fn bench_small<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], bench_fn: BenchFn) {
+    let mut group = c.benchmark_group(name);
+    bench_range(&mut group, bytes, 0, 16, bench_fn);
+    bench_range(&mut group, bytes, 16, 32, bench_fn);
+    bench_range(&mut group, bytes, 32, 64, bench_fn);
+    bench_range(&mut group, bytes, 65, 127, bench_fn);
+    group.finish();
+}
+
+fn gen_valid_in_range(bytes: &[u8], lower_limit: usize, upper_limit: usize) -> usize {
+    use rand::Rng;
+    let mut rng = rand::thread_rng();
+    loop {
+        let x = rng.gen_range(lower_limit..upper_limit);
+        if std_from_utf8(&bytes[0..x]).is_ok() {
+            return x;
+        }
+    }
+}
+
+fn bench_range<T: Measurement>(
+    group: &mut BenchmarkGroup<T>,
+    bytes: &[u8],
+    lower_limit: usize,
+    upper_limit: usize,
+    bench_fn: BenchFn,
+) {
+    match bench_fn {
+        BenchFn::Basic => {
+            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+                b.iter_batched(
+                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        BenchFn::Compat => {
+            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+                b.iter_batched(
+                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        BenchFn::Std => {
+            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+                b.iter_batched(
+                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()),
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+        }
+        _ => {
+            unimplemented!();
+        }
+    }
+}
+
 #[inline(never)]
 fn basic_from_utf8_no_inline(v: &[u8]) -> bool {
     basic_from_utf8(v).is_ok()
diff --git a/bench/src/macros.rs b/bench/src/macros.rs
index 94421b41..745432eb 100644
--- a/bench/src/macros.rs
+++ b/bench/src/macros.rs
@@ -8,14 +8,38 @@ macro_rules! define_throughput_benchmark {
 
         use simdutf8_bench::*;
 
-        fn benchmark_compat<M: Measurement>(c: &mut Criterion<M>) {
+        fn benchmark_throughput<M: Measurement>(c: &mut Criterion<M>) {
             criterion_benchmark(c, $bench_fn);
         }
 
         criterion_group!(
             name = benches;
             config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300);
-            targets = benchmark_compat
+            targets = benchmark_throughput
+        );
+
+        criterion_main!(benches);
+    };
+}
+
+#[macro_export]
+macro_rules! define_small_benchmark {
+    ($bench_fn:expr) => {
+        use std::time::Duration;
+
+        use criterion::measurement::Measurement;
+        use criterion::{criterion_group, criterion_main, Criterion};
+
+        use simdutf8_bench::*;
+
+        fn benchmark_small<M: Measurement>(c: &mut Criterion<M>) {
+            criterion_benchmark_small(c, $bench_fn);
+        }
+
+        criterion_group!(
+            name = benches;
+            config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300);
+            targets = benchmark_small
         );
 
         criterion_main!(benches);

From 97794b37ad48b9268c34656cf623a03bc844ba44 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 13:00:49 +0200
Subject: [PATCH 46/69] aarch64 partial load asm impl.

---
 src/implementation/aarch64/neon.rs | 250 ++++++++++++++++++++++++++++-
 src/lib.rs                         |   4 +
 2 files changed, 253 insertions(+), 1 deletion(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 325386a0..0d604168 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -9,6 +9,247 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 
 // aarch64 SIMD primitives
 
+#[inline(never)]
+/// C ABI spec is necessary so that the loaded value is returned in a register
+unsafe extern "C" fn load_partial_assembly_opt_call(
+    mut ptr: *const u8,
+    len: usize,
+) -> core::arch::aarch64::uint8x16_t {
+    let res: core::arch::aarch64::uint8x16_t;
+    unsafe {
+        asm!(
+        "movi.2d v0, #0000000000000000",
+        "cmp {len}, 15",
+        "b.hi 99f",
+        "adr {scratch}, #12",
+        "adds {scratch}, {scratch}, {len}, lsl #4",
+        "br {scratch}",
+
+        // 0
+        "ret",
+        "nop",
+        "nop",
+        "nop",
+
+        // 1
+        "ld1.b  {{ v0 }}[0], [{ptr}]",
+        "ret",
+        "nop",
+        "nop",
+
+        // 2
+        "ld1.h  {{ v0 }}[0], [{ptr}]",
+        "ret",
+        "nop",
+        "nop",
+
+        // 3
+        "ld1.h  {{ v0 }}[0], [{ptr}], #2",
+        "ld1.b  {{ v0 }}[2], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 4
+        "ld1.s  {{ v0 }}[0], [{ptr}]",
+        "ret",
+        "nop",
+        "nop",
+
+        // 5
+        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+        "ld1.b  {{ v0 }}[4], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 6
+        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+        "ld1.h  {{ v0 }}[2], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 7
+        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+        "ld1.h  {{ v0 }}[2], [{ptr}], #2",
+        "ld1.b  {{ v0 }}[6], [{ptr}]",
+        "ret",
+
+        // 8
+        "ld1.d  {{ v0 }}[0], [{ptr}]",
+        "ret",
+        "nop",
+        "nop",
+
+        // 9
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.b  {{ v0 }}[8], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 10
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.h  {{ v0 }}[4], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 11
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.h  {{ v0 }}[4], [{ptr}], #2",
+        "ld1.b  {{ v0 }}[10], [{ptr}]",
+        "ret",
+
+        // 12
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.s  {{ v0 }}[2], [{ptr}]",
+        "ret",
+        "nop",
+
+        // 13
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+        "ld1.b  {{ v0 }}[12], [{ptr}]",
+        "ret",
+
+        // 14
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+        "ld1.h  {{ v0 }}[6], [{ptr}]",
+        "ret",
+
+        // 15
+        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+        "ld1.h  {{ v0 }}[6], [{ptr}], #2",
+        "ld1.b  {{ v0 }}[14], [{ptr}]",
+
+        "99:",
+            ptr = inout(reg) ptr,
+            len = in(reg) len,
+            scratch = out(reg) _,
+            lateout("v0") res,
+            options(pure, readonly, nostack)
+        );
+    };
+    res
+}
+
+#[inline(always)]
+fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t {
+    assert!(len < 16);
+    let res: core::arch::aarch64::uint8x16_t;
+    unsafe {
+        asm!(
+        "movi.2d {res:v}, #0000000000000000",
+        "adr {scratch}, #12",
+        "adds {scratch}, {scratch}, {len}, lsl #4",
+        "br {scratch}",
+
+        // 0
+        "b 99f",
+        "nop",
+        "nop",
+        "nop",
+
+        // 1
+        "ld1.b  {{ {res:v} }}[0], [{ptr}]",
+        "b 99f",
+        "nop",
+        "nop",
+
+        // 2
+        "ld1.h  {{ {res:v} }}[0], [{ptr}]",
+        "b 99f",
+        "nop",
+        "nop",
+
+        // 3
+        "ld1.h  {{ {res:v} }}[0], [{ptr}], #2",
+        "ld1.b  {{ {res:v} }}[2], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 4
+        "ld1.s  {{ {res:v} }}[0], [{ptr}]",
+        "b 99f",
+        "nop",
+        "nop",
+
+        // 5
+        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+        "ld1.b  {{ {res:v} }}[4], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 6
+        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+        "ld1.h  {{ {res:v} }}[2], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 7
+        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+        "ld1.h  {{ {res:v} }}[2], [{ptr}], #2",
+        "ld1.b  {{ {res:v} }}[6], [{ptr}]",
+        "b 99f",
+
+        // 8
+        "ld1.d  {{ {res:v} }}[0], [{ptr}]",
+        "b 99f",
+        "nop",
+        "nop",
+
+        // 9
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.b  {{ {res:v} }}[8], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 10
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.h  {{ {res:v} }}[4], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 11
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.h  {{ {res:v} }}[4], [{ptr}], #2",
+        "ld1.b  {{ {res:v} }}[10], [{ptr}]",
+        "b 99f",
+
+        // 12
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.s  {{ {res:v} }}[2], [{ptr}]",
+        "b 99f",
+        "nop",
+
+        // 13
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+        "ld1.b  {{ {res:v} }}[12], [{ptr}]",
+        "b 99f",
+
+        // 14
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+        "ld1.h  {{ {res:v} }}[6], [{ptr}]",
+        "b 99f",
+
+        // 15
+        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+        "ld1.h  {{ {res:v} }}[6], [{ptr}], #2",
+        "ld1.b  {{ {res:v} }}[14], [{ptr}]",
+
+        "99:",
+            ptr = inout(reg) ptr,
+            len = in(reg) len,
+            scratch = out(reg) _,
+            res = lateout(vreg) res,
+            options(pure, readonly, nostack)
+        );
+    };
+    res
+}
+
 type SimdU8Value = crate::implementation::helpers::SimdU8Value<uint8x16_t>;
 
 impl SimdU8Value {
@@ -102,6 +343,13 @@ impl SimdU8Value {
 
     #[inline]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
+        SimdU8Value::from(load_partial_assembly(ptr, len))
+        // SimdU8Value::from(load_partial_assembly_opt_call(ptr, len))
+        // SimdU8Value::from(Self::load_partial_imp(ptr, len))
+    }
+
+    #[inline(always)]
+    unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t {
         let mut res = Self::splat0();
         match len {
             0 => {}
@@ -268,7 +516,7 @@ impl SimdU8Value {
                 debug_assert!(false);
             }
         }
-        res
+        res.0
     }
 
     #[inline]
diff --git a/src/lib.rs b/src/lib.rs
index 51057298..076011eb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,10 @@
     all(feature = "aarch64_neon", target_arch = "aarch64"),
     feature(stdsimd)
 )]
+#![cfg_attr(
+    all(feature = "aarch64_neon", target_arch = "aarch64"),
+    feature(asm)
+)]
 
 //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from
 //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved.

From 138ea37740261ec0468a194e1cc7f5147ed6e7fc Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 13:01:29 +0200
Subject: [PATCH 47/69] Update Cargo.toml

update benchmark Cargo.toml
---
 bench/Cargo.toml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/bench/Cargo.toml b/bench/Cargo.toml
index 24c7b5c7..6a502aab 100644
--- a/bench/Cargo.toml
+++ b/bench/Cargo.toml
@@ -17,6 +17,7 @@ core_affinity = "0.5"
 criterion = "0.3"
 simdutf8 = { version = "*", path = "..", features = ["aarch64_neon"] }
 simdjson-utf8 = { version = "*", path = "simdjson-utf8", optional = true }
+rand = "0.8"
 
 [[bench]]
 name = "throughput_basic"
@@ -37,4 +38,12 @@ harness = false
 [[bench]]
 name = "throughput_simdjson"
 harness = false
-required-features = ["simdjson"]
\ No newline at end of file
+required-features = ["simdjson"]
+
+[[bench]]
+name = "small_compat"
+harness = false
+
+[[bench]]
+name = "small_std"
+harness = false
\ No newline at end of file

From b6304c2698b71bd5ee61777e384fb0d42ead92e5 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 13:12:05 +0200
Subject: [PATCH 48/69] fix/allow lint warnings

---
 src/implementation/aarch64/neon.rs | 227 +++++++++++++++--------------
 1 file changed, 114 insertions(+), 113 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index a4ab319f..3158b572 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -10,129 +10,130 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 // aarch64 SIMD primitives
 
 #[inline(never)]
+#[allow(unused_assignments)]
+#[allow(improper_ctypes_definitions)]
 /// C ABI spec is necessary so that the loaded value is returned in a register
 unsafe extern "C" fn load_partial_assembly_opt_call(
     mut ptr: *const u8,
     len: usize,
 ) -> core::arch::aarch64::uint8x16_t {
     let res: core::arch::aarch64::uint8x16_t;
-    unsafe {
-        asm!(
-        "movi.2d v0, #0000000000000000",
-        "cmp {len}, 15",
-        "b.hi 99f",
-        "adr {scratch}, #12",
-        "adds {scratch}, {scratch}, {len}, lsl #4",
-        "br {scratch}",
-
-        // 0
-        "ret",
-        "nop",
-        "nop",
-        "nop",
-
-        // 1
-        "ld1.b  {{ v0 }}[0], [{ptr}]",
-        "ret",
-        "nop",
-        "nop",
-
-        // 2
-        "ld1.h  {{ v0 }}[0], [{ptr}]",
-        "ret",
-        "nop",
-        "nop",
-
-        // 3
-        "ld1.h  {{ v0 }}[0], [{ptr}], #2",
-        "ld1.b  {{ v0 }}[2], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 4
-        "ld1.s  {{ v0 }}[0], [{ptr}]",
-        "ret",
-        "nop",
-        "nop",
-
-        // 5
-        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-        "ld1.b  {{ v0 }}[4], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 6
-        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-        "ld1.h  {{ v0 }}[2], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 7
-        "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-        "ld1.h  {{ v0 }}[2], [{ptr}], #2",
-        "ld1.b  {{ v0 }}[6], [{ptr}]",
-        "ret",
-
-        // 8
-        "ld1.d  {{ v0 }}[0], [{ptr}]",
-        "ret",
-        "nop",
-        "nop",
-
-        // 9
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.b  {{ v0 }}[8], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 10
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.h  {{ v0 }}[4], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 11
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.h  {{ v0 }}[4], [{ptr}], #2",
-        "ld1.b  {{ v0 }}[10], [{ptr}]",
-        "ret",
-
-        // 12
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.s  {{ v0 }}[2], [{ptr}]",
-        "ret",
-        "nop",
-
-        // 13
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-        "ld1.b  {{ v0 }}[12], [{ptr}]",
-        "ret",
-
-        // 14
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-        "ld1.h  {{ v0 }}[6], [{ptr}]",
-        "ret",
-
-        // 15
-        "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-        "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-        "ld1.h  {{ v0 }}[6], [{ptr}], #2",
-        "ld1.b  {{ v0 }}[14], [{ptr}]",
-
-        "99:",
-            ptr = inout(reg) ptr,
-            len = in(reg) len,
-            scratch = out(reg) _,
-            lateout("v0") res,
-            options(pure, readonly, nostack)
-        );
-    };
+    asm!(
+    "movi.2d v0, #0000000000000000",
+    "cmp {len}, 15",
+    "b.hi 99f",
+    "adr {scratch}, #12",
+    "adds {scratch}, {scratch}, {len}, lsl #4",
+    "br {scratch}",
+
+    // 0
+    "ret",
+    "nop",
+    "nop",
+    "nop",
+
+    // 1
+    "ld1.b  {{ v0 }}[0], [{ptr}]",
+    "ret",
+    "nop",
+    "nop",
+
+    // 2
+    "ld1.h  {{ v0 }}[0], [{ptr}]",
+    "ret",
+    "nop",
+    "nop",
+
+    // 3
+    "ld1.h  {{ v0 }}[0], [{ptr}], #2",
+    "ld1.b  {{ v0 }}[2], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 4
+    "ld1.s  {{ v0 }}[0], [{ptr}]",
+    "ret",
+    "nop",
+    "nop",
+
+    // 5
+    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+    "ld1.b  {{ v0 }}[4], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 6
+    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+    "ld1.h  {{ v0 }}[2], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 7
+    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
+    "ld1.h  {{ v0 }}[2], [{ptr}], #2",
+    "ld1.b  {{ v0 }}[6], [{ptr}]",
+    "ret",
+
+    // 8
+    "ld1.d  {{ v0 }}[0], [{ptr}]",
+    "ret",
+    "nop",
+    "nop",
+
+    // 9
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.b  {{ v0 }}[8], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 10
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.h  {{ v0 }}[4], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 11
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.h  {{ v0 }}[4], [{ptr}], #2",
+    "ld1.b  {{ v0 }}[10], [{ptr}]",
+    "ret",
+
+    // 12
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.s  {{ v0 }}[2], [{ptr}]",
+    "ret",
+    "nop",
+
+    // 13
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+    "ld1.b  {{ v0 }}[12], [{ptr}]",
+    "ret",
+
+    // 14
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+    "ld1.h  {{ v0 }}[6], [{ptr}]",
+    "ret",
+
+    // 15
+    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
+    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
+    "ld1.h  {{ v0 }}[6], [{ptr}], #2",
+    "ld1.b  {{ v0 }}[14], [{ptr}]",
+
+    "99:",
+        ptr = inout(reg) ptr,
+        len = in(reg) len,
+        scratch = out(reg) _,
+        lateout("v0") res,
+        options(pure, readonly, nostack)
+    );
     res
 }
 
 #[inline(always)]
+#[allow(unused_assignments)]
 fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t {
     assert!(len < 16);
     let res: core::arch::aarch64::uint8x16_t;

From 7fea54649bcf0913b4f42434ffd7c3179b32b773 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 13:12:28 +0200
Subject: [PATCH 49/69] example is only for x86-64

---
 examples/streaming.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/streaming.rs b/examples/streaming.rs
index 5db0764b..89725392 100644
--- a/examples/streaming.rs
+++ b/examples/streaming.rs
@@ -1,11 +1,9 @@
-#[cfg(feature = "public_imp")]
-use simdutf8::basic::imp::Utf8Validator;
-
 #[allow(unused_imports)]
 use std::io::{stdin, Read, Result};
 
-#[cfg(feature = "public_imp")]
+#[cfg(all(feature = "public_imp", target_arch = "x86_64"))]
 fn main() -> Result<()> {
+    use simdutf8::basic::imp::Utf8Validator;
     unsafe {
         if !std::is_x86_feature_detected!("avx2") {
             panic!("This example only works with CPUs supporting AVX 2");
@@ -32,5 +30,5 @@ fn main() -> Result<()> {
 }
 
 /// Dummy main. This example requires the crate feature `public_imp`.
-#[cfg(not(feature = "public_imp"))]
+#[cfg(not(all(feature = "public_imp", target_arch = "x86_64")))]
 fn main() {}

From b995b7f83c745792a1d1691fdd060e54e9cea51e Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 13:28:14 +0200
Subject: [PATCH 50/69] fix small std benchmark

---
 bench/benches/small_std.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs
index 5a6bacc7..65c4ba8d 100644
--- a/bench/benches/small_std.rs
+++ b/bench/benches/small_std.rs
@@ -1,3 +1,3 @@
-use simdutf8_bench::define_throughput_benchmark;
+use simdutf8_bench::define_small_benchmark;
 
 define_small_benchmark!(BenchFn::Std);

From 114163647e623ef704aa65fb87efc845a5df08d8 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 18 Jun 2021 20:01:48 +0200
Subject: [PATCH 51/69] Update lib.rs

fmt
---
 src/lib.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 076011eb..4c936424 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,10 +17,7 @@
     all(feature = "aarch64_neon", target_arch = "aarch64"),
     feature(stdsimd)
 )]
-#![cfg_attr(
-    all(feature = "aarch64_neon", target_arch = "aarch64"),
-    feature(asm)
-)]
+#![cfg_attr(all(feature = "aarch64_neon", target_arch = "aarch64"), feature(asm))]
 
 //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from
 //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved.

From e1fd53cde48acd0e374ce90b2ad2ed3bb26b31fa Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 19 Jun 2021 14:08:27 +0200
Subject: [PATCH 52/69] use classic lengths for throughput benchmark again now
 that we have an extra benchmark for small inputs.

---
 bench/src/lib.rs | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/bench/src/lib.rs b/bench/src/lib.rs
index 2aad990c..0e6c883a 100644
--- a/bench/src/lib.rs
+++ b/bench/src/lib.rs
@@ -145,24 +145,7 @@ fn get_valid_slice_of_len_or_more_aligned(
 
 fn bench<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], bench_fn: BenchFn) {
     let mut group = c.benchmark_group(name);
-    let mut sizes = HashSet::new();
-    for i in 1..129 {
-        let alignment = Alignment {
-            boundary: 64,
-            offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case
-        };
-        let (vec, offset) = get_valid_slice_of_len_or_more_aligned(bytes, i, alignment);
-        let slice = &vec[offset..];
-        assert_eq!(
-            (slice.as_ptr() as usize) % alignment.boundary,
-            alignment.offset
-        );
-        if !sizes.contains(&slice.len()) {
-            bench_input(&mut group, slice, true, true, bench_fn);
-            sizes.insert(slice.len());
-        }
-    }
-    for i in [512, 4096, 65536, 131072].iter() {
+    for i in [1, 8, 64, 512, 4096, 65536, 131072].iter() {
         let alignment = Alignment {
             boundary: 64,
             offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case

From b4ce94f0f89e6598ee3b8ab993e06e1d9c0b7bcb Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sat, 19 Jun 2021 14:30:05 +0200
Subject: [PATCH 53/69] add small_basic benchmark

---
 bench/Cargo.toml             | 4 ++++
 bench/benches/small_basic.rs | 3 +++
 2 files changed, 7 insertions(+)
 create mode 100644 bench/benches/small_basic.rs

diff --git a/bench/Cargo.toml b/bench/Cargo.toml
index 6a502aab..2c1d14e5 100644
--- a/bench/Cargo.toml
+++ b/bench/Cargo.toml
@@ -40,6 +40,10 @@ name = "throughput_simdjson"
 harness = false
 required-features = ["simdjson"]
 
+[[bench]]
+name = "small_basic"
+harness = false
+
 [[bench]]
 name = "small_compat"
 harness = false
diff --git a/bench/benches/small_basic.rs b/bench/benches/small_basic.rs
new file mode 100644
index 00000000..5295b710
--- /dev/null
+++ b/bench/benches/small_basic.rs
@@ -0,0 +1,3 @@
+use simdutf8_bench::define_small_benchmark;
+
+define_small_benchmark!(BenchFn::Basic);

From 8babc9f280c7dbab571be30a74ce2d001ddec300 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Jun 2021 14:33:14 +0200
Subject: [PATCH 54/69] ARM64 CI

---
 .github/workflows/ci.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1f516f75..cf4355e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -39,6 +39,24 @@ jobs:
         env:
           RUSTFLAGS: ${{ matrix.rustflags }}
 
+jobs:
+  test:
+    runs-on: ARM64
+    strategy:
+      matrix:
+        features: ["", "--features std" "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+            toolchain: nightly
+            profile: minimal
+            override: true
+      - name: Run tests
+        run: cargo test --no-default-features ${{ matrix.features }} --all-targets --verbose
+        env:
+          RUSTFLAGS: ${{ matrix.rustflags }}
+
   test-inlining-x86:
     runs-on: ubuntu-latest
     strategy:

From 4021faafced69e49953998515a2499179f9773ad Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Jun 2021 14:42:05 +0200
Subject: [PATCH 55/69] fix ci

---
 .github/workflows/ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cf4355e4..7c5eff65 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -39,8 +39,7 @@ jobs:
         env:
           RUSTFLAGS: ${{ matrix.rustflags }}
 
-jobs:
-  test:
+  test-arm64:
     runs-on: ARM64
     strategy:
       matrix:

From 80b0bc8d74c1de048bf323566ca56d607ed0e049 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Jun 2021 14:46:13 +0200
Subject: [PATCH 56/69] Update ci.yml

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c5eff65..752145c9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ARM64
     strategy:
       matrix:
-        features: ["", "--features std" "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"]
+        features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1

From d29695dc76401b399162a096a33d7b14f58a7273 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Jun 2021 14:48:43 +0200
Subject: [PATCH 57/69] fix ci

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 752145c9..644431d1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ARM64
     strategy:
       matrix:
-        features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"]
+        features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,public_imp", "--features aarch64_neon,std,public_imp"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions-rs/toolchain@v1

From b2c033982180a449c0b57aa18548601170ab0e3b Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Jun 2021 15:54:48 +0200
Subject: [PATCH 58/69] more ARM64 ci

---
 .github/workflows/ci.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 644431d1..19c111cc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -131,12 +131,12 @@ jobs:
         env:
           RUSTDOCFLAGS: --cfg docsrs
 
-  cross-build-arm:
+  cross-build-arm32:
     runs-on: ubuntu-latest
     strategy:
       matrix:
         toolchain: ["1.38.0", stable, beta, nightly ]
-        target: [arm-unknown-linux-gnueabi, aarch64-unknown-linux-gnu]
+        target: [arm-unknown-linux-gnueabi]
         features: ["--features std", ""]
         include:
           - toolchain: nightly
@@ -204,7 +204,10 @@ jobs:
         run: cargo fmt -- --check
 
   clippy_check:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, ARM64]
     steps:
       - uses: actions/checkout@v1
       - name: Update rustup

From b1416f67c0f7a86f6fbdc10a1d284c25f999fc6b Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 25 Jun 2021 07:00:28 +0200
Subject: [PATCH 59/69] fix ci

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index da380fbf..064bd146 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -192,7 +192,7 @@ jobs:
     runs-on: ${{ matrix.runner }}
     strategy:
       matrix:
-        os: [ubuntu-latest, ARM64]
+        runner: [ubuntu-latest, ARM64]
     steps:
       - uses: actions/checkout@v1
       - uses: actions-rs/toolchain@v1

From 5753cad0ec035d80842c3386a16a688da1046d2d Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 25 Jun 2021 07:06:52 +0200
Subject: [PATCH 60/69] clippy

---
 src/implementation/aarch64/neon.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 3158b572..4190cc75 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -10,6 +10,7 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 // aarch64 SIMD primitives
 
 #[inline(never)]
+#[allow(clippy::too_many_lines)]
 #[allow(unused_assignments)]
 #[allow(improper_ctypes_definitions)]
 /// C ABI spec is necessary so that the loaded value is returned in a register
@@ -133,6 +134,8 @@ unsafe extern "C" fn load_partial_assembly_opt_call(
 }
 
 #[inline(always)]
+#[allow(clippy::too_many_lines)]
+#[allow(clippy::inline_always)]
 #[allow(unused_assignments)]
 fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t {
     assert!(len < 16);
@@ -344,12 +347,14 @@ impl SimdU8Value {
 
     #[inline]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
-        SimdU8Value::from(load_partial_assembly(ptr, len))
-        // SimdU8Value::from(load_partial_assembly_opt_call(ptr, len))
-        // SimdU8Value::from(Self::load_partial_imp(ptr, len))
+        Self::from(load_partial_assembly(ptr, len))
+        // Self::from(load_partial_assembly_opt_call(ptr, len))
+        // Self::from(Self::load_partial_imp(ptr, len))
     }
 
     #[inline(always)]
+    #[allow(clippy::inline_always)]
+    #[allow(clippy::too_many_lines)]
     unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t {
         let mut res = Self::splat0();
         match len {

From b86d00fffffad764e6578628391b26d09a76a7d1 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 30 Jun 2021 08:21:30 +0200
Subject: [PATCH 61/69] add one more small benchmark

---
 bench/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bench/src/lib.rs b/bench/src/lib.rs
index 0e6c883a..3c08b325 100644
--- a/bench/src/lib.rs
+++ b/bench/src/lib.rs
@@ -167,6 +167,7 @@ fn bench_small<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], b
     bench_range(&mut group, bytes, 16, 32, bench_fn);
     bench_range(&mut group, bytes, 32, 64, bench_fn);
     bench_range(&mut group, bytes, 65, 127, bench_fn);
+    bench_range(&mut group, bytes, 129, 255, bench_fn);
     group.finish();
 }
 

From 8eb9ebf950ed47e45d2e4b4e57cb300df9846c29 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 30 Jun 2021 08:54:39 +0200
Subject: [PATCH 62/69] more consistent small basic benchmarks

---
 bench/src/lib.rs | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/bench/src/lib.rs b/bench/src/lib.rs
index 3c08b325..61021240 100644
--- a/bench/src/lib.rs
+++ b/bench/src/lib.rs
@@ -2,7 +2,6 @@ use criterion::{measurement::Measurement, BenchmarkGroup, BenchmarkId, Criterion
 use simdutf8::basic::from_utf8 as basic_from_utf8;
 use simdutf8::compat::from_utf8 as compat_from_utf8;
 
-use std::collections::HashSet;
 use std::str::from_utf8 as std_from_utf8;
 
 #[cfg(feature = "simdjson")]
@@ -166,8 +165,8 @@ fn bench_small<M: Measurement>(c: &mut Criterion<M>, name: &str, bytes: &[u8], b
     bench_range(&mut group, bytes, 0, 16, bench_fn);
     bench_range(&mut group, bytes, 16, 32, bench_fn);
     bench_range(&mut group, bytes, 32, 64, bench_fn);
-    bench_range(&mut group, bytes, 65, 127, bench_fn);
-    bench_range(&mut group, bytes, 129, 255, bench_fn);
+    bench_range(&mut group, bytes, 64, 128, bench_fn);
+    bench_range(&mut group, bytes, 128, 256, bench_fn);
     group.finish();
 }
 
@@ -189,29 +188,31 @@ fn bench_range<T: Measurement>(
     upper_limit: usize,
     bench_fn: BenchFn,
 ) {
+    let bench_id = format!("rand_{:03}-{:03}", lower_limit, upper_limit);
+    let gen_fn = || gen_valid_in_range(bytes, lower_limit, upper_limit);
     match bench_fn {
         BenchFn::Basic => {
-            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+            group.bench_function(bench_id, |b| {
                 b.iter_batched(
-                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    gen_fn,
                     |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()),
                     criterion::BatchSize::SmallInput,
                 )
             });
         }
         BenchFn::Compat => {
-            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+            group.bench_function(bench_id, |b| {
                 b.iter_batched(
-                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    gen_fn,
                     |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()),
                     criterion::BatchSize::SmallInput,
                 )
             });
         }
         BenchFn::Std => {
-            group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| {
+            group.bench_function(bench_id, |b| {
                 b.iter_batched(
-                    || gen_valid_in_range(bytes, lower_limit, upper_limit),
+                    gen_fn,
                     |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()),
                     criterion::BatchSize::SmallInput,
                 )

From 47588bc265039c03833a1ede47c417e7acbab928 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 30 Jun 2021 10:51:28 +0200
Subject: [PATCH 63/69] rename fn

---
 src/implementation/aarch64/neon.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 4190cc75..75e714a1 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -349,13 +349,13 @@ impl SimdU8Value {
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
         Self::from(load_partial_assembly(ptr, len))
         // Self::from(load_partial_assembly_opt_call(ptr, len))
-        // Self::from(Self::load_partial_imp(ptr, len))
+        // Self::from(Self::load_partial_intrinsics(ptr, len))
     }
 
     #[inline(always)]
     #[allow(clippy::inline_always)]
     #[allow(clippy::too_many_lines)]
-    unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t {
+    unsafe fn load_partial_intrinsics(ptr: *const u8, len: usize) -> uint8x16_t {
         let mut res = Self::splat0();
         match len {
             0 => {}

From 119b6225bb687c8e98d268eda9b7e0703670e5cb Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 6 Jul 2021 08:08:33 +0200
Subject: [PATCH 64/69] make cargo test --all-features work on non-x86

---
 src/basic.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/basic.rs b/src/basic.rs
index 79aea46d..6b24dfba 100644
--- a/src/basic.rs
+++ b/src/basic.rs
@@ -77,6 +77,11 @@ pub mod imp {
     /// use simdutf8::basic::imp::Utf8Validator;
     /// use std::io::{stdin, Read, Result};
     ///
+    /// # #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+    /// # fn main() {
+    /// # }
+    ///
+    /// # #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     /// fn main() -> Result<()> {
     ///     unsafe {
     ///         if !std::is_x86_feature_detected!("avx2") {

From bb58d71cb47b68e43f892b6b2ae4778dea1e56f8 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 6 Jul 2021 08:09:37 +0200
Subject: [PATCH 65/69] remove aarch64 partial load assembly impl. non-inlined
 fn

---
 src/implementation/aarch64/neon.rs | 124 -----------------------------
 1 file changed, 124 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 75e714a1..cc277e6a 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -9,130 +9,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 
 // aarch64 SIMD primitives
 
-#[inline(never)]
-#[allow(clippy::too_many_lines)]
-#[allow(unused_assignments)]
-#[allow(improper_ctypes_definitions)]
-/// C ABI spec is necessary so that the loaded value is returned in a register
-unsafe extern "C" fn load_partial_assembly_opt_call(
-    mut ptr: *const u8,
-    len: usize,
-) -> core::arch::aarch64::uint8x16_t {
-    let res: core::arch::aarch64::uint8x16_t;
-    asm!(
-    "movi.2d v0, #0000000000000000",
-    "cmp {len}, 15",
-    "b.hi 99f",
-    "adr {scratch}, #12",
-    "adds {scratch}, {scratch}, {len}, lsl #4",
-    "br {scratch}",
-
-    // 0
-    "ret",
-    "nop",
-    "nop",
-    "nop",
-
-    // 1
-    "ld1.b  {{ v0 }}[0], [{ptr}]",
-    "ret",
-    "nop",
-    "nop",
-
-    // 2
-    "ld1.h  {{ v0 }}[0], [{ptr}]",
-    "ret",
-    "nop",
-    "nop",
-
-    // 3
-    "ld1.h  {{ v0 }}[0], [{ptr}], #2",
-    "ld1.b  {{ v0 }}[2], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 4
-    "ld1.s  {{ v0 }}[0], [{ptr}]",
-    "ret",
-    "nop",
-    "nop",
-
-    // 5
-    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-    "ld1.b  {{ v0 }}[4], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 6
-    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-    "ld1.h  {{ v0 }}[2], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 7
-    "ld1.s  {{ v0 }}[0], [{ptr}], #4",
-    "ld1.h  {{ v0 }}[2], [{ptr}], #2",
-    "ld1.b  {{ v0 }}[6], [{ptr}]",
-    "ret",
-
-    // 8
-    "ld1.d  {{ v0 }}[0], [{ptr}]",
-    "ret",
-    "nop",
-    "nop",
-
-    // 9
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.b  {{ v0 }}[8], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 10
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.h  {{ v0 }}[4], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 11
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.h  {{ v0 }}[4], [{ptr}], #2",
-    "ld1.b  {{ v0 }}[10], [{ptr}]",
-    "ret",
-
-    // 12
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.s  {{ v0 }}[2], [{ptr}]",
-    "ret",
-    "nop",
-
-    // 13
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-    "ld1.b  {{ v0 }}[12], [{ptr}]",
-    "ret",
-
-    // 14
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-    "ld1.h  {{ v0 }}[6], [{ptr}]",
-    "ret",
-
-    // 15
-    "ld1.d  {{ v0 }}[0], [{ptr}], #8",
-    "ld1.s  {{ v0 }}[2], [{ptr}], #4",
-    "ld1.h  {{ v0 }}[6], [{ptr}], #2",
-    "ld1.b  {{ v0 }}[14], [{ptr}]",
-
-    "99:",
-        ptr = inout(reg) ptr,
-        len = in(reg) len,
-        scratch = out(reg) _,
-        lateout("v0") res,
-        options(pure, readonly, nostack)
-    );
-    res
-}
-
 #[inline(always)]
 #[allow(clippy::too_many_lines)]
 #[allow(clippy::inline_always)]

From 45befdbcb7aa46ed686e17c7e5167e0aab4d97bb Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Tue, 6 Jul 2021 08:15:28 +0200
Subject: [PATCH 66/69] asm fn -> method

---
 src/implementation/aarch64/neon.rs | 245 ++++++++++++++---------------
 1 file changed, 122 insertions(+), 123 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index cc277e6a..1669d356 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -9,127 +9,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm;
 
 // aarch64 SIMD primitives
 
-#[inline(always)]
-#[allow(clippy::too_many_lines)]
-#[allow(clippy::inline_always)]
-#[allow(unused_assignments)]
-fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t {
-    assert!(len < 16);
-    let res: core::arch::aarch64::uint8x16_t;
-    unsafe {
-        asm!(
-        "movi.2d {res:v}, #0000000000000000",
-        "adr {scratch}, #12",
-        "adds {scratch}, {scratch}, {len}, lsl #4",
-        "br {scratch}",
-
-        // 0
-        "b 99f",
-        "nop",
-        "nop",
-        "nop",
-
-        // 1
-        "ld1.b  {{ {res:v} }}[0], [{ptr}]",
-        "b 99f",
-        "nop",
-        "nop",
-
-        // 2
-        "ld1.h  {{ {res:v} }}[0], [{ptr}]",
-        "b 99f",
-        "nop",
-        "nop",
-
-        // 3
-        "ld1.h  {{ {res:v} }}[0], [{ptr}], #2",
-        "ld1.b  {{ {res:v} }}[2], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 4
-        "ld1.s  {{ {res:v} }}[0], [{ptr}]",
-        "b 99f",
-        "nop",
-        "nop",
-
-        // 5
-        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
-        "ld1.b  {{ {res:v} }}[4], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 6
-        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
-        "ld1.h  {{ {res:v} }}[2], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 7
-        "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
-        "ld1.h  {{ {res:v} }}[2], [{ptr}], #2",
-        "ld1.b  {{ {res:v} }}[6], [{ptr}]",
-        "b 99f",
-
-        // 8
-        "ld1.d  {{ {res:v} }}[0], [{ptr}]",
-        "b 99f",
-        "nop",
-        "nop",
-
-        // 9
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.b  {{ {res:v} }}[8], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 10
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.h  {{ {res:v} }}[4], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 11
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.h  {{ {res:v} }}[4], [{ptr}], #2",
-        "ld1.b  {{ {res:v} }}[10], [{ptr}]",
-        "b 99f",
-
-        // 12
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.s  {{ {res:v} }}[2], [{ptr}]",
-        "b 99f",
-        "nop",
-
-        // 13
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
-        "ld1.b  {{ {res:v} }}[12], [{ptr}]",
-        "b 99f",
-
-        // 14
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
-        "ld1.h  {{ {res:v} }}[6], [{ptr}]",
-        "b 99f",
-
-        // 15
-        "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
-        "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
-        "ld1.h  {{ {res:v} }}[6], [{ptr}], #2",
-        "ld1.b  {{ {res:v} }}[14], [{ptr}]",
-
-        "99:",
-            ptr = inout(reg) ptr,
-            len = in(reg) len,
-            scratch = out(reg) _,
-            res = lateout(vreg) res,
-            options(pure, readonly, nostack)
-        );
-    };
-    res
-}
-
 type SimdU8Value = crate::implementation::helpers::SimdU8Value<uint8x16_t>;
 
 impl SimdU8Value {
@@ -223,8 +102,7 @@ impl SimdU8Value {
 
     #[inline]
     unsafe fn load_partial(ptr: *const u8, len: usize) -> Self {
-        Self::from(load_partial_assembly(ptr, len))
-        // Self::from(load_partial_assembly_opt_call(ptr, len))
+        Self::from(Self::load_partial_assembly(ptr, len))
         // Self::from(Self::load_partial_intrinsics(ptr, len))
     }
 
@@ -401,6 +279,127 @@ impl SimdU8Value {
         res.0
     }
 
+    #[inline(always)]
+    #[allow(clippy::too_many_lines)]
+    #[allow(clippy::inline_always)]
+    #[allow(unused_assignments)]
+    fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t {
+        assert!(len < 16);
+        let res: core::arch::aarch64::uint8x16_t;
+        unsafe {
+            asm!(
+            "movi.2d {res:v}, #0000000000000000",
+            "adr {scratch}, #12",
+            "adds {scratch}, {scratch}, {len}, lsl #4",
+            "br {scratch}",
+
+            // 0
+            "b 99f",
+            "nop",
+            "nop",
+            "nop",
+
+            // 1
+            "ld1.b  {{ {res:v} }}[0], [{ptr}]",
+            "b 99f",
+            "nop",
+            "nop",
+
+            // 2
+            "ld1.h  {{ {res:v} }}[0], [{ptr}]",
+            "b 99f",
+            "nop",
+            "nop",
+
+            // 3
+            "ld1.h  {{ {res:v} }}[0], [{ptr}], #2",
+            "ld1.b  {{ {res:v} }}[2], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 4
+            "ld1.s  {{ {res:v} }}[0], [{ptr}]",
+            "b 99f",
+            "nop",
+            "nop",
+
+            // 5
+            "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+            "ld1.b  {{ {res:v} }}[4], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 6
+            "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+            "ld1.h  {{ {res:v} }}[2], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 7
+            "ld1.s  {{ {res:v} }}[0], [{ptr}], #4",
+            "ld1.h  {{ {res:v} }}[2], [{ptr}], #2",
+            "ld1.b  {{ {res:v} }}[6], [{ptr}]",
+            "b 99f",
+
+            // 8
+            "ld1.d  {{ {res:v} }}[0], [{ptr}]",
+            "b 99f",
+            "nop",
+            "nop",
+
+            // 9
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.b  {{ {res:v} }}[8], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 10
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.h  {{ {res:v} }}[4], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 11
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.h  {{ {res:v} }}[4], [{ptr}], #2",
+            "ld1.b  {{ {res:v} }}[10], [{ptr}]",
+            "b 99f",
+
+            // 12
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.s  {{ {res:v} }}[2], [{ptr}]",
+            "b 99f",
+            "nop",
+
+            // 13
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+            "ld1.b  {{ {res:v} }}[12], [{ptr}]",
+            "b 99f",
+
+            // 14
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+            "ld1.h  {{ {res:v} }}[6], [{ptr}]",
+            "b 99f",
+
+            // 15
+            "ld1.d  {{ {res:v} }}[0], [{ptr}], #8",
+            "ld1.s  {{ {res:v} }}[2], [{ptr}], #4",
+            "ld1.h  {{ {res:v} }}[6], [{ptr}], #2",
+            "ld1.b  {{ {res:v} }}[14], [{ptr}]",
+
+            "99:",
+                ptr = inout(reg) ptr,
+                len = in(reg) len,
+                scratch = out(reg) _,
+                res = lateout(vreg) res,
+                options(pure, readonly, nostack)
+            );
+        };
+        res
+    }
+
     #[inline]
     #[allow(clippy::too_many_arguments)]
     unsafe fn lookup_16(

From 37f9198b20f89e870ef0259be35cf48a786b9100 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 11 Jul 2021 09:54:00 +0200
Subject: [PATCH 67/69] simplify

---
 src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4c936424..91798806 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,9 +15,9 @@
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(
     all(feature = "aarch64_neon", target_arch = "aarch64"),
-    feature(stdsimd)
+    feature(stdsimd),
+    feature(asm)
 )]
-#![cfg_attr(all(feature = "aarch64_neon", target_arch = "aarch64"), feature(asm))]
 
 //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from
 //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved.

From d3399cd9713f95b874fb0cb2a6abc0c3d9e32589 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Sun, 15 Aug 2021 22:39:59 +0200
Subject: [PATCH 68/69] Trigger GitHub actions


From 82c5fee295f9753d7cc90f23c8e1e8736bf2bbd7 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 24 Oct 2024 05:16:14 +0200
Subject: [PATCH 69/69] fixes for current nightly

---
 src/implementation/aarch64/neon.rs | 1 +
 src/implementation/algorithm.rs    | 3 ---
 src/lib.rs                         | 7 +++----
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs
index 1669d356..c1cf3790 100644
--- a/src/implementation/aarch64/neon.rs
+++ b/src/implementation/aarch64/neon.rs
@@ -6,6 +6,7 @@ use core::arch::aarch64::{
 };
 
 use crate::implementation::helpers::Utf8CheckAlgorithm;
+use core::arch::asm;
 
 // aarch64 SIMD primitives
 
diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
index 44125128..52b50b83 100644
--- a/src/implementation/algorithm.rs
+++ b/src/implementation/algorithm.rs
@@ -182,7 +182,6 @@ macro_rules! algorithm_simd {
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             #[allow(unconditional_panic)] // does not panic because len is checked
-            #[allow(const_err)] // the same, but for Rust 1.38.0
             unsafe fn check_block(&mut self, input: SimdInput) {
                 // WORKAROUND
                 // necessary because the for loop is not unrolled on ARM64
@@ -208,7 +207,6 @@ macro_rules! algorithm_simd {
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             #[allow(unconditional_panic)] // does not panic because len is checked
-            #[allow(const_err)] // the same, but for Rust 1.38.0
             unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
                 let orig_len = len;
@@ -245,7 +243,6 @@ macro_rules! algorithm_simd {
             #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))]
             #[inline]
             #[allow(unconditional_panic)] // does not panic because len is checked
-            #[allow(const_err)] // the same, but for Rust 1.38.0
             unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) {
                 const SIMD_SIZE: usize = core::mem::size_of::<SimdU8Value>();
 
diff --git a/src/lib.rs b/src/lib.rs
index 91798806..cee27bc8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,6 @@
-#![deny(warnings)]
+#![warn(warnings)]
 #![warn(unused_extern_crates)]
-#![deny(
+#![warn(
     clippy::all,
     clippy::unwrap_used,
     clippy::unnecessary_unwrap,
@@ -15,8 +15,7 @@
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(
     all(feature = "aarch64_neon", target_arch = "aarch64"),
-    feature(stdsimd),
-    feature(asm)
+    feature(stdarch_aarch64_prefetch)
 )]
 
 //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from