diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b63455ca..064bd146 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,23 @@ jobs: env: RUSTFLAGS: ${{ matrix.rustflags }} + test-arm64: + runs-on: ARM64 + strategy: + matrix: + features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,public_imp"] + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + profile: minimal + override: true + - name: Run tests + run: cargo test --no-default-features ${{ matrix.features }} --all-targets --verbose + env: + RUSTFLAGS: ${{ matrix.rustflags }} + test-inlining-x86: runs-on: ubuntu-latest strategy: @@ -106,12 +123,12 @@ jobs: env: RUSTDOCFLAGS: --cfg docsrs - cross-build-arm: + cross-build-arm32: runs-on: ubuntu-latest strategy: matrix: toolchain: ["1.38.0", stable, beta, nightly ] - target: [arm-unknown-linux-gnueabi, aarch64-unknown-linux-gnu] + target: [arm-unknown-linux-gnueabi] features: ["--features std", ""] include: - toolchain: nightly @@ -172,7 +189,10 @@ jobs: run: cargo fmt -- --check clippy_check: - runs-on: ubuntu-latest + runs-on: ${{ matrix.runner }} + strategy: + matrix: + runner: [ubuntu-latest, ARM64] steps: - uses: actions/checkout@v1 - uses: actions-rs/toolchain@v1 diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 24c7b5c7..2c1d14e5 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,6 +17,7 @@ core_affinity = "0.5" criterion = "0.3" simdutf8 = { version = "*", path = "..", features = ["aarch64_neon"] } simdjson-utf8 = { version = "*", path = "simdjson-utf8", optional = true } +rand = "0.8" [[bench]] name = "throughput_basic" harness = false @@ -37,4 +38,16 @@ harness = false [[bench]] name = "throughput_simdjson" harness = false -required-features = ["simdjson"] \ No newline at end of file +required-features = ["simdjson"] + +[[bench]] +name = "small_basic" +harness = false + 
+[[bench]] +name = "small_compat" +harness = false + +[[bench]] +name = "small_std" +harness = false \ No newline at end of file diff --git a/bench/benches/small_basic.rs b/bench/benches/small_basic.rs new file mode 100644 index 00000000..5295b710 --- /dev/null +++ b/bench/benches/small_basic.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Basic); diff --git a/bench/benches/small_compat.rs b/bench/benches/small_compat.rs new file mode 100644 index 00000000..0ae2af45 --- /dev/null +++ b/bench/benches/small_compat.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Compat); diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs new file mode 100644 index 00000000..65c4ba8d --- /dev/null +++ b/bench/benches/small_std.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Std); diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 218055b3..61021240 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -62,6 +62,37 @@ pub fn criterion_benchmark(c: &mut Criterion, bench_fn: Bench bench_late_error(c, bench_fn); } +pub fn criterion_benchmark_small(c: &mut Criterion, bench_fn: BenchFn) { + let core_ids = core_affinity::get_core_ids().unwrap(); + core_affinity::set_for_current(*core_ids.get(2).unwrap_or(&core_ids[0])); + + bench_small( + c, + "1-latin", + &scale_to_one_mib(include_bytes!("../data/Latin-Lipsum.txt")), + bench_fn, + ); + + bench_small( + c, + "2-cyrillic", + &scale_to_one_mib(include_bytes!("../data/Russian-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "3-chinese", + &scale_to_one_mib(include_bytes!("../data/Chinese-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "4-emoji", + &scale_to_one_mib(include_bytes!("../data/Emoji-Lipsum.txt")), + bench_fn, + ); +} + fn bench_empty(c: &mut Criterion, bench_fn: BenchFn) { let mut group = c.benchmark_group("0-empty"); bench_input(&mut group, b"", 
false, true, bench_fn); @@ -129,6 +160,70 @@ fn bench(c: &mut Criterion, name: &str, bytes: &[u8], bench_f group.finish(); } +fn bench_small(c: &mut Criterion, name: &str, bytes: &[u8], bench_fn: BenchFn) { + let mut group = c.benchmark_group(name); + bench_range(&mut group, bytes, 0, 16, bench_fn); + bench_range(&mut group, bytes, 16, 32, bench_fn); + bench_range(&mut group, bytes, 32, 64, bench_fn); + bench_range(&mut group, bytes, 64, 128, bench_fn); + bench_range(&mut group, bytes, 128, 256, bench_fn); + group.finish(); +} + +fn gen_valid_in_range(bytes: &[u8], lower_limit: usize, upper_limit: usize) -> usize { + use rand::Rng; + let mut rng = rand::thread_rng(); + loop { + let x = rng.gen_range(lower_limit..upper_limit); + if std_from_utf8(&bytes[0..x]).is_ok() { + return x; + } + } +} + +fn bench_range( + group: &mut BenchmarkGroup, + bytes: &[u8], + lower_limit: usize, + upper_limit: usize, + bench_fn: BenchFn, +) { + let bench_id = format!("rand_{:03}-{:03}", lower_limit, upper_limit); + let gen_fn = || gen_valid_in_range(bytes, lower_limit, upper_limit); + match bench_fn { + BenchFn::Basic => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Compat => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Std => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + _ => { + unimplemented!(); + } + } +} + #[inline(never)] fn basic_from_utf8_no_inline(v: &[u8]) -> bool { basic_from_utf8(v).is_ok() diff --git a/bench/src/macros.rs b/bench/src/macros.rs index 94421b41..745432eb 100644 --- a/bench/src/macros.rs +++ b/bench/src/macros.rs @@ -8,14 +8,38 @@ macro_rules! 
define_throughput_benchmark { use simdutf8_bench::*; - fn benchmark_compat(c: &mut Criterion) { + fn benchmark_throughput(c: &mut Criterion) { criterion_benchmark(c, $bench_fn); } criterion_group!( name = benches; config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); - targets = benchmark_compat + targets = benchmark_throughput + ); + + criterion_main!(benches); + }; +} + +#[macro_export] +macro_rules! define_small_benchmark { + ($bench_fn:expr) => { + use std::time::Duration; + + use criterion::measurement::Measurement; + use criterion::{criterion_group, criterion_main, Criterion}; + + use simdutf8_bench::*; + + fn benchmark_small(c: &mut Criterion) { + criterion_benchmark_small(c, $bench_fn); + } + + criterion_group!( + name = benches; + config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); + targets = benchmark_small ); criterion_main!(benches); diff --git a/examples/streaming.rs b/examples/streaming.rs index 5db0764b..89725392 100644 --- a/examples/streaming.rs +++ b/examples/streaming.rs @@ -1,11 +1,9 @@ -#[cfg(feature = "public_imp")] -use simdutf8::basic::imp::Utf8Validator; - #[allow(unused_imports)] use std::io::{stdin, Read, Result}; -#[cfg(feature = "public_imp")] +#[cfg(all(feature = "public_imp", target_arch = "x86_64"))] fn main() -> Result<()> { + use simdutf8::basic::imp::Utf8Validator; unsafe { if !std::is_x86_feature_detected!("avx2") { panic!("This example only works with CPUs supporting AVX 2"); @@ -32,5 +30,5 @@ fn main() -> Result<()> { } /// Dummy main. This example requires the crate feature `public_imp`. 
-#[cfg(not(feature = "public_imp"))] +#[cfg(not(all(feature = "public_imp", target_arch = "x86_64")))] fn main() {} diff --git a/src/basic.rs b/src/basic.rs index 79aea46d..6b24dfba 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -77,6 +77,11 @@ pub mod imp { /// use simdutf8::basic::imp::Utf8Validator; /// use std::io::{stdin, Read, Result}; /// + /// # #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + /// # fn main() { + /// # } + /// + /// # #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] /// fn main() -> Result<()> { /// unsafe { /// if !std::is_x86_feature_detected!("avx2") { diff --git a/src/implementation/aarch64/mod.rs b/src/implementation/aarch64/mod.rs index 6b1bfb2f..a483ecbc 100644 --- a/src/implementation/aarch64/mod.rs +++ b/src/implementation/aarch64/mod.rs @@ -5,10 +5,6 @@ pub(crate) mod neon; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_basic_fallback(input); - } - validate_utf8_basic_neon(input) } @@ -24,10 +20,6 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_compat_fallback(input); - } - validate_utf8_compat_neon(input) } diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 8fb05057..c1cf3790 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -6,6 +6,7 @@ use core::arch::aarch64::{ }; use crate::implementation::helpers::Utf8CheckAlgorithm; +use core::arch::asm; // aarch64 SIMD primitives @@ -100,6 +101,306 @@ impl SimdU8Value { Self::from(dst.assume_init()) } + 
#[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::from(Self::load_partial_assembly(ptr, len)) + // Self::from(Self::load_partial_intrinsics(ptr, len)) + } + + #[inline(always)] + #[allow(clippy::inline_always)] + #[allow(clippy::too_many_lines)] + unsafe fn load_partial_intrinsics(ptr: *const u8, len: usize) -> uint8x16_t { + let mut res = Self::splat0(); + match len { + 0 => {} + 1 => { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); + } + 2 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 3 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); + } + 4 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 5 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); + } + 6 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 7 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); + } + 8 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 9 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + 
ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); + } + 10 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + } + 11 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); + } + 12 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 13 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); + } + 14 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + } + 15 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 
2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); + } + _ => { + // not allowed + debug_assert!(false); + } + } + res.0 + } + + #[inline(always)] + #[allow(clippy::too_many_lines)] + #[allow(clippy::inline_always)] + #[allow(unused_assignments)] + fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { + assert!(len < 16); + let res: core::arch::aarch64::uint8x16_t; + unsafe { + asm!( + "movi.2d {res:v}, #0000000000000000", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "b 99f", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 2 + "ld1.h {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 3 + "ld1.h {{ {res:v} }}[0], [{ptr}], #2", + "ld1.b {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 4 + "ld1.s {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 5 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.b {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 6 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 7 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}], #2", + "ld1.b {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 8 + "ld1.d {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 9 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.b {{ {res:v} }}[8], [{ptr}]", + "b 99f", + "nop", + + // 10 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 11 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}], #2", + "ld1.b {{ {res:v} }}[10], [{ptr}]", + "b 99f", + + // 12 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 13 + 
"ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.b {{ {res:v} }}[12], [{ptr}]", + "b 99f", + + // 14 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 15 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}], #2", + "ld1.b {{ {res:v} }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + res = lateout(vreg) res, + options(pure, readonly, nostack) + ); + }; + res + } + #[inline] #[allow(clippy::too_many_arguments)] unsafe fn lookup_16( @@ -233,6 +534,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("not_used"); algorithm_simd!("not_used"); diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 85d881c4..52b50b83 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -182,22 +182,85 @@ macro_rules! 
algorithm_simd { #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] #[allow(unconditional_panic)] // does not panic because len is checked - #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 - if input.vals.len() == 2 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.incomplete = Self::is_incomplete(input.vals[1]); - } else if input.vals.len() == 4 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.check_bytes(input.vals[2]); - self.check_bytes(input.vals[3]); - self.incomplete = Self::is_incomplete(input.vals[3]); - } else { - panic!("Unsupported number of chunks"); + match input.vals.len() { + 2 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.incomplete = Self::is_incomplete(input.vals[1]); + } + 4 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.check_bytes(input.vals[2]); + self.check_bytes(input.vals[3]); + self.incomplete = Self::is_incomplete(input.vals[3]); + } + _ => { + panic!("Unsupported number of chunks"); + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); + let orig_len = len; + let mut len = len; + + while len >= SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if simd_val.is_ascii() { + if orig_len == len { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = 
SimdU8Value::load_partial(input, len); + if simd_val.is_ascii() { + if orig_len < SIMD_SIZE { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); + + while len >= SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = SimdU8Value::load_partial(input, len); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } } } } @@ -222,13 +285,22 @@ macro_rules! algorithm_simd { let mut idx: usize = 0; let iter_lim = len - (len % SIMD_CHUNK_SIZE); - while idx < iter_lim { - let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); - idx += SIMD_CHUNK_SIZE; - if !simd_input.is_ascii() { - algorithm.check_block(simd_input); - break; + 'outer: loop { + while idx < iter_lim { + let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); + idx += SIMD_CHUNK_SIZE; + if !simd_input.is_ascii() { + algorithm.check_block(simd_input); + break 'outer; + } } + if idx < len { + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); + if !simd_input.is_ascii() { + break; + } + } + return Ok(()); } while idx < iter_lim { @@ -241,14 +313,7 @@ macro_rules! 
algorithm_simd { } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx); } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -302,7 +367,15 @@ macro_rules! algorithm_simd { } idx += SIMD_CHUNK_SIZE; } - break; + if idx < len { + algorithm.check_remainder_ascii(input.as_ptr().add(idx), len - idx); + algorithm.check_incomplete_pending(); + } + return if algorithm.has_error() { + Err(idx) + } else { + Ok(()) + }; } else { while idx < iter_lim { if PREFETCH { @@ -331,15 +404,7 @@ macro_rules! algorithm_simd { } } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx) } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -538,6 +603,101 @@ macro_rules! 
simd_input_128_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { + let partial_len = len % 16; + if len < 16 { + Self { + vals: [ + SimdU8Value::load_partial(ptr, partial_len), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(16), partial_len), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_partial(ptr.add(32), partial_len), + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + SimdU8Value::load_partial(ptr.add(48), partial_len), + ], + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 16 * 16), len % 16); + if len < 16 { + Self { + vals: [ + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + partial, + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + 
SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + partial, + ], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { @@ -570,6 +730,47 @@ macro_rules! simd_input_256_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { + if len < 32 { + Self { + vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(32), len - 32), + ], + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 32 * 32), len % 32); + if len < 32 { + Self { + vals: [partial, SimdU8Value::splat0()], + } + } else { + Self { + vals: [SimdU8Value::load_from(ptr), partial], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index a6bd693a..9aedb81a 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -37,22 +37,83 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er validate_utf8_at_offset(input, offset).unwrap_err() } +#[inline] +unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + 
.write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(4); + *dest = dest.offset(4); +} + +#[inline] +unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + .write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(8); + *dest = dest.offset(8); +} + #[allow(dead_code)] #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const -pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( mut src: *const u8, mut dest: *mut u8, mut len: usize, ) { - // This gets properly auto-vectorized on AVX 2 and SSE 4.2 - #[inline] - unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { - #[allow(clippy::cast_ptr_alignment)] - dest.cast::() - .write_unaligned(src.cast::().read_unaligned()); - *src = src.offset(8); - *dest = dest.offset(8); + if len >= 8 { + memcpy_u32(&mut src, &mut dest); + memcpy_u32(&mut src, &mut dest); + len -= 8; } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} + +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_32( + mut src: *const u8, + mut dest: *mut u8, + mut len: usize, +) { + if len >= 16 { + memcpy_u64(&mut src, &mut dest); + memcpy_u64(&mut src, &mut dest); + len -= 16; + } + if len >= 8 { + memcpy_u64(&mut src, &mut dest); + len -= 8; + } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} + +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + mut src: *const 
u8, + mut dest: *mut u8, + mut len: usize, +) { if len >= 32 { memcpy_u64(&mut src, &mut dest); memcpy_u64(&mut src, &mut dest); @@ -69,6 +130,10 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( memcpy_u64(&mut src, &mut dest); len -= 8; } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } while len > 0 { *dest = *src; src = src.offset(1); @@ -115,3 +180,37 @@ impl TempSimdChunkA32 { pub(crate) struct SimdU8Value(pub(crate) T) where T: Copy; + +#[cfg(test)] +impl core::fmt::Display for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} + +#[cfg(test)] +impl core::fmt::LowerHex for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 8232f571..f20ea798 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -4,17 +4,21 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, 
_mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -103,6 +107,63 @@ impl SimdU8Value { Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>())) } + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn vecmask_from_bitmask(mask: u8) -> Self { + let vshift_count = _mm256_set_epi32(24, 25, 26, 27, 28, 29, 30, 31); + let bcast = _mm256_set1_epi32(i32::from(mask)); + let shifted = _mm256_sllv_epi32(bcast, vshift_count); // high bit of each element = corresponding bit of the mask + Self::from(shifted) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + 
Self::load_partial_direct(ptr, len) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_direct(mut ptr: *const u8, len: usize) -> Self { + if len == 0 { + return Self::splat0(); + } + let sel_mask = 1 << (len / 4); + let mask = (sel_mask - 1) as u8; + let mut res = _mm256_maskload_epi32(ptr.cast(), Self::vecmask_from_bitmask(mask).0); + let remainder = len % 4; + if remainder != 0 { + ptr = ptr.add((len - len % 4) as usize); + let remaining_bytes = match remainder { + 1 => u32::from(*ptr), + 2 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8, + 3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16, + _ => 0, + }; + // blend vec with 4-byte chunks and last incomplete chunk together + #[allow(clippy::cast_possible_wrap)] + let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32); + res = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(res), + _mm256_castsi256_ps(remaining_vec), + _mm256_castsi256_ps(Self::vecmask_from_bitmask(sel_mask).0), + )); + } + Self::from(res) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 32]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "avx2")] #[inline] unsafe fn lookup_16( @@ -255,7 +316,41 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0); } +#[cfg(test)] +mod test { + #[cfg(not(feature = "std"))] + extern crate std; + + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + if !std::is_x86_feature_detected!("avx2") { + return; + } + + let arr = [ + 1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + unsafe { + for len in 0..32 { + let loaded_arr: [u8; 32] = + 
core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = true; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; simd_input_256_bit!("avx2"); algorithm_simd!("avx2"); diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 19954956..09fb53b7 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -4,11 +4,17 @@ pub(crate) mod avx2; #[allow(dead_code)] pub(crate) mod sse42; -#[allow(unused_imports)] -use super::helpers::SIMD_CHUNK_SIZE; - // validate_utf8_basic() std: implementation auto-selection +#[allow(dead_code)] +const ENABLE_AVX2: bool = true; + +#[allow(dead_code)] +const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; + +#[allow(dead_code)] +const SMALL_STRING_LIMIT: usize = 9; + #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] pub(crate) unsafe fn validate_utf8_basic( @@ -27,7 +33,7 @@ pub(crate) unsafe fn validate_utf8_basic( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -38,7 +44,7 @@ pub(crate) unsafe fn validate_utf8_basic( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_basic @@ -53,7 +59,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if 
DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -76,7 +82,7 @@ unsafe fn validate_utf8_basic_avx2( pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -123,7 +129,7 @@ pub(crate) unsafe fn validate_utf8_compat( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -134,7 +140,7 @@ pub(crate) unsafe fn validate_utf8_compat( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_compat @@ -149,7 +155,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -172,7 +178,7 @@ unsafe fn validate_utf8_compat_avx2( pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index f9140d50..af6de1db 100644 --- a/src/implementation/x86/sse42.rs 
+++ b/src/implementation/x86/sse42.rs @@ -4,15 +4,17 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -99,6 +101,180 @@ impl SimdU8Value { Self::from(_mm_loadu_si128(ptr.cast::<__m128i>())) } + #[target_feature(enable = "sse4.2")] + #[inline] + #[allow(clippy::too_many_lines)] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::from(match len { + 1 => _mm_setr_epi8( + ptr.cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 2 => _mm_setr_epi16(ptr.cast::().read_unaligned(), 0, 0, 0, 0, 0, 0, 0), + 3 => _mm_setr_epi8( + 
ptr.cast::().read_unaligned(), + ptr.add(1).cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 4 => _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0), // assembly??? + 5 => { + let val = _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0); + _mm_insert_epi8(val, i32::from(ptr.add(4).cast::().read_unaligned()), 4) + } + 6 => _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ), + 7 => { + let val = _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ); + _mm_insert_epi8(val, i32::from(ptr.add(6).cast::().read_unaligned()), 6) + } + 8 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ), + 9 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + _mm_insert_epi8(val, i32::from(ptr.add(8).cast::().read_unaligned()), 8) + } + 10 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4) + } + 11 => { + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + val = + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4); + _mm_insert_epi8( + val, + i32::from(ptr.add(10).cast::().read_unaligned()), + 10, + ) + } + 12 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ), + 13 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi8( + 
val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 12, + ) + } + 14 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ) + } + 15 => { + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + val = _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ); + _mm_insert_epi8( + val, + i32::from(ptr.add(14).cast::().read_unaligned()), + 14, + ) + } + _ => Self::splat0().0, + }) + } + + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 16]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "sse4.2")] #[inline] unsafe fn lookup_16( @@ -239,7 +415,38 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::(), _MM_HINT_T0); } +#[cfg(test)] +mod test { + #[cfg(not(features = "std"))] + extern crate std; + + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + if !std::is_x86_feature_detected!("sse4.2") { + return; + } + + let arr = [1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + unsafe { + for len in 0..16 { + let loaded_arr: [u8; 16] = + core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("sse4.2"); algorithm_simd!("sse4.2"); diff --git a/src/lib.rs b/src/lib.rs index 51057298..cee27bc8 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -1,6 +1,6 @@ -#![deny(warnings)] +#![warn(warnings)] #![warn(unused_extern_crates)] -#![deny( +#![warn( clippy::all, clippy::unwrap_used, clippy::unnecessary_unwrap, @@ -15,7 +15,7 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr( all(feature = "aarch64_neon", target_arch = "aarch64"), - feature(stdsimd) + feature(stdarch_aarch64_prefetch) )] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from