diff --git a/.drone.yml b/.drone.yml index 9b8e8bb2..dbc33acb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -58,5 +58,7 @@ steps: - name: test image: rust:1 commands: - - cargo build --verbose --all - - cargo test --verbose --all + - rustup default nightly + - rustup update + - cargo clean && cargo +nightly build --verbose --all + - cargo +nightly test --verbose --all diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs new file mode 100644 index 00000000..13e72061 --- /dev/null +++ b/src/avx2/generator.rs @@ -0,0 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::value::generator::ESCAPED; +use std::io; + +#[inline(always)] +pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test if the data is unchanged, i.e. xor it with the masked data. + // Any byte that was inside the quote range will be zero + // now. + let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} \ No newline at end of file diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 02038553..b68deadb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,18 @@ #![deny(warnings)] + +#![cfg_attr(target_feature = "neon", feature( + asm, + stdsimd, + repr_simd, + custom_inner_attributes, + aarch64_target_feature, + platform_intrinsics, + stmt_expr_attributes, + simd_ffi, + link_llvm_intrinsics + ) +)] + +#![cfg_attr(feature = "hints", feature(core_intrinsics))] //! simdjson-rs is a rust port of the simdjson c++ library. It follows //!
most of the design closely with a few exceptions to make it better @@ -89,17 +103,25 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] mod sse42; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] pub use crate::sse42::deser::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; +#[cfg(target_feature = "neon")] +mod neon; +#[cfg(target_feature = "neon")] +pub use crate::neon::deser::*; +#[cfg(target_feature = "neon")] +use crate::neon::stage1::SIMDJSON_PADDING; + mod stage2; pub mod value; use crate::numberparse::Number; +#[cfg(not(target_feature = "neon"))] use std::mem; use std::str; @@ -163,7 +185,11 @@ impl<'de> Deserializer<'de> { let counts = Deserializer::validate(input, &structural_indexes)?; - let strings = Vec::with_capacity(len + SIMDJSON_PADDING); + // Set length to allow slice access in ARM code + let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING); + unsafe { + strings.set_len(len + SIMDJSON_PADDING); + } Ok(Deserializer { counts, diff --git a/src/neon/deser.rs b/src/neon/deser.rs new file mode 100644 index 00000000..5d70b7af --- /dev/null +++ b/src/neon/deser.rs @@ -0,0 +1,199 @@ + +pub use crate::error::{Error, ErrorType}; +pub use crate::Deserializer; +pub use crate::Result; +pub use crate::neon::stage1::*; +pub use crate::neon::utf8check::*; +pub use crate::neon::intrinsics::*; +pub use crate::stringparse::*; + +impl<'de> Deserializer<'de> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + pub fn parse_str_(&mut self) -> Result<&'de str> { + // Add 1 to skip the initial " + let idx = self.iidx + 1; + let mut padding = [0u8; 32]; + //let mut read: usize = 0; + + // we include the terminal '"' so we know where to end + // This is safe since we check sub's length in the range access above and only + // create sub slices from sub to `sub.len()`. + + let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) }; + let mut src_i: usize = 0; + let mut len = src_i; + loop { + // store to dest unconditionally - we can overwrite the bits we don't like + // later + + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?)
+ // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the pointer, accounting for the fact that we have a NULL termination + + len += quote_dist as usize; + unsafe { + let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // Move to the 'bad' character + let bs_dist: u32 = bs_bits.trailing_zeros(); + len += bs_dist as usize; + src_i += bs_dist as usize; + break; + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + len += 32; + } + } + + let mut dst_i: usize = 0; + let dst: &mut [u8] = self.strings.as_mut_slice(); + + loop { + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + unsafe { + dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(src.get_unchecked(src_i..src_i + 32)); + } + + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the pointer, accounting for the fact that we have a NULL termination + + dst_i += quote_dist as usize; + unsafe { + self.input + .get_unchecked_mut(idx + len..idx + len + dst_i) + .clone_from_slice(&self.strings.get_unchecked(..dst_i)); + let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] + as *const str; + self.str_offset += dst_i as usize; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // find out where the backslash is + let bs_dist: u32 = bs_bits.trailing_zeros(); + let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; + // we encountered backslash first. Handle backslash + if escape_char == b'u' { + // move src/dst up to the start; they will be further adjusted + // within the unicode codepoint handling code. + src_i += bs_dist as usize; + dst_i += bs_dist as usize; + let (o, s) = if let Ok(r) = handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..)
} + ) + { + r + } else { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + if o == 0 { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + // We moved o steps forward at the destination and 6 on the source + src_i += s; + dst_i += o; + } else { + // simple 1:1 conversion. Will eat bs_dist+2 characters in input and + // write bs_dist+1 characters to output + // note this may reach beyond the part of the buffer we've actually + // seen. I think this is ok + let escape_result: u8 = + unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; + if escape_result == 0 { + return Err(self.error(ErrorType::InvalidEscape)); + } + unsafe { + *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; + } + src_i += bs_dist as usize + 2; + dst_i += bs_dist as usize + 1; + } + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + dst_i += 32; + } + } + } +} \ No newline at end of file diff --git a/src/neon/generator.rs b/src/neon/generator.rs new file mode 100644 index 00000000..6c8cf358 --- /dev/null +++ b/src/neon/generator.rs @@ -0,0 +1,48 @@ +use crate::value::generator::ESCAPED; +use std::io; +use crate::neon::intrinsics::*; +use crate::neon::stage1::neon_movemask; + +#[inline(always)] +pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + // The case where we have a 16+ byte block + // we repeat the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test if the data is unchanged, i.e. xor it with the masked data. + // Any byte that was inside the quote range will be zero + // now. + let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs new file mode 100644 index 00000000..c5c98cb6 --- /dev/null +++ b/src/neon/intrinsics.rs @@ -0,0 +1,557 @@ +//use std::arch:: + +use crate::neon::simd_llvm; + +use std::mem; +use core; + +#[allow(unused)] +macro_rules!
types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + #[allow(clippy::missing_inline_in_public_items)] + pub struct $name($($fields)*); + )*) +} + +#[allow(non_camel_case_types)] +pub type poly64_t = i64; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.aarch64.neon.addp.v16u8"] + fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.pmull64"] + fn vmull_p64_(a: i64, b: i64) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] + fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] + fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16i8"] + fn vqsubq_s8_(a: int8x16_t, a: int8x16_t) -> int8x16_t; +} + +#[inline] +unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +pub unsafe fn vnegq_u8(a: uint8x16_t) -> uint8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + +#[inline] +pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + + +#[inline] +fn rotate_(a: u128, b: u128, n: u128) -> u128 { + let az = a >> (n * 8); + let bz = b << (128 - (n * 8)); + az | bz +} + +#[inline] +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { + mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { vpaddq_u8_(a, b) } +} + +#[inline] +pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { + uint8x16_t( + a.0 >> n, + a.1 >> n, + a.2 >> n, + a.3 >> n, + a.4 >> n, + a.5 >> n, + a.6 >> n, + a.7 >> n, + a.8 >> n, + a.9 >> n, + a.10 >> n, + a.11 >> n, + a.12 >> n, + a.13 >> n, + a.14 >> n, + a.15 >> n, + ) +} + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. 
+ pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`. + pub struct uint64x2_t(u64, u64); + /// ARM-specific 128-bit wide vector of one packed `i128`. + pub struct poly128_t(i128); // FIXME: check this! +} + +impl uint8x16_t { + #[inline] + pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { + uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int8x16_t { + #[inline] + pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { + int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int32x4_t { + #[inline] + pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { + int32x4_t(a, b, c, d) + } +} + +//#[inline] +//pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { +// let (carry, did_carry) = a.overflowing_add(b); +// *out = carry; +// did_carry +//} + +#[inline] +pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { + *(addr as *const int8x16_t) +} + +#[inline] +pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { + *(addr as *const uint8x16_t) +} + +#[inline] +pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { + std::ptr::write(addr as *mut uint8x16_t, val); +} + +macro_rules! aarch64_simd_2 { + ($name: ident, $type: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + aarch64_simd_2!($name, $type, $type, $simd_fn, $intrarm, $intraarch); + }; + ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + #[inline] + pub fn $name(a: $type, b: $type) -> $res { + unsafe { simd_llvm::$simd_fn(a, b) } + } + } +} + +macro_rules! 
aarch64_simd_ceq { + ($name: ident, $type: ty, $res: ty) => { + /// Compare bitwise Equal (vector) + aarch64_simd_2!($name, $type, $res, simd_eq, cmeq, cmeq); + }; +} + +aarch64_simd_ceq!(vceq_s8, int8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_s8, int8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_s16, int16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_s16, int16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_s32, int32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_s32, int32x4_t, uint32x4_t); +aarch64_simd_ceq!(vceq_u8, uint8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_u8, uint8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_u16, uint16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_u16, uint16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_u32, uint32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_u32, uint32x4_t, uint32x4_t); +aarch64_simd_2!(vceq_f32, float32x2_t, uint32x2_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_2!(vceqq_f32, float32x4_t, uint32x4_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_ceq!(vceq_p8, poly8x8_t, poly8x8_t); +aarch64_simd_ceq!(vceqq_p8, poly8x16_t, poly8x16_t); + +macro_rules! aarch64_simd_cgt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than (vector) + aarch64_simd_2!($name, $type, $res, simd_gt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cgtu { +// ($name: ident, $type: ty) => { +// /// Compare Greater than (vector) +// aarch64_simd_2!($name, $type, simd_gt, cmhi); +// }; +//} + +aarch64_simd_cgt!(vcgt_s8, int8x8_t, uint8x8_t); +aarch64_simd_cgt!(vcgtq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cgt!(vcgt_s16, int16x4_t, uint16x4_t); +aarch64_simd_cgt!(vcgtq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cgt!(vcgt_s32, int32x2_t, uint32x2_t); +aarch64_simd_cgt!(vcgtq_s32, int32x4_t, uint32x4_t); + +//aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); +//aarch64_simd_cgt!(vcgt_s64, int64x1_t); +//aarch64_simd_cgt!(vcgtq_s64, int64x2_t); +//aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); +//aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); + +macro_rules! aarch64_simd_clt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than (vector) + aarch64_simd_2!($name, $type, $res, simd_lt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cltu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_lt, cmhi); +//}; +//} + +aarch64_simd_clt!(vclt_s8, int8x8_t, uint8x8_t); +aarch64_simd_clt!(vcltq_s8, int8x16_t, uint8x16_t); +aarch64_simd_clt!(vclt_s16, int16x4_t, uint16x4_t); +aarch64_simd_clt!(vcltq_s16, int16x8_t, uint16x8_t); +aarch64_simd_clt!(vclt_s32, int32x2_t, uint32x2_t); +aarch64_simd_clt!(vcltq_s32, int32x4_t, uint32x4_t); + +//arm_simd_cltu!(vclt_u8, uint8x8_t); +//arm_simd_cltu!(vcltq_u8, uint8x16_t); +//arm_simd_cltu!(vclt_u16, uint16x4_t); +//arm_simd_cltu!(vcltq_u16, uint16x8_t); +//arm_simd_cltu!(vclt_u32, uint32x2_t); +//arm_simd_cltu!(vcltq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cge { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_ge, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cgeu { +//( $ name: ident, $ type: ty) => { +///// Compare Greater than (vector) +//aarch64_simd_2 ! 
( $ name, $ type, simd_ge, cmhs); +//}; +//} + +aarch64_simd_cge!(vcge_s8, int8x8_t, uint8x8_t); +aarch64_simd_cge!(vcgeq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cge!(vcge_s16, int16x4_t, uint16x4_t); +aarch64_simd_cge!(vcgeq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cge!(vcge_s32, int32x2_t, uint32x2_t); +aarch64_simd_cge!(vcgeq_s32, int32x4_t, uint32x4_t); +//arm_simd_cgeu!(vcge_u8, uint8x8_t); +//arm_simd_cgeu!(vcgeq_u8, uint8x16_t); +//arm_simd_cgeu!(vcge_u16, uint16x4_t); +//arm_simd_cgeu!(vcgeq_u16, uint16x8_t); +//arm_simd_cgeu!(vcge_u32, uint32x2_t); +//arm_simd_cgeu!(vcgeq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cle { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_le, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cleu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_le, cmhs); +//}; +//} + +aarch64_simd_cle!(vcle_s8, int8x8_t, uint8x8_t); +aarch64_simd_cle!(vcleq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cle!(vcle_s16, int16x4_t, uint16x4_t); +aarch64_simd_cle!(vcleq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cle!(vcle_s32, int32x2_t, uint32x2_t); +aarch64_simd_cle!(vcleq_s32, int32x4_t, uint32x4_t); +//arm_simd_cleu!(vcle_u8, uint8x8_t); +aarch64_simd_cle!(vcleq_u8, uint8x16_t, uint8x16_t); +//arm_simd_cleu!(vcle_u16, uint16x4_t); +//arm_simd_cleu!(vcleq_u16, uint16x8_t); +//arm_simd_cleu!(vcle_u32, uint32x2_t); +//arm_simd_cleu!(vcleq_u32, uint32x4_t); + +#[inline] +pub fn vdupq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zeroi8x16() -> int8x16_t { + int8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub fn vdupq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zerou8x16() -> uint8x16_t { + uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + mem::transmute(vaddq_s8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + mem::transmute(vaddq_s32_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vxorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } +#[inline] +pub fn vxorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { 
simd_llvm::simd_xor(a, b) } } + +macro_rules! arm_reinterpret { + ($name: ident, $from: ty, $to: ty) => { + // Vector reinterpret cast operation + #[inline] + pub fn $name(a: $from) -> $to { + unsafe { mem::transmute(a) } + } + }; +} + +arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); +arm_reinterpret!(vreinterpretq_u64_u32, uint32x4_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); +arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); +arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); +arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u64_s8, int8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); + +arm_reinterpret!(vreinterpretq_s16_s8, int8x16_t, int16x8_t); +arm_reinterpret!(vreinterpretq_s32_s8, int8x16_t, int32x4_t); +arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); + +macro_rules! arm_vget_lane { + ($name: ident, $to: ty, $from: ty, $lanes: literal) => { + #[inline] + pub unsafe fn $name(v: $from, lane: u32) -> $ to { + simd_llvm::simd_extract(v, lane) + } + }; +} + +arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); +arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); +arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); +arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); + +arm_vget_lane!(vgetq_lane_s16, i16, int16x8_t, 7); +arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); +arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); +arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); + +#[inline] +pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { + vqmovn_u64_(a) +} + +#[inline] +pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vqsubq_u8_(a, b) +} + +#[inline] +pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vqsubq_s8_(a, b) +} + +#[inline] +fn test_u8(a: u8, b: u8) -> u8 { + if a & b != 0 { + 0xFF + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + uint8x16_t( + test_u8(a.0, b.0), + test_u8(a.1, b.1), + test_u8(a.2, b.2), + test_u8(a.3, b.3), + test_u8(a.4, b.4), + test_u8(a.5, b.5), + test_u8(a.6, b.6), + test_u8(a.7, b.7), + test_u8(a.8, b.8), + test_u8(a.9, b.9), + test_u8(a.10, b.10), + test_u8(a.11, b.11), + test_u8(a.12, b.12), + test_u8(a.13, b.13), + test_u8(a.14, b.14), + test_u8(a.15, b.15), + ) +} + +#[inline] +fn test_s8(a: i8, b: i8) -> i8 { + if a & b != 0 { + -1 + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + int8x16_t( + test_s8(a.0, b.0), + test_s8(a.1, b.1), + test_s8(a.2, b.2), + test_s8(a.3, b.3), + test_s8(a.4, b.4), + test_s8(a.5, b.5), + test_s8(a.6, b.6), + test_s8(a.7, b.7), + test_s8(a.8, b.8), + test_s8(a.9, b.9), + test_s8(a.10, b.10), + test_s8(a.11, b.11), + test_s8(a.12, b.12), + test_s8(a.13, b.13), + test_s8(a.14, b.14), + test_s8(a.15, b.15), + ) +} + +#[inline] +pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { + std::ptr::write(addr as *mut uint32x4_t, val) +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 00000000..f7868249 --- /dev/null +++ 
b/src/neon/mod.rs @@ -0,0 +1,7 @@ +pub mod deser; +pub mod stage1; +pub mod utf8check; +pub mod generator; +mod simd; +mod simd_llvm; +mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs new file mode 100644 index 00000000..8a5a21fc --- /dev/null +++ b/src/neon/simd.rs @@ -0,0 +1,470 @@ +#![allow(non_camel_case_types)] +#![allow(unused)] + +use crate::neon::simd_llvm; + +macro_rules! simd_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: $ety) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> $ety { + unsafe { + simd_llvm::simd_extract(self, index as u32) + } + } + } + } +} + +macro_rules! simd_m_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + const fn bool_to_internal(x: bool) -> $ety { + [0 as $ety, !(0 as $ety)][x as usize] + } + + #[inline] + pub(crate) const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> bool { + let r: $ety = unsafe { + simd_llvm::simd_extract(self, index as u32) + }; + r != 0 + } + } + } +} + +// 16-bit wide types: + +simd_ty!(u8x2[u8]: u8, u8 | x0, x1); +simd_ty!(i8x2[i8]: i8, i8 | x0, x1); + +// 32-bit wide types: + +simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3); +simd_ty!(u16x2[u16]: u16, u16 | x0, x1); + +simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3); +simd_ty!(i16x2[i16]: i16, i16 | x0, x1); + +// 64-bit wide types: + +simd_ty!(u8x8[u8]: + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3); +simd_ty!(u32x2[u32]: u32, u32 | x0, x1); +simd_ty!(u64x1[u64]: u64 | x1); + +simd_ty!(i8x8[i8]: + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3); +simd_ty!(i32x2[i32]: i32, i32 | x0, x1); +simd_ty!(i64x1[i64]: i64 | x1); + +simd_ty!(f32x2[f32]: f32, f32 | x0, x1); + +// 128-bit wide types: + +simd_ty!(u8x16[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u16x8[u16]: + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3); +simd_ty!(u64x2[u64]: u64, u64 | x0, x1); + +simd_ty!(i8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, 
x3, x4, x5, x6, x7); +simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_ty!(i64x2[i64]: i64, i64 | x0, x1); + +simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3); +simd_ty!(f64x2[f64]: f64, f64 | x0, x1); + +simd_m_ty!(m8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_m_ty!(m16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1); + +// 256-bit wide types: + +simd_ty!(u8x32[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(u16x16[u16]: + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u32x8[u32]: + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3); + +simd_ty!(i8x32[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(i16x16[i16]: + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i32x8[i32]: + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); + +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + +// 512-bit wide types: + +simd_ty!(i32x16[i32]: + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + +simd_ty!(i64x8[i64]: + i64, i64, i64, i64, i64, i64, i64, i64 + | x0, x1, x2, x3, x4, x5, x6, x7); + +#[allow(unused)] +#[macro_export] +macro_rules! 
constify_imm8 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) & 0b1111_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + 31 => $expand!(31), + 32 => $expand!(32), + 33 => $expand!(33), + 34 => $expand!(34), + 35 => $expand!(35), + 36 => $expand!(36), + 37 => $expand!(37), + 38 => $expand!(38), + 39 => $expand!(39), + 40 => $expand!(40), + 41 => $expand!(41), + 42 => $expand!(42), + 43 => $expand!(43), + 44 => $expand!(44), + 45 => $expand!(45), + 46 => $expand!(46), + 47 => $expand!(47), + 48 => $expand!(48), + 49 => $expand!(49), + 50 => $expand!(50), + 51 => $expand!(51), + 52 => $expand!(52), + 53 => $expand!(53), + 54 => $expand!(54), + 55 => $expand!(55), + 56 => $expand!(56), + 57 => $expand!(57), + 58 => $expand!(58), + 59 => $expand!(59), + 60 => $expand!(60), + 61 => $expand!(61), + 62 => $expand!(62), + 63 => $expand!(63), + 64 => $expand!(64), + 65 => $expand!(65), + 66 => $expand!(66), + 67 => $expand!(67), + 68 => $expand!(68), + 69 => $expand!(69), + 70 => $expand!(70), + 71 => $expand!(71), + 72 => $expand!(72), + 73 => $expand!(73), + 74 => $expand!(74), + 75 => $expand!(75), + 76 => $expand!(76), + 77 => $expand!(77), + 78 => $expand!(78), + 79 => $expand!(79), + 80 => $expand!(80), + 81 => $expand!(81), + 82 => $expand!(82), + 83 => $expand!(83), + 84 => $expand!(84), + 85 => $expand!(85), + 86 => $expand!(86), + 87 => $expand!(87), + 88 => $expand!(88), + 89 => $expand!(89), + 90 => $expand!(90), + 91 => $expand!(91), + 92 => $expand!(92), + 93 => $expand!(93), + 94 => $expand!(94), + 95 => $expand!(95), + 96 => $expand!(96), + 97 => $expand!(97), + 98 => $expand!(98), + 99 => $expand!(99), + 100 => $expand!(100), + 101 => $expand!(101), + 102 => $expand!(102), + 103 => $expand!(103), + 104 => $expand!(104), + 105 => $expand!(105), + 106 => $expand!(106), + 107 => $expand!(107), + 108 => $expand!(108), + 109 => $expand!(109), + 110 => $expand!(110), + 111 => $expand!(111), + 112 => $expand!(112), + 113 => $expand!(113), + 114 => $expand!(114), + 115 => $expand!(115), + 116 => $expand!(116), + 117 => $expand!(117), + 118 => $expand!(118), + 119 => $expand!(119), + 120 => $expand!(120), + 121 => $expand!(121), + 122 => $expand!(122), + 123 => $expand!(123), + 124 => $expand!(124), + 125 => $expand!(125), + 126 => $expand!(126), + 127 => $expand!(127), + 128 => $expand!(128), + 129 => $expand!(129), + 130 => $expand!(130), + 131 => $expand!(131), + 132 => $expand!(132), + 133 => $expand!(133), + 134 => $expand!(134), + 135 => $expand!(135), + 136 => $expand!(136), + 137 => $expand!(137), + 138 => $expand!(138), + 139 => $expand!(139), + 140 => $expand!(140), + 141 => $expand!(141), + 142 => $expand!(142), + 143 => $expand!(143), + 144 => $expand!(144), + 145 => $expand!(145), + 146 => $expand!(146), + 147 => $expand!(147), + 148 => $expand!(148), + 149 => $expand!(149), + 150 => $expand!(150), + 151 => $expand!(151), + 152 => 
$expand!(152), + 153 => $expand!(153), + 154 => $expand!(154), + 155 => $expand!(155), + 156 => $expand!(156), + 157 => $expand!(157), + 158 => $expand!(158), + 159 => $expand!(159), + 160 => $expand!(160), + 161 => $expand!(161), + 162 => $expand!(162), + 163 => $expand!(163), + 164 => $expand!(164), + 165 => $expand!(165), + 166 => $expand!(166), + 167 => $expand!(167), + 168 => $expand!(168), + 169 => $expand!(169), + 170 => $expand!(170), + 171 => $expand!(171), + 172 => $expand!(172), + 173 => $expand!(173), + 174 => $expand!(174), + 175 => $expand!(175), + 176 => $expand!(176), + 177 => $expand!(177), + 178 => $expand!(178), + 179 => $expand!(179), + 180 => $expand!(180), + 181 => $expand!(181), + 182 => $expand!(182), + 183 => $expand!(183), + 184 => $expand!(184), + 185 => $expand!(185), + 186 => $expand!(186), + 187 => $expand!(187), + 188 => $expand!(188), + 189 => $expand!(189), + 190 => $expand!(190), + 191 => $expand!(191), + 192 => $expand!(192), + 193 => $expand!(193), + 194 => $expand!(194), + 195 => $expand!(195), + 196 => $expand!(196), + 197 => $expand!(197), + 198 => $expand!(198), + 199 => $expand!(199), + 200 => $expand!(200), + 201 => $expand!(201), + 202 => $expand!(202), + 203 => $expand!(203), + 204 => $expand!(204), + 205 => $expand!(205), + 206 => $expand!(206), + 207 => $expand!(207), + 208 => $expand!(208), + 209 => $expand!(209), + 210 => $expand!(210), + 211 => $expand!(211), + 212 => $expand!(212), + 213 => $expand!(213), + 214 => $expand!(214), + 215 => $expand!(215), + 216 => $expand!(216), + 217 => $expand!(217), + 218 => $expand!(218), + 219 => $expand!(219), + 220 => $expand!(220), + 221 => $expand!(221), + 222 => $expand!(222), + 223 => $expand!(223), + 224 => $expand!(224), + 225 => $expand!(225), + 226 => $expand!(226), + 227 => $expand!(227), + 228 => $expand!(228), + 229 => $expand!(229), + 230 => $expand!(230), + 231 => $expand!(231), + 232 => $expand!(232), + 233 => $expand!(233), + 234 => $expand!(234), + 235 => $expand!(235), + 236 => $expand!(236), + 237 => $expand!(237), + 238 => $expand!(238), + 239 => $expand!(239), + 240 => $expand!(240), + 241 => $expand!(241), + 242 => $expand!(242), + 243 => $expand!(243), + 244 => $expand!(244), + 245 => $expand!(245), + 246 => $expand!(246), + 247 => $expand!(247), + 248 => $expand!(248), + 249 => $expand!(249), + 250 => $expand!(250), + 251 => $expand!(251), + 252 => $expand!(252), + 253 => $expand!(253), + 254 => $expand!(254), + _ => $expand!(255), + } + }; +} diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs new file mode 100644 index 00000000..6e60b63c --- /dev/null +++ b/src/neon/simd_llvm.rs @@ -0,0 +1,54 @@ +extern "platform-intrinsic" { + pub fn simd_eq<T, U>(x: T, y: T) -> U; +// pub fn simd_ne<T, U>(x: T, y: T) -> U; + pub fn simd_lt<T, U>(x: T, y: T) -> U; + pub fn simd_le<T, U>(x: T, y: T) -> U; + pub fn simd_gt<T, U>(x: T, y: T) -> U; + pub fn simd_ge<T, U>(x: T, y: T) -> U; +// +// pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U; +// pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U; +// pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U; +// pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U; +// pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U; +// pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U; +// pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U; +// +// pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T; + pub fn simd_extract<T, U>(x: T, idx: u32) -> U; +// +// pub fn simd_cast<T, U>(x: T) -> U; +// + pub fn simd_add<T>(x: T, y: T) -> T; +// pub fn simd_sub<T>(x: T, y: T) -> T; +// pub fn
simd_mul<T>(x: T, y: T) -> T; +// pub fn simd_div<T>(x: T, y: T) -> T; +// pub fn simd_shl<T>(x: T, y: T) -> T; +// pub fn simd_shr<T>(x: T, y: T) -> T; + pub fn simd_and<T>(x: T, y: T) -> T; + pub fn simd_or<T>(x: T, y: T) -> T; + pub fn simd_xor<T>(x: T, y: T) -> T; +// +// pub fn simd_reduce_add_unordered<T, U>(x: T) -> U; +// pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U; +// pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U; +// pub fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U; +// pub fn simd_reduce_min<T, U>(x: T) -> U; +// pub fn simd_reduce_max<T, U>(x: T) -> U; +// pub fn simd_reduce_min_nanless<T, U>(x: T) -> U; +// pub fn simd_reduce_max_nanless<T, U>(x: T) -> U; +// pub fn simd_reduce_and<T, U>(x: T) -> U; +// pub fn simd_reduce_or<T, U>(x: T) -> U; +// pub fn simd_reduce_xor<T, U>(x: T) -> U; +// pub fn simd_reduce_all<T>(x: T) -> bool; +// pub fn simd_reduce_any<T>(x: T) -> bool; +// +// pub fn simd_select<M, T>(m: M, a: T, b: T) -> T; +// pub fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T; +// +// pub fn simd_fmin<T>(a: T, b: T) -> T; +// pub fn simd_fmax<T>(a: T, b: T) -> T; +// +// pub fn simd_fsqrt<T>(a: T) -> T; +// pub fn simd_fma<T>(a: T, b: T, c: T) -> T; +} \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs new file mode 100644 index 00000000..45322cee --- /dev/null +++ b/src/neon/stage1.rs @@ -0,0 +1,598 @@ +#![allow(dead_code)] + +use crate::neon::intrinsics::*; +use crate::neon::utf8check::*; +use crate::*; + +use std::mem; + +// NEON-SPECIFIC + +macro_rules! bit_mask { + () => { + uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub(crate) unsafe fn neon_movemask(input: uint8x16_t) -> u16 { + let minput: uint8x16_t = vandq_u8(input, bit_mask!()); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { + let bit_mask = bit_mask!(); + + let t0 = vandq_u8(p0, bit_mask); + let t1 = vandq_u8(p1, bit_mask); + let t2 = vandq_u8(p2, bit_mask); + let t3 = vandq_u8(p3, bit_mask); + let sum0 = vpaddq_u8(t0, t1); + let sum1 = vpaddq_u8(t2, t3); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) +} + +// /NEON-SPECIFIC + +pub const SIMDJSON_PADDING: usize = mem::size_of::<uint8x16_t>() * 4; + +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + vgetq_lane_u64( + vreinterpretq_u64_u8( + mem::transmute( + vmull_p64( + -1, + quote_bits as i64) + ) + ), + 0 + ) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(si: &SimdInput) -> bool { + let highbit: uint8x16_t = vdupq_n_u8(0x80); + let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); + let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); + let t3: uint8x16_t = vorrq_u8(t0, t1); + let t4: uint8x16_t = vandq_u8(t3, highbit); + + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); + + vget_lane_u64(result, 0) == 0 +} + +#[derive(Debug)] +struct SimdInput { + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, +} + +fn fill_input(ptr: &[u8]) -> SimdInput { + unsafe { + #[allow(clippy::cast_ptr_alignment)] + SimdInput { + v0: vld1q_u8(ptr.as_ptr() as *const u8), + v1: vld1q_u8(ptr.as_ptr().add(16) as
*const u8), + v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), + v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), + } + } +} + +struct Utf8CheckingState { + has_error: int8x16_t, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: vdupq_n_s8(0), + previous: ProcessedUtfBytes::default(), + } + } +} + +#[inline] +fn is_utf8_status_ok(has_error: int8x16_t) -> bool { + unsafe { + let has_error_128 : i128 = mem::transmute(has_error); + + has_error_128 == 0 + } +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_utf8( + input: &SimdInput, + state: &mut Utf8CheckingState, +) { + if check_ascii(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let verror: int8x16_t = int8x16_t::new( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + ); + state.has_error = vreinterpretq_s8_u8(vorrq_u8( + vcgtq_s8( + state.previous.carried_continuations, + verror, + ), + vreinterpretq_u8_s8(state.has_error)), + ); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), &mut state.previous, &mut state.has_error); + } +} + +// a straightforward comparison of a mask against input +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { + unsafe { + let mask: uint8x16_t = vmovq_n_u8(m); + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); + + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// find all values less than or equal than the content of maxval (using unsigned arithmetic) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn unsigned_lteq_against_input(input: &SimdInput, maxval: uint8x16_t) -> u64 { + unsafe { + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, maxval); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, maxval); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, maxval); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, maxval); + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. 
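+// A small worked example: for the input `_\"` (a backslash at bit 1, a quote
+// at bit 2) bs_bits is 0b010; the backslash run starting at bit 1 has odd
+// length, so odd_ends comes back as 0b100 and the quote at bit 2 is treated
+// as escaped. For `_\\"` (bs_bits = 0b0110) the run has even length,
+// odd_ends is 0, and the quote keeps its structural meaning.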
+#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { + const EVEN_BITS: u64 = 0x5555_5555_5555_5555; + const ODD_BITS: u64 = !EVEN_BITS; + + let bs_bits: u64 = cmp_mask_against_input(&input, b'\\'); + let start_edges: u64 = bs_bits & !(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; + let even_starts: u64 = start_edges & even_start_mask; + let odd_starts: u64 = start_edges & !even_start_mask; + let even_carries: u64 = bs_bits.wrapping_add(even_starts); + + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); + + odd_carries |= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; + let even_carry_ends: u64 = even_carries & !bs_bits; + let odd_carry_ends: u64 = odd_carries & !bs_bits; + let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; + let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; + let odd_ends: u64 = even_start_odd_end | odd_start_even_end; + odd_ends +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote in an unescaped quote pair and everything in the quote pair) and the +// quote bits, which are the simple unescaped quoted bits. +// +// We also update the prev_iter_inside_quote value to tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of whether we're inside quotes for the next iteration. +// +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_quote_mask_and_bits( + input: &SimdInput, + odd_ends: u64, + prev_iter_inside_quote: &mut u64, + quote_bits: &mut u64, + error_mask: &mut u64, +) -> u64 { + *quote_bits = cmp_mask_against_input(&input, b'"'); + *quote_bits &= !odd_ends; + // remove from the valid quoted region the unescapted characters. + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + + quote_mask ^= *prev_iter_inside_quote; + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). + // https://tools.ietf.org/html/rfc8259 + let unescaped: u64 = unsigned_lteq_against_input(input, vmovq_n_u8(0x1F)); + *error_mask |= quote_mask & unescaped; + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. 
says this is fine code + *prev_iter_inside_quote = static_cast_u64!(static_cast_i64!(quote_mask) >> 63); + quote_mask +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_whitespace_and_structurals( + input: &SimdInput, + whitespace: &mut u64, + structurals: &mut u64, +) { + // do a 'shufti' to detect structural JSON characters + // they are + // * `{` 0x7b + // * `}` 0x7d + // * `:` 0x3a + // * `[` 0x5b + // * `]` 0x5d + // * `,` 0x2c + // these go into the first 3 buckets of the comparison (1/2/4) + + // we are also interested in the four whitespace characters: + // * space 0x20 + // * linefeed 0x0a + // * horizontal tab 0x09 + // * carriage return 0x0d + // these go into the next 2 buckets of the comparison (8/16) + + // TODO: const? + let low_nibble_mask: uint8x16_t = uint8x16_t::new( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + ); + // TODO: const? + let high_nibble_mask: uint8x16_t = uint8x16_t::new( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + ); + + let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); + let whitespace_shufti_mask: uint8x16_t = vmovq_n_u8(0x18); + let low_nib_and_mask: uint8x16_t = vmovq_n_u8(0xf); + + let nib_0_lo: uint8x16_t = vandq_u8(input.v0, low_nib_and_mask); + let nib_0_hi: uint8x16_t = vshrq_n_u8(input.v0, 4); + let shuf_0_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_0_lo); + let shuf_0_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_0_hi); + let v_0: uint8x16_t = vandq_u8(shuf_0_lo, shuf_0_hi); + + let nib_1_lo: uint8x16_t = vandq_u8(input.v1, low_nib_and_mask); + let nib_1_hi: uint8x16_t = vshrq_n_u8(input.v1, 4); + let shuf_1_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_1_lo); + let shuf_1_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_1_hi); + let v_1: uint8x16_t = vandq_u8(shuf_1_lo, shuf_1_hi); + + let nib_2_lo: uint8x16_t = vandq_u8(input.v2, low_nib_and_mask); + let nib_2_hi: uint8x16_t = vshrq_n_u8(input.v2, 4); + let shuf_2_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_2_lo); + let shuf_2_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_2_hi); + let v_2: uint8x16_t = vandq_u8(shuf_2_lo, shuf_2_hi); + + let nib_3_lo: uint8x16_t = vandq_u8(input.v3, low_nib_and_mask); + let nib_3_hi: uint8x16_t = vshrq_n_u8(input.v3, 4); + let shuf_3_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_3_lo); + let shuf_3_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_3_hi); + let v_3: uint8x16_t = vandq_u8(shuf_3_lo, shuf_3_hi); + + let tmp_0: uint8x16_t = vtstq_u8(v_0, structural_shufti_mask); + let tmp_1: uint8x16_t = vtstq_u8(v_1, structural_shufti_mask); + let tmp_2: uint8x16_t = vtstq_u8(v_2, structural_shufti_mask); + let tmp_3: uint8x16_t = vtstq_u8(v_3, structural_shufti_mask); + *structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + + let tmp_ws_v0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); + let tmp_ws_v1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); + let tmp_ws_v2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); + let tmp_ws_v3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); + *whitespace = neon_movemask_bulk(tmp_ws_v0, tmp_ws_v1, tmp_ws_v2, tmp_ws_v3); +} + +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +//TODO: usize was u32 here does this matter? 
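+// Worked example (added for illustration): with idx = 64 and bits = 0b1001,
+// cnt is 2, so the length grows by two; the single SIMD store below writes
+// the lanes 0, 3, 64, 64, and only the first two land inside the new length -
+// the trailing lanes fall into the extra space reserved right below.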
+// flatten out values in 'bits' assuming that they are to have values of idx
+// plus their position in the bitvector, and store these indexes at
+// base_ptr[base] incrementing base as we go
+// will potentially store extra values beyond end of valid bits, so base_ptr
+// needs to be large enough to handle this
+// TODO: usize was u32 here - does this matter?
+#[cfg_attr(not(feature = "no-inline"), inline(always))]
+fn flatten_bits(base: &mut Vec<u32>, idx: u32, mut bits: u64) {
+    let cnt: usize = bits.count_ones() as usize;
+    let mut l = base.len();
+    let idx_minus_64 = idx.wrapping_sub(64);
+    let idx_64_v = unsafe {
+        int32x4_t::new(
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+        )
+    };
+
+    // We're doing some trickery here.
+    // We reserve 64 extra entries, since we have at most 64 bits to set,
+    // then we set the length to the final value (calculated above) up front.
+    // We later write indiscriminately past the old length, but that's OK
+    // since we made sure to reserve the needed space.
+    base.reserve(64);
+    unsafe {
+        base.set_len(l + cnt);
+    }
+
+    while bits != 0 {
+        unsafe {
+            let v0 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v1 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v2 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v3 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+
+            let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3);
+            let v: int32x4_t = vaddq_s32(idx_64_v, v);
+            #[allow(clippy::cast_ptr_alignment)]
+            std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v);
+        }
+        l += 4;
+    }
+}
+
+// return an updated structural bit vector with quoted contents cleared out and
+// pseudo-structural characters added to the mask
+// updates prev_iter_ends_pseudo_pred which tells us whether the previous
+// iteration ended on a whitespace or a structural character (which means that
+// the next iteration will have a pseudo-structural character at its start)
+#[cfg_attr(not(feature = "no-inline"), inline(always))]
+fn finalize_structurals(
+    mut structurals: u64,
+    whitespace: u64,
+    quote_mask: u64,
+    quote_bits: u64,
+    prev_iter_ends_pseudo_pred: &mut u64,
+) -> u64 {
+    // mask off anything inside quotes
+    structurals &= !quote_mask;
+    // add the real quote bits back into our bitmask as well, so we can
+    // quickly traverse the strings we've spent all this trouble gathering
+    structurals |= quote_bits;
+    // Now, establish "pseudo-structural characters". These are non-whitespace
+    // characters that are (a) outside quotes and (b) have a predecessor that's
+    // either whitespace or a structural character. This means that subsequent
+    // passes will get a chance to encounter the first character of every string
+    // of non-whitespace and, if we're parsing an atom like true/false/null or a
+    // number we can stop at the first whitespace or structural character
+    // following it.
+
+    // a qualified predecessor is something that can happen one position before
+    // a pseudo-structural character
+    let pseudo_pred: u64 = structurals | whitespace;
+
+    let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred;
+    *prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
+    let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask);
+    structurals |= pseudo_structurals;
+
+    // now, we've used our close quotes all we need to, so let's switch them
+    // off: they will be off in the quote mask and on in the quote bits.
+    structurals &= !(quote_bits & !quote_mask);
+    structurals
+}
+
+pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStringHelper {
+    let quote_mask = vmovq_n_u8(b'"');
+    let bs_mask = vmovq_n_u8(b'\\');
+    let bit_mask = bit_mask!();
+
+    let cmp_bs_0: uint8x16_t = vceqq_u8(v0, bs_mask);
+    let cmp_bs_1: uint8x16_t = vceqq_u8(v1, bs_mask);
+    let cmp_qt_0: uint8x16_t = vceqq_u8(v0, quote_mask);
+    let cmp_qt_1: uint8x16_t = vceqq_u8(v1, quote_mask);
+
+    let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask);
+    let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask);
+    let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask);
+    let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask);
+
+    let sum0: uint8x16_t = vpaddq_u8(cmp_bs_0, cmp_bs_1);
+    let sum1: uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1);
+    let sum0 = vpaddq_u8(sum0, sum1);
+    let sum0 = vpaddq_u8(sum0, sum0);
+
+    ParseStringHelper {
+        bs_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) },
+        quote_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) },
+    }
+}
+
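NEON has no direct equivalent of SSE's movemask, so find_bs_bits_and_quote_bits masks each byte-wise comparison down to one bit per byte (bit_mask!) and folds neighbouring bytes together with three rounds of vpaddq_u8 until the 32 backslash bits and 32 quote bits land in the first two u32 lanes. Stated as a scalar contract (hypothetical helper, not part of the patch):

    // What the SIMD code computes: bit i of bs_bits (resp. quote_bits) is
    // set iff byte i of the 32-byte block is a backslash (resp. a quote).
    fn bs_and_quote_bits_scalar(block: &[u8; 32]) -> (u32, u32) {
        let mut bs_bits = 0u32;
        let mut quote_bits = 0u32;
        for (i, &b) in block.iter().enumerate() {
            if b == b'\\' {
                bs_bits |= 1 << i;
            }
            if b == b'"' {
                quote_bits |= 1 << i;
            }
        }
        (bs_bits, quote_bits)
    }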
+impl<'de> Deserializer<'de> {
+    //#[inline(never)]
+    pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result<Vec<u32>, ErrorType> {
+        let len = input.len();
+        // 6 is a heuristic: it turns out that a ratio of roughly one structural
+        // character per 6 input bytes almost never leads to reallocations.
+        let mut structural_indexes = Vec::with_capacity(len / 6);
+        structural_indexes.push(0); // push extra root element
+
+        let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default();
+
+        // we have padded the input out to a 64 byte multiple with the remainder
+        // being zeros
+
+        // persistent state across loop
+        // does the last iteration end with an odd-length sequence of backslashes?
+        // either 0 or 1, but a 64-bit value
+        let mut prev_iter_ends_odd_backslash: u64 = 0;
+        // does the previous iteration end inside a double-quote pair?
+        // either all zeros or all ones
+        let mut prev_iter_inside_quote: u64 = 0;
+        // does the previous iteration end on something that is a predecessor of a
+        // pseudo-structural character - i.e. whitespace or a structural character;
+        // effectively the very first char is considered to follow "whitespace" for
+        // the purposes of pseudo-structural character detection, so we initialize
+        // to 1
+        let mut prev_iter_ends_pseudo_pred: u64 = 1;
+
+        // structurals are persistent state across the loop, as we flatten them on
+        // the subsequent iteration into our array pointed to by base_ptr.
+        // This is harmless on the first iteration as structurals == 0
+        // and is done for performance reasons; we can hide some of the latency of
+        // the expensive carryless multiply in the previous step with this work
+        let mut structurals: u64 = 0;
+
+        let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
+        let mut idx: usize = 0;
+        // for unescaped characters within strings (ASCII code points < 0x20)
+        let mut error_mask: u64 = 0;
+
+        while idx < lenminus64 {
+            /*
+            #ifndef _MSC_VER
+              __builtin_prefetch(buf + idx + 128);
+            #endif
+            */
+            let input: SimdInput = fill_input(input.get_unchecked(idx..));
+            check_utf8(&input, &mut utf8_state);
+            // detect odd sequences of backslashes
+            let odd_ends: u64 =
+                find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash);
+
+            // detect insides of quote pairs ("quote_mask") and also our quote_bits
+            // themselves
+            let mut quote_bits: u64 = 0;
+            let quote_mask: u64 = find_quote_mask_and_bits(
+                &input,
+                odd_ends,
+                &mut prev_iter_inside_quote,
+                &mut quote_bits,
+                &mut error_mask,
+            );
+
+            // take the previous iteration's structural bits, not our current
+            // iteration's, and flatten
+            flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+            let mut whitespace: u64 = 0;
+            find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals);
+
+            // fixup structurals to reflect quotes and add pseudo-structural characters
+            structurals = finalize_structurals(
+                structurals,
+                whitespace,
+                quote_mask,
+                quote_bits,
+                &mut prev_iter_ends_pseudo_pred,
+            );
+            idx += 64;
+        }
+
+        // we use a giant copy-paste which is ugly,
+        // but otherwise the string needs to be properly padded or else we
+        // risk invalidating the UTF-8 checks.
+        if idx < len {
+            let mut tmpbuf: [u8; 64] = [0x20; 64];
+            tmpbuf
+                .as_mut_ptr()
+                .copy_from(input.as_ptr().add(idx), len - idx);
+            let input: SimdInput = fill_input(&tmpbuf);
+
+            check_utf8(&input, &mut utf8_state);
+
+            // detect odd sequences of backslashes
+            let odd_ends: u64 =
+                find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash);
+
+            // detect insides of quote pairs ("quote_mask") and also our quote_bits
+            // themselves
+            let mut quote_bits: u64 = 0;
+            let quote_mask: u64 = find_quote_mask_and_bits(
+                &input,
+                odd_ends,
+                &mut prev_iter_inside_quote,
+                &mut quote_bits,
+                &mut error_mask,
+            );
+
+            // take the previous iteration's structural bits, not our current
+            // iteration's, and flatten
+            flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+            let mut whitespace: u64 = 0;
+            find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals);
+
+            // fixup structurals to reflect quotes and add pseudo-structural characters
+            structurals = finalize_structurals(
+                structurals,
+                whitespace,
+                quote_mask,
+                quote_bits,
+                &mut prev_iter_ends_pseudo_pred,
+            );
+            idx += 64;
+        }
+        // This check isn't in upstream: an input that ends inside an
+        // unterminated string is not flagged by the error mask, so we catch
+        // it here.
+        if prev_iter_inside_quote != 0 {
+            return Err(ErrorType::Syntax);
+        }
+        // finally, flatten out the remaining structurals from the last iteration
+        flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+        // a valid JSON file cannot have zero structural indexes - we should have
+        // found something (note that we compare to 1 as we always add the root!)
+ if structural_indexes.len() == 1 { + return Err(ErrorType::EOF); + } + + if structural_indexes.last() > Some(&(len as u32)) { + return Err(ErrorType::InternalError); + } + + if error_mask != 0 { + return Err(ErrorType::Syntax); + } + + if is_utf8_status_ok(utf8_state.has_error) { + Ok(structural_indexes) + } else { + Err(ErrorType::InvalidUTF8) + } + } +} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs new file mode 100644 index 00000000..082183b1 --- /dev/null +++ b/src/neon/utf8check.rs @@ -0,0 +1,253 @@ +use crate::neon::intrinsics::*; + +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +/*****************************/ +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_byte_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 1) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_2bytes_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 2) + } +} + +// all byte values must be no larger than 0xF4 +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { + // unsigned, saturates to 0 below max + *has_error = unsafe { + vorrq_s8( + *has_error, + vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */)) + ) + }; +} + +macro_rules! nibbles_tbl { + () => { + int8x16_t::new( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + unsafe { + vqtbl1q_s8( + nibbles_tbl!(), + vreinterpretq_u8_s8(high_nibbles), + ) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + unsafe { + let right1: int8x16_t = vqsubq_s8( + push_last_byte_of_a_to_b(previous_carries, initial_lengths), + vdupq_n_s8(1), + ); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); + let right2: int8x16_t = vqsubq_s8( + push_last_2bytes_of_a_to_b(previous_carries, sum), + vdupq_n_s8(2), + ); + vaddq_s8(sum, right2) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + { + let overunder: uint8x16_t = vceqq_u8( + vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0)), + ); + + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); + } +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_first_continuation_max( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + has_error: &mut int8x16_t, +) { 
+ { + let mask_ed: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-19 /* 0xED */), + ); + let mask_f4: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-12 /* 0xF4 */), + ); + + let badfollow_ed: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), + mask_ed, + ); + let badfollow_f4: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), + mask_f4, + ); + + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4)), + ); + } +} + +macro_rules! initial_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; +} + +macro_rules! second_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_overlong( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + hibits: int8x16_t, + previous_hibits: int8x16_t, + has_error: &mut int8x16_t, +) { + unsafe { + let off1_hibits: int8x16_t = push_last_byte_of_a_to_b(previous_hibits, hibits); + let initial_mins: int8x16_t = vqtbl1q_s8( + initial_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + + let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + + let second_mins: int8x16_t = vqtbl1q_s8( + second_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) + ); + } +} + +pub struct ProcessedUtfBytes { + rawbytes: int8x16_t, + high_nibbles: int8x16_t, + pub carried_continuations: int8x16_t, +} + +impl Default for ProcessedUtfBytes { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + ProcessedUtfBytes { + rawbytes: vdupq_n_s8(0x00), + high_nibbles: vdupq_n_s8(0x00), + carried_continuations: vdupq_n_s8(0x00), + } + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { + answer.rawbytes = bytes; + answer.high_nibbles = unsafe { + vandq_s8( + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)), + vmovq_n_s8(0x0F) + ) + }; +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +#[cfg_attr(not(feature = "no-inline"), inline)] +pub fn check_utf8_bytes( + current_bytes: int8x16_t, + previous: &mut ProcessedUtfBytes, + has_error: &mut int8x16_t, +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + count_nibbles(current_bytes, &mut pb); + + check_smaller_than_0xf4(current_bytes, has_error); + + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); + + pb.carried_continuations = + carry_continuations(initial_lengths, previous.carried_continuations); + + check_continuations(initial_lengths, pb.carried_continuations, has_error); + + let off1_current_bytes: int8x16_t = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); + + check_overlong( + 
+        current_bytes,
+        off1_current_bytes,
+        pb.high_nibbles,
+        previous.high_nibbles,
+        has_error,
+    );
+    pb
+}
diff --git a/src/numberparse.rs b/src/numberparse.rs
index 03880d56..01af460e 100644
--- a/src/numberparse.rs
+++ b/src/numberparse.rs
@@ -1,6 +1,7 @@
 use crate::charutils::*;
 use crate::unlikely;
 use crate::*;
+
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
@@ -133,6 +134,7 @@ pub enum Number {
 }
 
 #[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
     unsafe {
         // this actually computes *16* values so we are being wasteful.
@@ -143,7 +145,7 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
         let mul_1_10000: __m128i = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
         // We know what we're doing right? :P
         #[allow(clippy::cast_ptr_alignment)]
-        let input: __m128i = _mm_sub_epi8(
+        let input: __m128i = _mm_sub_epi8(
            _mm_loadu_si128(chars.get_unchecked(0..16).as_ptr() as *const __m128i),
             ascii0,
         );
@@ -155,6 +157,17 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
     }
 }
 
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_feature = "neon")]
+fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
+    // the equivalent of memcpy(&val, chars, sizeof(u64)): an explicit
+    // unaligned read, since `chars` carries no alignment guarantee
+    let val: u64 = unsafe { std::ptr::read_unaligned(chars.as_ptr() as *const u64) };
+    let val = (val & 0x0F0F_0F0F_0F0F_0F0F).wrapping_mul(2561) >> 8;
+    let val = (val & 0x00FF_00FF_00FF_00FF).wrapping_mul(6_553_601) >> 16;
+    ((val & 0x0000_FFFF_0000_FFFF).wrapping_mul(42_949_672_960_001) >> 32) as u32
+}
+
 impl<'de> Deserializer<'de> {
     /// called by parse_number when we know that the output is a float,
     /// but where there might be some integer overflow. The trick here is to
@@ -215,7 +228,7 @@ impl<'de> Deserializer<'de> {
             digit = unsafe { *p.get_unchecked(digitcount) } - b'0';
             digitcount += 1;
             fraction_weight *= 10.0;
-            fraction += f64::from(digit) / fraction_weight;;
+            fraction += f64::from(digit) / fraction_weight;
         }
         i += fraction;
     }
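The NEON variant of parse_eight_digits_unrolled is a SWAR (SIMD-within-a-register) trick: after masking the ASCII digits down to 0..9, each multiply-and-shift merges adjacent digit groups — pairs, then 4-digit groups, then the two halves. A standalone version with a worked check (hypothetical helper name, not part of the patch; u64::from_le_bytes makes the unaligned little-endian load explicit):

    fn parse_eight_digits_swar(chars: &[u8]) -> u32 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&chars[..8]);
        let val = u64::from_le_bytes(raw);
        // 2561 = 256*10 + 1: merges digit pairs into 2-digit numbers
        let val = (val & 0x0F0F_0F0F_0F0F_0F0F).wrapping_mul(2561) >> 8;
        // 6553601 = 65536*100 + 1: merges 2-digit pairs into 4-digit numbers
        let val = (val & 0x00FF_00FF_00FF_00FF).wrapping_mul(6_553_601) >> 16;
        // 42949672960001 = 2^32*10000 + 1: merges the two 4-digit halves
        ((val & 0x0000_FFFF_0000_FFFF).wrapping_mul(42_949_672_960_001) >> 32) as u32
    }

    // parse_eight_digits_swar(b"12345678") == 12_345_678

Note the trick assumes a little-endian byte order, which holds on aarch64 (and x86) targets this crate supports.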
diff --git a/src/portability.rs b/src/portability.rs
new file mode 100644
index 00000000..69481a85
--- /dev/null
+++ b/src/portability.rs
@@ -0,0 +1,32 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool {
+    unsafe { _addcarry_u64(0, value1, value2, result) != 0 }
+}
+
+//TODO: static?
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn hamming(input_num: u64) -> u32 {
+    unsafe { _popcnt64(input_num as i64) as u32 }
+}
+
+// 32-bit x86 variant: count each half separately; the cfg must be
+// target_arch = "x86", or this would collide with the definition above
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86")]
+pub fn hamming(input_num: u64) -> u32 {
+    unsafe { (_popcnt32(input_num as i32) + _popcnt32((input_num >> 32) as i32)) as u32 }
+}
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn trailingzeroes(input_num: u64) -> u32 {
+    unsafe { _tzcnt_u64(input_num) as u32 }
+}
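These wrappers are only defined for x86 targets, yet the NEON code path also needs them. One possible set of fallbacks (a sketch under that assumption, not part of the patch) is to lean on Rust's built-in integer intrinsics, which lower to POPCNT/TZCNT-class instructions where the hardware has them:

    // Possible portable fallbacks for non-x86 targets (illustrative only).
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    pub fn hamming(input_num: u64) -> u32 {
        input_num.count_ones()
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    pub fn trailingzeroes(input_num: u64) -> u32 {
        input_num.trailing_zeros()
    }

    #[cfg(not(target_arch = "x86_64"))]
    pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool {
        let (res, overflowed) = value1.overflowing_add(value2);
        *result = res;
        overflowed
    }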
diff --git a/src/sse42/generator.rs b/src/sse42/generator.rs
new file mode 100644
index 00000000..e6636585
--- /dev/null
+++ b/src/sse42/generator.rs
@@ -0,0 +1,51 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+use crate::value::generator::ESCAPED;
+use std::io;
+
+#[inline(always)]
+pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write {
+    let zero = _mm_set1_epi8(0);
+    let lower_quote_range = _mm_set1_epi8(0x1F as i8);
+    let quote = _mm_set1_epi8(b'"' as i8);
+    let backslash = _mm_set1_epi8(b'\\' as i8);
+    while *len - *idx > 16 {
+        // Load 16 bytes of data;
+        #[allow(clippy::cast_ptr_alignment)]
+        let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i);
+        // Test the data against being backslash and quote.
+        let bs_or_quote = _mm_or_si128(
+            _mm_cmpeq_epi8(data, backslash),
+            _mm_cmpeq_epi8(data, quote)
+        );
+        // Now mask the data with the quote range (0x1F).
+        let in_quote_range = _mm_and_si128(data, lower_quote_range);
+        // then test if the data is unchanged, i.e. XOR it with the
+        // masked value; any byte that was inside the quote range will
+        // be zero now.
+        let is_unchanged = _mm_xor_si128(data, in_quote_range);
+        let in_range = _mm_cmpeq_epi8(is_unchanged, zero);
+        let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range));
+        if quote_bits != 0 {
+            let quote_dist = quote_bits.trailing_zeros() as usize;
+            stry!(writer.write_all(&string[0..*idx + quote_dist]));
+            let ch = string[*idx + quote_dist];
+            match ESCAPED[ch as usize] {
+                b'u' => stry!(write!(writer, "\\u{:04x}", ch)),
+
+                escape => stry!(writer.write_all(&[b'\\', escape])),
+            };
+            *string = &string[*idx + quote_dist + 1..];
+            *idx = 0;
+            *len = string.len();
+        } else {
+            *idx += 16;
+        }
+    }
+    stry!(writer.write_all(&string[0..*idx]));
+    *string = &string[*idx..];
+    Ok(())
+}
diff --git a/src/sse42/mod.rs b/src/sse42/mod.rs
index 30c55c86..ac608ae2 100644
--- a/src/sse42/mod.rs
+++ b/src/sse42/mod.rs
@@ -1,3 +1,4 @@
 pub mod deser;
 pub mod stage1;
-pub mod utf8check;
\ No newline at end of file
+pub mod utf8check;
+pub mod generator;
\ No newline at end of file
diff --git a/src/stage2.rs b/src/stage2.rs
index aaa51ad8..3bbd71c1 100644
--- a/src/stage2.rs
+++ b/src/stage2.rs
@@ -1,9 +1,11 @@
 #![allow(dead_code)]
+use crate::charutils::*;
 #[cfg(target_feature = "avx2")]
 use crate::avx2::stage1::SIMDJSON_PADDING;
-use crate::charutils::*;
-#[cfg(not(target_feature = "avx2"))]
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))]
 use crate::sse42::stage1::SIMDJSON_PADDING;
+#[cfg(target_feature = "neon")]
+use crate::neon::stage1::SIMDJSON_PADDING;
 use crate::{Deserializer, Error, ErrorType, Result};
 
 #[cfg_attr(not(feature = "no-inline"), inline(always))]
diff --git a/src/stringparse.rs b/src/stringparse.rs
index 0ff8a078..9a7e3b8f 100644
--- a/src/stringparse.rs
+++ b/src/stringparse.rs
@@ -73,3 +73,9 @@ pub fn handle_unicode_codepoint(
     let offset: usize = codepoint_to_utf8(code_point, dst_ptr);
     Ok((offset, src_offset))
 }
+
+// Holds the locations of backslashes and quotes within a block.
+pub struct ParseStringHelper {
+    pub bs_bits: u32,
+    pub quote_bits: u32,
+}
diff --git a/src/value.rs b/src/value.rs
index 06245878..c5d5aa87 100644
--- a/src/value.rs
+++ b/src/value.rs
@@ -11,7 +11,7 @@
 /// we do not require prior knowledge sbout string comtent to to take advantage
 /// of it.
pub mod borrowed; -mod generator; +pub(crate) mod generator; pub mod owned; pub use self::borrowed::{to_value as to_borrowed_value, Value as BorrowedValue}; diff --git a/src/value/generator.rs b/src/value/generator.rs index 55824f57..ebdb0a88 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -5,19 +5,21 @@ // https://github.com/maciejhirsz/json-rust/blob/master/src/codegen.rs use crate::value::ValueTrait; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; use std::io; use std::io::Write; use std::marker::PhantomData; use std::ptr; +use crate::*; + #[cfg(target_feature = "avx2")] -const AVX2_PRESENT : bool = true; -#[cfg(not(target_feature = "avx2"))] -const AVX2_PRESENT : bool = false; +use crate::avx2::generator::*; + +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] +use crate::sse42::generator::*; + +#[cfg(target_feature = "neon")] +use crate::neon::generator::*; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -30,7 +32,7 @@ const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string -static ESCAPED: [u8; 256] = [ +pub(crate) static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 @@ -106,85 +108,7 @@ pub trait BaseGenerator { // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. - if AVX2_PRESENT { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while len - idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 32; - } - } - } - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while len - idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..idx])); - string = &string[idx..]; + stry!(write_str_simd(self.get_writer(), &mut string, &mut len, &mut idx)); } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() {