From fa26882132e42154357b5d1450e3a3074b820788 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Wed, 7 Aug 2019 22:47:11 -0400 Subject: [PATCH 01/34] feat: neon support (mostly broken) --- Cargo.toml | 1 + src/lib.rs | 2187 ++++++++++++++++++++-------------------- src/neon/deser.rs | 204 ++++ src/neon/intrinsics.rs | 287 ++++++ src/neon/mod.rs | 6 + src/neon/simd.rs | 469 +++++++++ src/neon/simd_llvm.rs | 54 + src/neon/stage1.rs | 560 ++++++++++ src/neon/utf8check.rs | 247 +++++ src/numberparse.rs | 30 +- src/portability.rs | 30 + src/stage2.rs | 6 +- src/value/generator.rs | 156 +-- 13 files changed, 3049 insertions(+), 1188 deletions(-) create mode 100644 src/neon/deser.rs create mode 100644 src/neon/intrinsics.rs create mode 100644 src/neon/mod.rs create mode 100644 src/neon/simd.rs create mode 100644 src/neon/simd_llvm.rs create mode 100644 src/neon/stage1.rs create mode 100644 src/neon/utf8check.rs create mode 100644 src/portability.rs diff --git a/Cargo.toml b/Cargo.toml index 399f7c9d..cae31f52 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ halfbrown = "0.1" page_size = "0.4" itoa = "0.4" ryu = "1" +core_arch = "0.1.5" # serde compatibilty serde = { version = "1", features = ["derive"], optional = true} diff --git a/src/lib.rs b/src/lib.rs index 02038553..3973b7a5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,14 @@ #![deny(warnings)] +#![feature(asm)] +#![feature(stdsimd)] +#![feature(repr_simd)] +#![feature(custom_inner_attributes)] +#![feature(aarch64_target_feature)] +#![feature(platform_intrinsics)] +#![feature(stmt_expr_attributes)] +#![feature(simd_ffi)] +#![feature(link_llvm_intrinsics)] + #![cfg_attr(feature = "hints", feature(core_intrinsics))] //! simdjson-rs is a rust port of the simejson c++ library. It follows //! most of the design closely with a few exceptions to make it better @@ -69,16 +79,16 @@ //! let v: Value = simd_json::serde::from_slice(&mut d).unwrap(); //! 
``` -#[cfg(feature = "serde_impl")] -extern crate serde as serde_ext; -#[cfg(feature = "serde_impl")] -pub mod serde; +//#[cfg(feature = "serde_impl")] +//extern crate serde as serde_ext; +//#[cfg(feature = "serde_impl")] +//pub mod serde; mod charutils; #[macro_use] mod macros; mod error; -mod numberparse; +//mod numberparse; mod parsedjson; mod stringparse; @@ -89,22 +99,29 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(not(target_feature = "avx2"))] +#[cfg(target_feature = "sse42")] mod sse42; -#[cfg(not(target_feature = "avx2"))] +#[cfg(target_feature = "sse42")] pub use crate::sse42::deser::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(target_feature = "sse42")] use crate::sse42::stage1::SIMDJSON_PADDING; -mod stage2; -pub mod value; +//#[cfg(target_feature = "neon")] +mod neon; +//#[cfg(target_feature = "neon")] +//pub use crate::neon::deser::*; +//#[cfg(target_feature = "neon")] +//use crate::neon::stage1::SIMDJSON_PADDING; + +//mod stage2; +//pub mod value; -use crate::numberparse::Number; -use std::mem; -use std::str; +//use crate::numberparse::Number; +//use std::mem; +//use std::str; pub use crate::error::{Error, ErrorType}; -pub use crate::value::*; +//pub use crate::value::*; pub type Result = std::result::Result; @@ -120,1074 +137,1074 @@ pub struct Deserializer<'de> { str_offset: usize, iidx: usize, } - -impl<'de> Deserializer<'de> { - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn error(&self, error: ErrorType) -> Error { - Error::new(self.idx, self.iidx, self.c() as char, error) - } - // By convention, `Deserializer` constructors are named like `from_xyz`. - // That way basic use cases are satisfied by something like - // `serde_json::from_str(...)` while advanced use cases that require a - // deserializer can make one with `serde_json::Deserializer::from_str(...)`. 
- pub fn from_slice(input: &'de mut [u8]) -> Result { - // We have to pick an initial size of the structural indexes. - // 6 is a heuristic that seems to work well for the benchmark - // data and limit re-allocation frequency. - - let len = input.len(); - - let buf_start: usize = input.as_ptr() as *const () as usize; - let needs_relocation = (buf_start + input.len()) % page_size::get() < SIMDJSON_PADDING; - - let s1_result: std::result::Result, ErrorType> = if needs_relocation { - let mut data: Vec = Vec::with_capacity(len + SIMDJSON_PADDING); - unsafe { - data.set_len(len + 1); - data.as_mut_slice() - .get_unchecked_mut(0..len) - .clone_from_slice(input); - *(data.get_unchecked_mut(len)) = 0; - data.set_len(len); - Deserializer::find_structural_bits(&data) - } - } else { - unsafe { Deserializer::find_structural_bits(input) } - }; - let structural_indexes = match s1_result { - Ok(i) => i, - Err(t) => { - return Err(Error::generic(t)); - } - }; - - let counts = Deserializer::validate(input, &structural_indexes)?; - - let strings = Vec::with_capacity(len + SIMDJSON_PADDING); - - Ok(Deserializer { - counts, - structural_indexes, - input, - idx: 0, - strings, - str_offset: 0, - iidx: 0, - }) - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn skip(&mut self) { - self.idx += 1; - self.iidx = unsafe { *self.structural_indexes.get_unchecked(self.idx) as usize }; - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn c(&self) -> u8 { - unsafe { - *self - .input - .get_unchecked(*self.structural_indexes.get_unchecked(self.idx) as usize) - } - } - - // pull out the check so we don't need to - // stry every time - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn next_(&mut self) -> u8 { - unsafe { - self.idx += 1; - self.iidx = *self.structural_indexes.get_unchecked(self.idx) as usize; - *self.input.get_unchecked(self.iidx) - } - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn count_elements(&self) -> usize { - 
unsafe { *self.counts.get_unchecked(self.idx) } - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn parse_number_root(&mut self, minus: bool) -> Result { - let input = unsafe { &self.input.get_unchecked(self.iidx..) }; - let len = input.len(); - let mut copy = vec![0u8; len + SIMDJSON_PADDING]; - copy[len] = 0; - unsafe { - copy.as_mut_ptr().copy_from(input.as_ptr(), len); - }; - self.parse_number_int(©, minus) - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn parse_number(&mut self, minus: bool) -> Result { - let input = unsafe { &self.input.get_unchecked(self.iidx..) }; - let len = input.len(); - if len < SIMDJSON_PADDING { - let mut copy = vec![0u8; len + SIMDJSON_PADDING]; - unsafe { - copy.as_mut_ptr().copy_from(input.as_ptr(), len); - }; - self.parse_number_int(©, minus) - } else { - self.parse_number_int(input, minus) - } - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - fn parse_number_(&mut self, minus: bool) -> Result { - let input = unsafe { &self.input.get_unchecked(self.iidx..) 
}; - self.parse_number_int(input, minus) - } -} - -#[cfg(test)] -mod tests { - use super::serde::from_slice; - use super::{ - owned::to_value, owned::Map, owned::Value, to_borrowed_value, to_owned_value, Deserializer, - }; - use halfbrown::HashMap; - use proptest::prelude::*; - use serde::Deserialize; - use serde_json; - - #[test] - fn count1() { - let mut d = String::from("[]"); - let mut d = unsafe { d.as_bytes_mut() }; - let simd = Deserializer::from_slice(&mut d).expect(""); - assert_eq!(simd.counts[1], 0); - } - - #[test] - fn count2() { - let mut d = String::from("[1]"); - let mut d = unsafe { d.as_bytes_mut() }; - let simd = Deserializer::from_slice(&mut d).expect(""); - assert_eq!(simd.counts[1], 1); - } - - #[test] - fn count3() { - let mut d = String::from("[1,2]"); - let mut d = unsafe { d.as_bytes_mut() }; - let simd = Deserializer::from_slice(&mut d).expect(""); - assert_eq!(simd.counts[1], 2); - } - - #[test] - fn count4() { - let mut d = String::from(" [ 1 , [ 3 ] , 2 ]"); - let mut d = unsafe { d.as_bytes_mut() }; - let simd = Deserializer::from_slice(&mut d).expect(""); - assert_eq!(simd.counts[1], 3); - assert_eq!(simd.counts[4], 1); - } - - #[test] - fn count5() { - let mut d = String::from("[[],null,null]"); - let mut d = unsafe { d.as_bytes_mut() }; - let simd = Deserializer::from_slice(&mut d).expect(""); - assert_eq!(simd.counts[1], 3); - assert_eq!(simd.counts[2], 0); - } - - #[test] - fn empty() { - let mut d = String::from(""); - let mut d = unsafe { d.as_bytes_mut() }; - let v_simd = from_slice::(&mut d); - let v_serde = serde_json::from_slice::(d); - assert!(v_simd.is_err()); - assert!(v_serde.is_err()); - } - - #[test] - fn bool_true() { - let mut d = String::from("true"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - 
assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(true))); - } - - #[test] - fn bool_false() { - let mut d = String::from("false"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(false))); - //assert!(false) - } - - #[test] - fn union() { - let mut d = String::from("null"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::Null)); - } - - #[test] - fn int() { - let mut d = String::from("42"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(42))); - } - - #[test] - fn zero() { - let mut d = String::from("0"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(0))); - } - - #[test] - fn one() { - let mut d = String::from("1"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = 
from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(1))); - } - - #[test] - fn minus_one() { - let mut d = String::from("-1"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(-1))); - } - - #[test] - fn float() { - let mut d = String::from("23.0"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!(to_value(&mut d1), Ok(Value::from(23.0))); - } - - #[test] - fn string() { - let mut d = String::from(r#""snot""#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(to_value(&mut d1), Ok(Value::from("snot"))); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn lonely_quote() { - let mut d = String::from(r#"""#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde = serde_json::from_slice::(d).is_err(); - let v_simd = from_slice::(&mut d).is_err(); - assert!(v_simd); - assert!(v_serde); - } - - #[test] - fn lonely_quote1() { - let mut d = String::from(r#"["]"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde = serde_json::from_slice::(d).is_err(); - let v_simd = from_slice::(&mut d).is_err(); - assert!(v_simd); - assert!(v_serde); - } - #[test] - fn lonely_quote2() { - let mut d = String::from(r#"[1, "]"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde = 
serde_json::from_slice::(d).is_err(); - let v_simd = from_slice::(&mut d).is_err(); - assert!(v_simd); - assert!(v_serde); - } - - #[test] - fn lonely_quote3() { - let mut d = String::from(r#"{": 1}"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde = serde_json::from_slice::(d).is_err(); - let v_simd = from_slice::(&mut d).is_err(); - assert!(v_simd); - assert!(v_serde); - } - - #[test] - fn empty_string() { - let mut d = String::from(r#""""#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(to_value(&mut d1), Ok(Value::from(""))); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn empty_array() { - let mut d = String::from(r#"[]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); - assert_eq!(to_value(&mut d1), Ok(Value::Array(vec![]))); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn malformed_array() { - let mut d = String::from(r#"[["#); - let mut d1 = d.clone(); - let mut d2 = d.clone(); - let mut d = unsafe { d.as_bytes_mut() }; - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d2 = unsafe { d2.as_bytes_mut() }; - let v_serde: Result = serde_json::from_slice(d); - let v_simd_ov = to_owned_value(&mut d); - let v_simd_bv = to_borrowed_value(&mut d1); - let v_simd: Result = from_slice(&mut d2); - assert!(v_simd_ov.is_err()); - assert!(v_simd_bv.is_err()); - assert!(v_simd.is_err()); - assert!(v_serde.is_err()); - } - - #[test] - fn double_array() { - let mut d = String::from(r#"[[]]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let 
v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::Array(vec![])])) - ); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn null_null_array() { - let mut d = String::from(r#"[[],null,null]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![ - Value::Array(vec![]), - Value::Null, - Value::Null, - ])) - ); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn one_element_array() { - let mut d = String::from(r#"["snot"]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::from("snot")])) - ); - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn two_element_array() { - let mut d = String::from(r#"["snot", "badger"]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![ - Value::from("snot"), - Value::from("badger") - ])) - ); - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn list() { - let mut d = String::from(r#"[42, 23.0, "snot badger"]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = 
serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![ - Value::from(42), - Value::from(23.0), - Value::from("snot badger") - ])) - ); - } - - #[test] - fn nested_list1() { - let mut d = String::from(r#"[42, [23.0, "snot"], "bad", "ger"]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![ - Value::from(42), - Value::Array(vec![Value::from(23.0), Value::from("snot")]), - Value::from("bad"), - Value::from("ger") - ])) - ); - - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn nested_list2() { - let mut d = String::from(r#"[42, [23.0, "snot"], {"bad": "ger"}]"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn utf8() { - let mut d = String::from(r#""\u000e""#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, "\u{e}"); - // NOTE: serde is broken for this - //assert_eq!(v_serde, "\u{e}"); - //assert_eq!(v_simd, v_serde) - } - - #[test] - fn unicode() { - let mut d = String::from(r#""¡\"""#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn odd_array() { - let mut d = String::from("[{},null]"); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: 
serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::Object(Map::new()), Value::Null])) - ); - } - - #[test] - fn map2() { - let mut d = String::from(r#"[{"\u0000":null}]"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn null() { - let mut d = String::from(r#"null"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!(to_value(&mut d1), Ok(Value::Null)); - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - #[test] - fn null_null() { - let mut d = String::from(r#"[null, null]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::Null, Value::Null,])) - ); - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn nested_null() { - let mut d = String::from(r#"[[null, null]]"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::Array(vec![ - Value::Null, - Value::Null, - ])])) - ); - - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - } - - #[test] - fn nestednested_null() { - let mut d = String::from(r#"[[[null, null]]]"#); - let 
mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - assert_eq!( - to_value(&mut d1), - Ok(Value::Array(vec![Value::Array(vec![Value::Array(vec![ - Value::Null, - Value::Null, - ])])])) - ); - } - - #[test] - fn odd_array2() { - let mut d = String::from("[[\"\\u0000\\\"\"]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn odd_array3() { - let mut d = String::from("[{\"\\u0000\\u0000\":null}]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn odd_array4() { - let mut d = String::from("[{\"\\u0000𐀀a\":null}]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn float1() { - let mut d = String::from("2.3250706903316115e307"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - // We ignore this since serde is less percise on this test - #[ignore] - #[test] - fn float2() { - let mut d = String::from("-4.5512678569607477e306"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); - 
assert_eq!(v_simd, v_serde) - } - - #[test] - fn map0() { - let mut d = String::from(r#"{"snot": "badger"}"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - let mut h = Map::new(); - h.insert("snot".into(), Value::from("badger")); - assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); - } - - #[test] - fn map1() { - let mut d = String::from(r#"{"snot": "badger", "badger": "snot"}"#); - let mut d1 = d.clone(); - let mut d1 = unsafe { d1.as_bytes_mut() }; - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); - let v_simd: serde_json::Value = from_slice(&mut d).expect(""); - assert_eq!(v_simd, v_serde); - let mut h = Map::new(); - h.insert("snot".into(), Value::from("badger")); - h.insert("badger".into(), Value::from("snot")); - assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); - } - - #[test] - fn tpl1() { - let mut d = String::from("[-65.613616999999977, 43.420273000000009]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: (f32, f32) = serde_json::from_slice(d).expect("serde_json"); - let v_simd: (f32, f32) = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn tpl2() { - let mut d = String::from("[[-65.613616999999977, 43.420273000000009]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn tpl3() { - let mut d = String::from( - "[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]", - ); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec<(f32, f32)> = 
serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - #[test] - fn tpl4() { - let mut d = String::from("[[[-65.613616999999977,43.420273000000009]]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - #[test] - fn tpl5() { - let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn tpl6() { - let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn tpl7() { - let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[derive(Deserialize, PartialEq, Debug)] - struct Obj { - a: u64, - b: u64, - } - - #[derive(Deserialize, PartialEq, Debug)] - struct Obj1 { - a: Obj, - } - - #[test] - fn obj() { - let mut d = String::from(r#"{"a": 1, "b":1}"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Obj = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Obj = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn obj2() { - let mut 
d = - String::from(r#"{"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); - let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn obj3() { - let mut d = String::from( - r#"{"c": {"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}}"#, - ); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: HashMap> = - serde_json::from_slice(d).expect("serde_json"); - let v_simd: HashMap> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn obj4() { - let mut d = String::from(r#"{"c": {"a": {"a": 1, "b":1}}}"#); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); - let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn vecvec() { - let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]], [[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); - let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn crazy_string() { - // there is unicode in here! 
- let d = "\"𐀀𐀀 𐀀𐀀0 𐀀A\\u00000A0 A \\u000b\""; - let mut d = String::from(d); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); - let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - #[test] - fn event() { - #[derive(Deserialize, Debug, PartialEq)] - #[serde(deny_unknown_fields, rename_all = "camelCase")] - pub struct CitmCatalog { - pub area_names: HashMap, - pub audience_sub_category_names: HashMap, - pub block_names: HashMap, - pub events: HashMap, - } - pub type Id = u32; - #[derive(Deserialize, Debug, PartialEq)] - #[serde(deny_unknown_fields, rename_all = "camelCase")] - pub struct Event { - pub description: (), - pub id: Id, - pub logo: Option, - pub name: String, - pub sub_topic_ids: Vec, - pub subject_code: (), - pub subtitle: (), - pub topic_ids: Vec, - } - - let mut d = String::from( - r#" -{ - "areaNames": { - "205705993": "Arrière-scène central", - "205705994": "1er balcon central", - "205705995": "2ème balcon bergerie cour", - "205705996": "2ème balcon bergerie jardin", - "205705998": "1er balcon bergerie jardin", - "205705999": "1er balcon bergerie cour", - "205706000": "Arrière-scène jardin", - "205706001": "Arrière-scène cour", - "205706002": "2ème balcon jardin", - "205706003": "2ème balcon cour", - "205706004": "2ème Balcon central", - "205706005": "1er balcon jardin", - "205706006": "1er balcon cour", - "205706007": "Orchestre central", - "205706008": "Orchestre jardin", - "205706009": "Orchestre cour", - "342752287": "Zone physique secrète" - }, - "audienceSubCategoryNames": { - "337100890": "Abonné" - }, - "blockNames": {}, - "events": { - "138586341": { - "description": null, - "id": 138586341, - "logo": null, - "name": "30th Anniversary Tour", - "subTopicIds": [ - 337184269, - 337184283 - ], - "subjectCode": null, - "subtitle": null, - "topicIds": [ - 324846099, - 107888604 - ] - }, - "138586345": { - 
"description": null, - "id": 138586345, - "logo": "/images/UE0AAAAACEKo6QAAAAZDSVRN", - "name": "Berliner Philharmoniker", - "subTopicIds": [ - 337184268, - 337184283, - 337184275 - ], - "subjectCode": null, - "subtitle": null, - "topicIds": [ - 324846099, - 107888604, - 324846100 - ] - } - } -} -"#, - ); - let mut d = unsafe { d.as_bytes_mut() }; - let v_serde: CitmCatalog = serde_json::from_slice(d).expect("serde_json"); - let v_simd: CitmCatalog = from_slice(&mut d).expect("simd_json"); - assert_eq!(v_simd, v_serde) - } - - // How much do we care about this, it's within the same range and - // based on floating point math inprecisions during parsing. - // Is this a real issue worth improving? - #[test] - fn silly_float1() { - let v = Value::from(3.0901448042322017e305); - let s = v.to_string(); - dbg!(&s); - let mut bytes = s.as_bytes().to_vec(); - let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); - assert_eq!(v, parsed); - } - - #[test] - #[ignore] - fn silly_float2() { - let v = Value::from(-6.990585694841803e305); - let s = v.to_string(); - dbg!(&s); - let mut bytes = s.as_bytes().to_vec(); - let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); - assert_eq!(v, parsed); - } - - //6.576692109929364e305 - fn arb_json() -> BoxedStrategy { - let leaf = prop_oneof![ - Just(Value::Null), - any::().prop_map(Value::Bool), - // (-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // The float parsing of simd and serde are too different - any::().prop_map(|i| json!(i)), - ".*".prop_map(Value::from), - ]; - leaf.prop_recursive( - 8, // 8 levels deep - 256, // Shoot for maximum size of 256 nodes - 10, // We put up to 10 items per collection - |inner| { - prop_oneof![ - // Take the inner strategy and make the two recursive cases. 
- prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), - prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), - ] - }, - ) - .prop_map(|v| serde_json::to_string(&v).expect("").to_string()) - .boxed() - } - - fn arb_json_value() -> BoxedStrategy { - let leaf = prop_oneof![ - Just(Value::Null), - any::().prop_map(Value::Bool), - //(-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // damn you float! - any::().prop_map(|i| json!(i)), - ".*".prop_map(Value::from), - ]; - leaf.prop_recursive( - 8, // 8 levels deep - 256, // Shoot for maximum size of 256 nodes - 10, // We put up to 10 items per collection - |inner| { - prop_oneof![ - // Take the inner strategy and make the two recursive cases. - prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), - prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), - ] - }, - ) - .boxed() - } - - proptest! { - #![proptest_config(ProptestConfig { - // Setting both fork and timeout is redundant since timeout implies - // fork, but both are shown for clarity. - fork: true, - .. ProptestConfig::default() - })] - - #[test] - fn prop_json_encode_decode(val in arb_json_value()) { - let mut encoded: Vec = Vec::new(); - let _ = val.write(&mut encoded); - println!("{}", String::from_utf8(encoded.clone()).unwrap()); - let res = to_owned_value(&mut encoded).unwrap(); - assert_eq!(val, res); - } - - } - proptest! { - #![proptest_config(ProptestConfig { - // Setting both fork and timeout is redundant since timeout implies - // fork, but both are shown for clarity. - fork: true, - .. ProptestConfig::default() - })] - - #[test] - fn prop_json(d in arb_json()) { - if let Ok(v_serde) = serde_json::from_slice::(&d.as_bytes()) { - let mut d1 = d.clone(); - let d1 = unsafe{ d1.as_bytes_mut()}; - let v_simd_serde: serde_json::Value = from_slice(d1).expect(""); - // We add our own encoder in here. 
- let mut d2 = v_simd_serde.to_string(); - let d2 = unsafe{ d2.as_bytes_mut()}; - let mut d3 = d.clone(); - let d3 = unsafe{ d3.as_bytes_mut()}; - assert_eq!(v_simd_serde, v_serde); - let v_simd_owned = to_owned_value(d2); - assert!(v_simd_owned.is_ok()); - let v_simd_borrowed = to_borrowed_value(d3); - dbg!(&v_simd_borrowed); - assert!(v_simd_borrowed.is_ok()); - assert_eq!(v_simd_owned.unwrap(), super::OwnedValue::from(v_simd_borrowed.unwrap())); - } - - } - - } - - fn arb_junk() -> BoxedStrategy> { - prop::collection::vec(any::(), 0..(1024 * 8)).boxed() - } - proptest! { - #![proptest_config(ProptestConfig { - // Setting both fork and timeout is redundant since timeout implies - // fork, but both are shown for clarity. - fork: true, - .. ProptestConfig::default() - })] - #[test] - fn prop_junk(d in arb_junk()) { - let mut d1 = d.clone(); - let mut d2 = d.clone(); - let mut d3 = d.clone(); - - let _ = from_slice::(&mut d1); - let _ = to_borrowed_value(&mut d2); - let _ = to_owned_value(&mut d3); - - } - } - - proptest! { - #![proptest_config(ProptestConfig { - // Setting both fork and timeout is redundant since timeout implies - // fork, but both are shown for clarity. - fork: true, - .. ProptestConfig::default() - })] - - #[test] - fn prop_string(d in "\\PC*") { - let mut d1 = d.clone(); - let mut d1 = unsafe{ d1.as_bytes_mut()}; - let mut d2 = d.clone(); - let mut d2 = unsafe{ d2.as_bytes_mut()}; - let mut d3 = d.clone(); - let mut d3 = unsafe{ d3.as_bytes_mut()}; - let _ = from_slice::(&mut d1); - let _ = to_borrowed_value(&mut d2); - let _ = to_owned_value(&mut d3); - - } - } -} +// +//impl<'de> Deserializer<'de> { +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn error(&self, error: ErrorType) -> Error { +// Error::new(self.idx, self.iidx, self.c() as char, error) +// } +// // By convention, `Deserializer` constructors are named like `from_xyz`. 
+// // That way basic use cases are satisfied by something like +// // `serde_json::from_str(...)` while advanced use cases that require a +// // deserializer can make one with `serde_json::Deserializer::from_str(...)`. +// pub fn from_slice(input: &'de mut [u8]) -> Result { +// // We have to pick an initial size of the structural indexes. +// // 6 is a heuristic that seems to work well for the benchmark +// // data and limit re-allocation frequency. +// +// let len = input.len(); +// +// let buf_start: usize = input.as_ptr() as *const () as usize; +// let needs_relocation = (buf_start + input.len()) % page_size::get() < SIMDJSON_PADDING; +// +// let s1_result: std::result::Result, ErrorType> = if needs_relocation { +// let mut data: Vec = Vec::with_capacity(len + SIMDJSON_PADDING); +// unsafe { +// data.set_len(len + 1); +// data.as_mut_slice() +// .get_unchecked_mut(0..len) +// .clone_from_slice(input); +// *(data.get_unchecked_mut(len)) = 0; +// data.set_len(len); +// Deserializer::find_structural_bits(&data) +// } +// } else { +// unsafe { Deserializer::find_structural_bits(input) } +// }; +// let structural_indexes = match s1_result { +// Ok(i) => i, +// Err(t) => { +// return Err(Error::generic(t)); +// } +// }; +// +// let counts = Deserializer::validate(input, &structural_indexes)?; +// +// let strings = Vec::with_capacity(len + SIMDJSON_PADDING); +// +// Ok(Deserializer { +// counts, +// structural_indexes, +// input, +// idx: 0, +// strings, +// str_offset: 0, +// iidx: 0, +// }) +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn skip(&mut self) { +// self.idx += 1; +// self.iidx = unsafe { *self.structural_indexes.get_unchecked(self.idx) as usize }; +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn c(&self) -> u8 { +// unsafe { +// *self +// .input +// .get_unchecked(*self.structural_indexes.get_unchecked(self.idx) as usize) +// } +// } +// +// // pull out the check so we don't need to +// // stry 
every time +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn next_(&mut self) -> u8 { +// unsafe { +// self.idx += 1; +// self.iidx = *self.structural_indexes.get_unchecked(self.idx) as usize; +// *self.input.get_unchecked(self.iidx) +// } +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn count_elements(&self) -> usize { +// unsafe { *self.counts.get_unchecked(self.idx) } +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn parse_number_root(&mut self, minus: bool) -> Result { +// let input = unsafe { &self.input.get_unchecked(self.iidx..) }; +// let len = input.len(); +// let mut copy = vec![0u8; len + SIMDJSON_PADDING]; +// copy[len] = 0; +// unsafe { +// copy.as_mut_ptr().copy_from(input.as_ptr(), len); +// }; +// self.parse_number_int(©, minus) +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn parse_number(&mut self, minus: bool) -> Result { +// let input = unsafe { &self.input.get_unchecked(self.iidx..) }; +// let len = input.len(); +// if len < SIMDJSON_PADDING { +// let mut copy = vec![0u8; len + SIMDJSON_PADDING]; +// unsafe { +// copy.as_mut_ptr().copy_from(input.as_ptr(), len); +// }; +// self.parse_number_int(©, minus) +// } else { +// self.parse_number_int(input, minus) +// } +// } +// +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// fn parse_number_(&mut self, minus: bool) -> Result { +// let input = unsafe { &self.input.get_unchecked(self.iidx..) 
}; +// self.parse_number_int(input, minus) +// } +//} +// +//#[cfg(test)] +//mod tests { +// use super::serde::from_slice; +// use super::{ +// owned::to_value, owned::Map, owned::Value, to_borrowed_value, to_owned_value, Deserializer, +// }; +// use halfbrown::HashMap; +// use proptest::prelude::*; +// use serde::Deserialize; +// use serde_json; +// +// #[test] +// fn count1() { +// let mut d = String::from("[]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let simd = Deserializer::from_slice(&mut d).expect(""); +// assert_eq!(simd.counts[1], 0); +// } +// +// #[test] +// fn count2() { +// let mut d = String::from("[1]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let simd = Deserializer::from_slice(&mut d).expect(""); +// assert_eq!(simd.counts[1], 1); +// } +// +// #[test] +// fn count3() { +// let mut d = String::from("[1,2]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let simd = Deserializer::from_slice(&mut d).expect(""); +// assert_eq!(simd.counts[1], 2); +// } +// +// #[test] +// fn count4() { +// let mut d = String::from(" [ 1 , [ 3 ] , 2 ]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let simd = Deserializer::from_slice(&mut d).expect(""); +// assert_eq!(simd.counts[1], 3); +// assert_eq!(simd.counts[4], 1); +// } +// +// #[test] +// fn count5() { +// let mut d = String::from("[[],null,null]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let simd = Deserializer::from_slice(&mut d).expect(""); +// assert_eq!(simd.counts[1], 3); +// assert_eq!(simd.counts[2], 0); +// } +// +// #[test] +// fn empty() { +// let mut d = String::from(""); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_simd = from_slice::(&mut d); +// let v_serde = serde_json::from_slice::(d); +// assert!(v_simd.is_err()); +// assert!(v_serde.is_err()); +// } +// +// #[test] +// fn bool_true() { +// let mut d = String::from("true"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// +// 
let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(true))); +// } +// +// #[test] +// fn bool_false() { +// let mut d = String::from("false"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(false))); +// //assert!(false) +// } +// +// #[test] +// fn union() { +// let mut d = String::from("null"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::Null)); +// } +// +// #[test] +// fn int() { +// let mut d = String::from("42"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(42))); +// } +// +// #[test] +// fn zero() { +// let mut d = String::from("0"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(0))); +// } +// +// #[test] +// fn one() { +// 
let mut d = String::from("1"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(1))); +// } +// +// #[test] +// fn minus_one() { +// let mut d = String::from("-1"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(-1))); +// } +// +// #[test] +// fn float() { +// let mut d = String::from("23.0"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!(to_value(&mut d1), Ok(Value::from(23.0))); +// } +// +// #[test] +// fn string() { +// let mut d = String::from(r#""snot""#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(to_value(&mut d1), Ok(Value::from("snot"))); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn lonely_quote() { +// let mut d = String::from(r#"""#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde = serde_json::from_slice::(d).is_err(); +// let v_simd = from_slice::(&mut d).is_err(); +// assert!(v_simd); +// assert!(v_serde); +// } +// +// #[test] +// fn lonely_quote1() 
{ +// let mut d = String::from(r#"["]"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde = serde_json::from_slice::(d).is_err(); +// let v_simd = from_slice::(&mut d).is_err(); +// assert!(v_simd); +// assert!(v_serde); +// } +// #[test] +// fn lonely_quote2() { +// let mut d = String::from(r#"[1, "]"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde = serde_json::from_slice::(d).is_err(); +// let v_simd = from_slice::(&mut d).is_err(); +// assert!(v_simd); +// assert!(v_serde); +// } +// +// #[test] +// fn lonely_quote3() { +// let mut d = String::from(r#"{": 1}"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde = serde_json::from_slice::(d).is_err(); +// let v_simd = from_slice::(&mut d).is_err(); +// assert!(v_simd); +// assert!(v_serde); +// } +// +// #[test] +// fn empty_string() { +// let mut d = String::from(r#""""#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(to_value(&mut d1), Ok(Value::from(""))); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn empty_array() { +// let mut d = String::from(r#"[]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); +// assert_eq!(to_value(&mut d1), Ok(Value::Array(vec![]))); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn malformed_array() { +// let mut d = String::from(r#"[["#); +// let mut d1 = d.clone(); +// let mut d2 = d.clone(); +// let mut d = unsafe { d.as_bytes_mut() }; +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d2 = unsafe { d2.as_bytes_mut() }; +// let v_serde: Result = 
serde_json::from_slice(d); +// let v_simd_ov = to_owned_value(&mut d); +// let v_simd_bv = to_borrowed_value(&mut d1); +// let v_simd: Result = from_slice(&mut d2); +// assert!(v_simd_ov.is_err()); +// assert!(v_simd_bv.is_err()); +// assert!(v_simd.is_err()); +// assert!(v_serde.is_err()); +// } +// +// #[test] +// fn double_array() { +// let mut d = String::from(r#"[[]]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![Value::Array(vec![])])) +// ); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn null_null_array() { +// let mut d = String::from(r#"[[],null,null]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![ +// Value::Array(vec![]), +// Value::Null, +// Value::Null, +// ])) +// ); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn one_element_array() { +// let mut d = String::from(r#"["snot"]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![Value::from("snot")])) +// ); +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn two_element_array() { +// let mut d = String::from(r#"["snot", "badger"]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { 
d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![ +// Value::from("snot"), +// Value::from("badger") +// ])) +// ); +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn list() { +// let mut d = String::from(r#"[42, 23.0, "snot badger"]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![ +// Value::from(42), +// Value::from(23.0), +// Value::from("snot badger") +// ])) +// ); +// } +// +// #[test] +// fn nested_list1() { +// let mut d = String::from(r#"[42, [23.0, "snot"], "bad", "ger"]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![ +// Value::from(42), +// Value::Array(vec![Value::from(23.0), Value::from("snot")]), +// Value::from("bad"), +// Value::from("ger") +// ])) +// ); +// +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn nested_list2() { +// let mut d = String::from(r#"[42, [23.0, "snot"], {"bad": "ger"}]"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn utf8() { +// let mut d = String::from(r#""\u000e""#); +// let mut d = unsafe { 
d.as_bytes_mut() }; +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, "\u{e}"); +// // NOTE: serde is broken for this +// //assert_eq!(v_serde, "\u{e}"); +// //assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn unicode() { +// let mut d = String::from(r#""¡\"""#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn odd_array() { +// let mut d = String::from("[{},null]"); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![Value::Object(Map::new()), Value::Null])) +// ); +// } +// +// #[test] +// fn map2() { +// let mut d = String::from(r#"[{"\u0000":null}]"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn null() { +// let mut d = String::from(r#"null"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!(to_value(&mut d1), Ok(Value::Null)); +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// #[test] +// fn null_null() { +// let mut d = String::from(r#"[null, null]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!( +// 
to_value(&mut d1), +// Ok(Value::Array(vec![Value::Null, Value::Null,])) +// ); +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn nested_null() { +// let mut d = String::from(r#"[[null, null]]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![Value::Array(vec![ +// Value::Null, +// Value::Null, +// ])])) +// ); +// +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// } +// +// #[test] +// fn nestednested_null() { +// let mut d = String::from(r#"[[[null, null]]]"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// assert_eq!( +// to_value(&mut d1), +// Ok(Value::Array(vec![Value::Array(vec![Value::Array(vec![ +// Value::Null, +// Value::Null, +// ])])])) +// ); +// } +// +// #[test] +// fn odd_array2() { +// let mut d = String::from("[[\"\\u0000\\\"\"]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn odd_array3() { +// let mut d = String::from("[{\"\\u0000\\u0000\":null}]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] 
+// fn odd_array4() { +// let mut d = String::from("[{\"\\u0000𐀀a\":null}]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn float1() { +// let mut d = String::from("2.3250706903316115e307"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// // We ignore this since serde is less percise on this test +// #[ignore] +// #[test] +// fn float2() { +// let mut d = String::from("-4.5512678569607477e306"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn map0() { +// let mut d = String::from(r#"{"snot": "badger"}"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// let mut h = Map::new(); +// h.insert("snot".into(), Value::from("badger")); +// assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); +// } +// +// #[test] +// fn map1() { +// let mut d = String::from(r#"{"snot": "badger", "badger": "snot"}"#); +// let mut d1 = d.clone(); +// let mut d1 = unsafe { d1.as_bytes_mut() }; +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); +// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); +// assert_eq!(v_simd, v_serde); +// let mut h = 
Map::new(); +// h.insert("snot".into(), Value::from("badger")); +// h.insert("badger".into(), Value::from("snot")); +// assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); +// } +// +// #[test] +// fn tpl1() { +// let mut d = String::from("[-65.613616999999977, 43.420273000000009]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: (f32, f32) = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: (f32, f32) = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn tpl2() { +// let mut d = String::from("[[-65.613616999999977, 43.420273000000009]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn tpl3() { +// let mut d = String::from( +// "[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]", +// ); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// #[test] +// fn tpl4() { +// let mut d = String::from("[[[-65.613616999999977,43.420273000000009]]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// #[test] +// fn tpl5() { +// let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn tpl6() { +// let mut 
d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn tpl7() { +// let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[derive(Deserialize, PartialEq, Debug)] +// struct Obj { +// a: u64, +// b: u64, +// } +// +// #[derive(Deserialize, PartialEq, Debug)] +// struct Obj1 { +// a: Obj, +// } +// +// #[test] +// fn obj() { +// let mut d = String::from(r#"{"a": 1, "b":1}"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Obj = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Obj = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn obj2() { +// let mut d = +// String::from(r#"{"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn obj3() { +// let mut d = String::from( +// r#"{"c": {"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}}"#, +// ); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: HashMap> = +// serde_json::from_slice(d).expect("serde_json"); +// let v_simd: HashMap> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn obj4() { +// let mut d = 
String::from(r#"{"c": {"a": {"a": 1, "b":1}}}"#); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn vecvec() { +// let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]], [[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn crazy_string() { +// // there is unicode in here! +// let d = "\"𐀀𐀀 𐀀𐀀0 𐀀A\\u00000A0 A \\u000b\""; +// let mut d = String::from(d); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// #[test] +// fn event() { +// #[derive(Deserialize, Debug, PartialEq)] +// #[serde(deny_unknown_fields, rename_all = "camelCase")] +// pub struct CitmCatalog { +// pub area_names: HashMap, +// pub audience_sub_category_names: HashMap, +// pub block_names: HashMap, +// pub events: HashMap, +// } +// pub type Id = u32; +// #[derive(Deserialize, Debug, PartialEq)] +// #[serde(deny_unknown_fields, rename_all = "camelCase")] +// pub struct Event { +// pub description: (), +// pub id: Id, +// pub logo: Option, +// pub name: String, +// pub sub_topic_ids: Vec, +// pub subject_code: (), +// pub subtitle: (), +// pub topic_ids: Vec, +// } +// +// let mut d = String::from( +// r#" +//{ +// "areaNames": { +// "205705993": "Arrière-scène central", +// "205705994": "1er balcon central", +// "205705995": "2ème balcon bergerie cour", +// "205705996": "2ème balcon 
bergerie jardin", +// "205705998": "1er balcon bergerie jardin", +// "205705999": "1er balcon bergerie cour", +// "205706000": "Arrière-scène jardin", +// "205706001": "Arrière-scène cour", +// "205706002": "2ème balcon jardin", +// "205706003": "2ème balcon cour", +// "205706004": "2ème Balcon central", +// "205706005": "1er balcon jardin", +// "205706006": "1er balcon cour", +// "205706007": "Orchestre central", +// "205706008": "Orchestre jardin", +// "205706009": "Orchestre cour", +// "342752287": "Zone physique secrète" +// }, +// "audienceSubCategoryNames": { +// "337100890": "Abonné" +// }, +// "blockNames": {}, +// "events": { +// "138586341": { +// "description": null, +// "id": 138586341, +// "logo": null, +// "name": "30th Anniversary Tour", +// "subTopicIds": [ +// 337184269, +// 337184283 +// ], +// "subjectCode": null, +// "subtitle": null, +// "topicIds": [ +// 324846099, +// 107888604 +// ] +// }, +// "138586345": { +// "description": null, +// "id": 138586345, +// "logo": "/images/UE0AAAAACEKo6QAAAAZDSVRN", +// "name": "Berliner Philharmoniker", +// "subTopicIds": [ +// 337184268, +// 337184283, +// 337184275 +// ], +// "subjectCode": null, +// "subtitle": null, +// "topicIds": [ +// 324846099, +// 107888604, +// 324846100 +// ] +// } +// } +//} +//"#, +// ); +// let mut d = unsafe { d.as_bytes_mut() }; +// let v_serde: CitmCatalog = serde_json::from_slice(d).expect("serde_json"); +// let v_simd: CitmCatalog = from_slice(&mut d).expect("simd_json"); +// assert_eq!(v_simd, v_serde) +// } +// +// // How much do we care about this, it's within the same range and +// // based on floating point math inprecisions during parsing. +// // Is this a real issue worth improving? 
+// #[test] +// fn silly_float1() { +// let v = Value::from(3.0901448042322017e305); +// let s = v.to_string(); +// dbg!(&s); +// let mut bytes = s.as_bytes().to_vec(); +// let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); +// assert_eq!(v, parsed); +// } +// +// #[test] +// #[ignore] +// fn silly_float2() { +// let v = Value::from(-6.990585694841803e305); +// let s = v.to_string(); +// dbg!(&s); +// let mut bytes = s.as_bytes().to_vec(); +// let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); +// assert_eq!(v, parsed); +// } +// +// //6.576692109929364e305 +// fn arb_json() -> BoxedStrategy { +// let leaf = prop_oneof![ +// Just(Value::Null), +// any::().prop_map(Value::Bool), +// // (-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // The float parsing of simd and serde are too different +// any::().prop_map(|i| json!(i)), +// ".*".prop_map(Value::from), +// ]; +// leaf.prop_recursive( +// 8, // 8 levels deep +// 256, // Shoot for maximum size of 256 nodes +// 10, // We put up to 10 items per collection +// |inner| { +// prop_oneof![ +// // Take the inner strategy and make the two recursive cases. +// prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), +// prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), +// ] +// }, +// ) +// .prop_map(|v| serde_json::to_string(&v).expect("").to_string()) +// .boxed() +// } +// +// fn arb_json_value() -> BoxedStrategy { +// let leaf = prop_oneof![ +// Just(Value::Null), +// any::().prop_map(Value::Bool), +// //(-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // damn you float! +// any::().prop_map(|i| json!(i)), +// ".*".prop_map(Value::from), +// ]; +// leaf.prop_recursive( +// 8, // 8 levels deep +// 256, // Shoot for maximum size of 256 nodes +// 10, // We put up to 10 items per collection +// |inner| { +// prop_oneof![ +// // Take the inner strategy and make the two recursive cases. 
+// prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), +// prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), +// ] +// }, +// ) +// .boxed() +// } +// +// proptest! { +// #![proptest_config(ProptestConfig { +// // Setting both fork and timeout is redundant since timeout implies +// // fork, but both are shown for clarity. +// fork: true, +// .. ProptestConfig::default() +// })] +// +// #[test] +// fn prop_json_encode_decode(val in arb_json_value()) { +// let mut encoded: Vec = Vec::new(); +// let _ = val.write(&mut encoded); +// println!("{}", String::from_utf8(encoded.clone()).unwrap()); +// let res = to_owned_value(&mut encoded).unwrap(); +// assert_eq!(val, res); +// } +// +// } +// proptest! { +// #![proptest_config(ProptestConfig { +// // Setting both fork and timeout is redundant since timeout implies +// // fork, but both are shown for clarity. +// fork: true, +// .. ProptestConfig::default() +// })] +// +// #[test] +// fn prop_json(d in arb_json()) { +// if let Ok(v_serde) = serde_json::from_slice::(&d.as_bytes()) { +// let mut d1 = d.clone(); +// let d1 = unsafe{ d1.as_bytes_mut()}; +// let v_simd_serde: serde_json::Value = from_slice(d1).expect(""); +// // We add our own encoder in here. +// let mut d2 = v_simd_serde.to_string(); +// let d2 = unsafe{ d2.as_bytes_mut()}; +// let mut d3 = d.clone(); +// let d3 = unsafe{ d3.as_bytes_mut()}; +// assert_eq!(v_simd_serde, v_serde); +// let v_simd_owned = to_owned_value(d2); +// assert!(v_simd_owned.is_ok()); +// let v_simd_borrowed = to_borrowed_value(d3); +// dbg!(&v_simd_borrowed); +// assert!(v_simd_borrowed.is_ok()); +// assert_eq!(v_simd_owned.unwrap(), super::OwnedValue::from(v_simd_borrowed.unwrap())); +// } +// +// } +// +// } +// +// fn arb_junk() -> BoxedStrategy> { +// prop::collection::vec(any::(), 0..(1024 * 8)).boxed() +// } +// proptest! 
{ +// #![proptest_config(ProptestConfig { +// // Setting both fork and timeout is redundant since timeout implies +// // fork, but both are shown for clarity. +// fork: true, +// .. ProptestConfig::default() +// })] +// #[test] +// fn prop_junk(d in arb_junk()) { +// let mut d1 = d.clone(); +// let mut d2 = d.clone(); +// let mut d3 = d.clone(); +// +// let _ = from_slice::(&mut d1); +// let _ = to_borrowed_value(&mut d2); +// let _ = to_owned_value(&mut d3); +// +// } +// } +// +// proptest! { +// #![proptest_config(ProptestConfig { +// // Setting both fork and timeout is redundant since timeout implies +// // fork, but both are shown for clarity. +// fork: true, +// .. ProptestConfig::default() +// })] +// +// #[test] +// fn prop_string(d in "\\PC*") { +// let mut d1 = d.clone(); +// let mut d1 = unsafe{ d1.as_bytes_mut()}; +// let mut d2 = d.clone(); +// let mut d2 = unsafe{ d2.as_bytes_mut()}; +// let mut d3 = d.clone(); +// let mut d3 = unsafe{ d3.as_bytes_mut()}; +// let _ = from_slice::(&mut d1); +// let _ = to_borrowed_value(&mut d2); +// let _ = to_owned_value(&mut d3); +// +// } +// } +//} diff --git a/src/neon/deser.rs b/src/neon/deser.rs new file mode 100644 index 00000000..790eb1b6 --- /dev/null +++ b/src/neon/deser.rs @@ -0,0 +1,204 @@ +// +//use std::mem; +// +//pub use crate::error::{Error, ErrorType}; +//pub use crate::Deserializer; +//pub use crate::Result; +//pub use crate::neon::intrinsics::*; +////pub use crate::neon::utf8check::*; +//pub use crate::stringparse::*; +// +////use crate::portability::trailingzeroes; +// +// +//impl<'de> Deserializer<'de> { +// #[cfg_attr(not(feature = "no-inline"), inline(always))] +// pub fn parse_str_(&mut self) -> Result<&'de str> { +// // Add 1 to skip the initial " +// let idx = self.iidx + 1; +// let mut padding = [0u8; 32]; +// //let mut read: usize = 0; +// +// // we include the terminal '"' so we know where to end +// // This is safe since we check sub's lenght in the range access above and only +// // 
create sub sliced form sub to `sub.len()`. +// +// let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) }; +// let mut src_i: usize = 0; +// let mut len = src_i; +// loop { +// let v: __m128i = if src.len() >= src_i + 16 { +// // This is safe since we ensure src is at least 16 wide +// #[allow(clippy::cast_ptr_alignment)] +// unsafe { +// _mm_loadu_si128(src.as_ptr().add(src_i) as *const __m128i) +// } +// } else { +// unsafe { +// padding +// .get_unchecked_mut(..src.len() - src_i) +// .clone_from_slice(src.get_unchecked(src_i..)); +// // This is safe since we ensure src is at least 32 wide +// #[allow(clippy::cast_ptr_alignment)] +// _mm_loadu_si128(padding.as_ptr() as *const __m128i) +// } +// }; +// +// // store to dest unconditionally - we can overwrite the bits we don't like +// // later +// let bs_bits: u32 = unsafe { +// static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( +// v, +// _mm_set1_epi8(b'\\' as i8) +// ))) +// }; +// let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; +// let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; +// if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { +// // we encountered quotes first. Move dst to point to quotes and exit +// // find out where the quote is... +// let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; +// +// /////////////////////// +// // Above, check for overflow in case someone has a crazy string (>=4GB?) +// // But only add the overflow check when the document itself exceeds 4GB +// // Currently unneeded because we refuse to parse docs larger or equal to 4GB. 
+// //////////////////////// +// +// // we advance the point, accounting for the fact that we have a NULl termination +// +// len += quote_dist as usize; +// unsafe { +// let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; +// return Ok(&*v); +// } +// +// // we compare the pointers since we care if they are 'at the same spot' +// // not if they are the same value +// } +// if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { +// // Move to the 'bad' character +// let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); +// len += bs_dist as usize; +// src_i += bs_dist as usize; +// break; +// } else { +// // they are the same. Since they can't co-occur, it means we encountered +// // neither. +// src_i += 16; +// len += 16; +// } +// } +// +// let mut dst_i: usize = 0; +// let dst: &mut [u8] = &mut self.strings; +// +// loop { +// let v: __m128i = if src.len() >= src_i + 16 { +// // This is safe since we ensure src is at least 16 wide +// #[allow(clippy::cast_ptr_alignment)] +// unsafe { +// _mm_loadu_si128(src.as_ptr().add(src_i) as *const __m128i) +// } +// } else { +// unsafe { +// padding +// .get_unchecked_mut(..src.len() - src_i) +// .clone_from_slice(src.get_unchecked(src_i..)); +// // This is safe since we ensure src is at least 16 wide +// #[allow(clippy::cast_ptr_alignment)] +// _mm_loadu_si128(padding.as_ptr() as *const __m128i) +// } +// }; +// +// #[allow(clippy::cast_ptr_alignment)] +// unsafe { +// _mm_storeu_si128(dst.as_mut_ptr().add(dst_i) as *mut __m128i, v) +// }; +// +// // store to dest unconditionally - we can overwrite the bits we don't like +// // later +// let bs_bits: u32 = unsafe { +// static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( +// v, +// _mm_set1_epi8(b'\\' as i8) +// ))) +// }; +// let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; +// let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; +// if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { +// // we 
encountered quotes first. Move dst to point to quotes and exit +// // find out where the quote is... +// let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; +// +// /////////////////////// +// // Above, check for overflow in case someone has a crazy string (>=4GB?) +// // But only add the overflow check when the document itself exceeds 4GB +// // Currently unneeded because we refuse to parse docs larger or equal to 4GB. +// //////////////////////// +// +// // we advance the point, accounting for the fact that we have a NULl termination +// +// dst_i += quote_dist as usize; +// unsafe { +// self.input +// .get_unchecked_mut(idx + len..idx + len + dst_i) +// .clone_from_slice(&self.strings.get_unchecked(..dst_i)); +// let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] +// as *const str; +// self.str_offset += dst_i as usize; +// return Ok(&*v); +// } +// +// // we compare the pointers since we care if they are 'at the same spot' +// // not if they are the same value +// } +// if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { +// // find out where the backspace is +// let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); +// let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; +// // we encountered backslash first. Handle backslash +// if escape_char == b'u' { +// // move src/dst up to the start; they will be further adjusted +// // within the unicode codepoint handling code. +// src_i += bs_dist as usize; +// dst_i += bs_dist as usize; +// let (o, s) = if let Ok(r) = +// handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { +// dst.get_unchecked_mut(dst_i..) +// }) { +// r +// } else { +// return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); +// }; +// if o == 0 { +// return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); +// }; +// // We moved o steps forword at the destiation and 6 on the source +// src_i += s; +// dst_i += o; +// } else { +// // simple 1:1 conversion. 
Will eat bs_dist+2 characters in input and +// // write bs_dist+1 characters to output +// // note this may reach beyond the part of the buffer we've actually +// // seen. I think this is ok +// let escape_result: u8 = +// unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; +// if escape_result == 0 { +// return Err(self.error(ErrorType::InvalidEscape)); +// } +// unsafe { +// *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; +// } +// src_i += bs_dist as usize + 2; +// dst_i += bs_dist as usize + 1; +// } +// } else { +// // they are the same. Since they can't co-occur, it means we encountered +// // neither. +// src_i += 16; +// dst_i += 16; +// } +// } +// } +//} diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs new file mode 100644 index 00000000..a2bca414 --- /dev/null +++ b/src/neon/intrinsics.rs @@ -0,0 +1,287 @@ +//use std::arch:: + +use crate::neon::simd_llvm; +use crate::neon::simd; + +use std::mem; +use std::hint::unreachable_unchecked; + +#[allow(unused)] +macro_rules! types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + #[allow(clippy::missing_inline_in_public_items)] + pub struct $name($($fields)*); + )*) +} + +pub type poly64_t = i64; + +extern "C" { + #[link_name = "llvm.aarch64.neon.pmull64"] + fn vmull_p64_(a: i64, b: i64) -> int8x16_t; +} + +//poly128_t vmull_p64 (poly64_t a, poly64_t b) +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(pmull))] +pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { + mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) +} + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. 
+ pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. + // pub struct float16x4_t(f16, f16, f16, f16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. + pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`. 
+ // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`. + pub struct uint64x2_t(u64, u64); + /// ARM-specific 128-bit wide vector of one packed `i128` + pub struct poly128_t(i128); +} + +impl uint8x16_t { + pub fn new(a:u8,b:u8,c:u8,d:u8,e:u8,f:u8,g:u8,h:u8,i:u8,j:u8,k:u8,l:u8,m:u8,n:u8,o:u8,p:u8) -> uint8x16_t { + uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int8x16_t { + pub fn new(a:i8,b:i8,c:i8,d:i8,e:i8,f:i8,g:i8,h:i8,i:i8,j:i8,k:i8,l:i8,m:i8,n:i8,o:i8,p:i8) -> int8x16_t { + int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +macro_rules! aarch64_simd_2 { + ($name:ident, $type:ty, $simd_fn:ident, $intr:ident) => { + #[inline] + #[target_feature(enable = "neon")] + #[cfg_attr(test, assert_instr($intr))] + pub unsafe fn $name(a: $type, b: $type) -> $type { + simd_llvm::$simd_fn(a, b) + } + }; +} +macro_rules! aarch64_simd_ceq { + ($name:ident, $type:ty) => { + /// Compare bitwise Equal (vector) + aarch64_simd_2!($name, $type, simd_eq, cmeq); + }; +} + +aarch64_simd_ceq!(vceqq_u8, uint8x16_t); +aarch64_simd_ceq!(vceq_s64, int64x1_t); +aarch64_simd_ceq!(vceqq_s64, int64x2_t); +aarch64_simd_ceq!(vceq_u64, uint64x1_t); +aarch64_simd_ceq!(vceqq_u64, uint64x2_t); +aarch64_simd_ceq!(vceq_p64, uint64x1_t); +aarch64_simd_ceq!(vceqq_p64, uint64x2_t); + +macro_rules! 
aarch64_simd_cgt { + ($name:ident, $type:ty) => { + /// Compare signed Greater than (vector) + aarch64_simd_2!($name, $type, simd_gt, cmgt); + }; +} +macro_rules! aarch64_simd_cgtu { + ($name:ident, $type:ty) => { + /// Compare Greater than (vector) + aarch64_simd_2!($name, $type, simd_gt, cmhi); + }; +} + +aarch64_simd_cgt!(vcgt_s64, int64x1_t); +aarch64_simd_cgt!(vcgtq_s64, int64x2_t); +aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); +aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); + +macro_rules! aarch64_simd_clt { + ($name:ident, $type:ty) => { + /// Compare signed Lesser than (vector) + aarch64_simd_2!($name, $type, simd_lt, cmgt); + }; +} +macro_rules! aarch64_simd_cltu { + ($name:ident, $type:ty) => { + /// Compare Lesser than (vector) + aarch64_simd_2!($name, $type, simd_lt, cmhi); + }; +} + +aarch64_simd_clt!(vclt_s64, int64x1_t); +aarch64_simd_clt!(vcltq_s64, int64x2_t); +aarch64_simd_cltu!(vclt_u64, uint64x1_t); +aarch64_simd_cltu!(vcltq_u64, uint64x2_t); + +macro_rules! aarch64_simd_cge { + ($name:ident, $type:ty) => { + /// Compare signed Greater than (vector) + aarch64_simd_2!($name, $type, simd_ge, cmge); + }; +} +macro_rules! aarch64_simd_cgeu { + ($name:ident, $type:ty) => { + /// Compare Greater than (vector) + aarch64_simd_2!($name, $type, simd_ge, cmhs); + }; +} + +aarch64_simd_cge!(vcge_s64, int64x1_t); +aarch64_simd_cge!(vcgeq_s64, int64x2_t); +aarch64_simd_cgeu!(vcge_u64, uint64x1_t); +aarch64_simd_cgeu!(vcgeq_u64, uint64x2_t); + +macro_rules! aarch64_simd_cle { + ($name:ident, $type:ty) => { + /// Compare signed Lesser than (vector) + aarch64_simd_2!($name, $type, simd_le, cmge); + }; +} +macro_rules! 
aarch64_simd_cleu { + ($name:ident, $type:ty) => { + /// Compare Lesser than (vector) + aarch64_simd_2!($name, $type, simd_le, cmhs); + }; +} + +aarch64_simd_cleu!(vcleq_u8, uint8x16_t); +aarch64_simd_cle!(vcle_s64, int64x1_t); +aarch64_simd_cle!(vcleq_s64, int64x2_t); +aarch64_simd_cleu!(vcle_u64, uint64x1_t); +aarch64_simd_cleu!(vcleq_u64, uint64x2_t); + + +pub fn vdupq_n_u8(a:u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +pub fn zero8x16() -> uint8x16_t { + uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +pub unsafe fn vpaddq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_add(a, b) } + +pub unsafe fn vandq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } +pub unsafe fn vorrq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } +pub unsafe fn vandq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_and(a, b) } +pub unsafe fn vorrq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } + +macro_rules! arm_reinterpret { + ($name:ident, $from:ty, $to:ty) => { + // Vector reinterpret cast operation + #[inline] + #[target_feature(enable = "neon")] + #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] + pub unsafe fn $name(a: $from) -> $to { + mem::transmute(a) + } + }; +} + +arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); +arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); +arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); +arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); +arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); + +macro_rules! 
arm_vget_lane { + ($name:ident, $to:ty, $from:ty, $lanes:literal) => { + #[inline] + #[target_feature(enable = "neon")] + #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] + #[cfg_attr(test, assert_instr(umov))] + pub unsafe fn $name(v: $from, lane: u32) -> $to { + if lane > $lanes { unreachable_unchecked() } + simd_llvm::simd_extract(v, lane) + } + }; +} + +arm_vget_lane!(vgetq_lane_u16, u16, uint8x16_t, 7); +arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); +arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); + +pub fn vqmovn_u64(a:uint64x2_t) -> uint32x2_t { + uint32x2_t(a.0 as u32, a.1 as u32) +} + +//fn vaddq_s8() -> u16 { 0 } +//fn vceqq_s8() -> u16 { 0 } +//fn vceqq_u8() -> u16 { 0 } +//fn vcgtq_s8() -> u16 { 0 } +//fn vcleq_u8() -> u16 { 0 } +//fn vdupq_n_s8() -> u16 { 0 } +//fn vextq_s8() -> u16 { 0 } +//fn vget_lane_u64() -> u16 { 0 } +//fn vgetq_lane_u32() -> u16 { 0 } +//fn vgetq_lane_u64() -> u16 { 0 } +//fn vld1q_s8() -> u16 { 0 } +//fn vld1q_u8() -> u16 { 0 } +//fn vmovq_n_u8() -> u16 { 0 } +//fn vmull_p64() -> u16 { 0 } +//fn vorrq_s8() -> u16 { 0 } +//fn vqsubq_u8() -> u16 { 0 } +//fn vqtbl1q_s8() -> u16 { 0 } +//fn vqtbl1q_u8() -> u16 { 0 } +//fn vshrq_n_u8() -> u16 { 0 } +//fn vst1q_u8() -> u16 { 0 } \ No newline at end of file diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 00000000..6d23b075 --- /dev/null +++ b/src/neon/mod.rs @@ -0,0 +1,6 @@ +//pub mod deser; +pub mod stage1; +//pub mod utf8check; +mod simd; +mod simd_llvm; +mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs new file mode 100644 index 00000000..ecf5a4cb --- /dev/null +++ b/src/neon/simd.rs @@ -0,0 +1,469 @@ +#![allow(non_camel_case_types)] + +use crate::neon::simd_llvm; + +macro_rules! 
simd_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: $ety) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> $ety { + unsafe { + simd_llvm::simd_extract(self, index as u32) + } + } + } + } +} + +macro_rules! simd_m_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + const fn bool_to_internal(x: bool) -> $ety { + [0 as $ety, !(0 as $ety)][x as usize] + } + + #[inline] + pub(crate) const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> bool { + let r: $ety = unsafe { + simd_llvm::simd_extract(self, index as u32) + }; + r != 0 + } + } + } +} + +// 16-bit wide types: + +simd_ty!(u8x2[u8]: u8, u8 | x0, x1); +simd_ty!(i8x2[i8]: i8, i8 | x0, x1); + +// 32-bit wide types: + +simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3); +simd_ty!(u16x2[u16]: u16, u16 | x0, x1); + +simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3); +simd_ty!(i16x2[i16]: i16, i16 | x0, x1); + +// 64-bit wide types: + 
+simd_ty!(u8x8[u8]: + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3); +simd_ty!(u32x2[u32]: u32, u32 | x0, x1); +simd_ty!(u64x1[u64]: u64 | x1); + +simd_ty!(i8x8[i8]: + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3); +simd_ty!(i32x2[i32]: i32, i32 | x0, x1); +simd_ty!(i64x1[i64]: i64 | x1); + +simd_ty!(f32x2[f32]: f32, f32 | x0, x1); + +// 128-bit wide types: + +simd_ty!(u8x16[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u16x8[u16]: + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3); +simd_ty!(u64x2[u64]: u64, u64 | x0, x1); + +simd_ty!(i8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_ty!(i64x2[i64]: i64, i64 | x0, x1); + +simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3); +simd_ty!(f64x2[f64]: f64, f64 | x0, x1); + +simd_m_ty!(m8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_m_ty!(m16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1); + +// 256-bit wide types: + +simd_ty!(u8x32[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, 
x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(u16x16[u16]: + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u32x8[u32]: + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3); + +simd_ty!(i8x32[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(i16x16[i16]: + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i32x8[i32]: + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); + +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + +// 512-bit wide types: + +simd_ty!(i32x16[i32]: + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + +simd_ty!(i64x8[i64]: + i64, i64, i64, i64, i64, i64, i64, i64 + | x0, x1, x2, x3, x4, x5, x6, x7); + +#[allow(unused)] +#[macro_export] +macro_rules! 
constify_imm8 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) & 0b1111_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + 31 => $expand!(31), + 32 => $expand!(32), + 33 => $expand!(33), + 34 => $expand!(34), + 35 => $expand!(35), + 36 => $expand!(36), + 37 => $expand!(37), + 38 => $expand!(38), + 39 => $expand!(39), + 40 => $expand!(40), + 41 => $expand!(41), + 42 => $expand!(42), + 43 => $expand!(43), + 44 => $expand!(44), + 45 => $expand!(45), + 46 => $expand!(46), + 47 => $expand!(47), + 48 => $expand!(48), + 49 => $expand!(49), + 50 => $expand!(50), + 51 => $expand!(51), + 52 => $expand!(52), + 53 => $expand!(53), + 54 => $expand!(54), + 55 => $expand!(55), + 56 => $expand!(56), + 57 => $expand!(57), + 58 => $expand!(58), + 59 => $expand!(59), + 60 => $expand!(60), + 61 => $expand!(61), + 62 => $expand!(62), + 63 => $expand!(63), + 64 => $expand!(64), + 65 => $expand!(65), + 66 => $expand!(66), + 67 => $expand!(67), + 68 => $expand!(68), + 69 => $expand!(69), + 70 => $expand!(70), + 71 => $expand!(71), + 72 => $expand!(72), + 73 => $expand!(73), + 74 => $expand!(74), + 75 => $expand!(75), + 76 => $expand!(76), + 77 => $expand!(77), + 78 => $expand!(78), + 79 => $expand!(79), + 80 => $expand!(80), + 81 => $expand!(81), + 82 => $expand!(82), + 83 => $expand!(83), + 84 => $expand!(84), + 85 => $expand!(85), + 86 => 
$expand!(86), + 87 => $expand!(87), + 88 => $expand!(88), + 89 => $expand!(89), + 90 => $expand!(90), + 91 => $expand!(91), + 92 => $expand!(92), + 93 => $expand!(93), + 94 => $expand!(94), + 95 => $expand!(95), + 96 => $expand!(96), + 97 => $expand!(97), + 98 => $expand!(98), + 99 => $expand!(99), + 100 => $expand!(100), + 101 => $expand!(101), + 102 => $expand!(102), + 103 => $expand!(103), + 104 => $expand!(104), + 105 => $expand!(105), + 106 => $expand!(106), + 107 => $expand!(107), + 108 => $expand!(108), + 109 => $expand!(109), + 110 => $expand!(110), + 111 => $expand!(111), + 112 => $expand!(112), + 113 => $expand!(113), + 114 => $expand!(114), + 115 => $expand!(115), + 116 => $expand!(116), + 117 => $expand!(117), + 118 => $expand!(118), + 119 => $expand!(119), + 120 => $expand!(120), + 121 => $expand!(121), + 122 => $expand!(122), + 123 => $expand!(123), + 124 => $expand!(124), + 125 => $expand!(125), + 126 => $expand!(126), + 127 => $expand!(127), + 128 => $expand!(128), + 129 => $expand!(129), + 130 => $expand!(130), + 131 => $expand!(131), + 132 => $expand!(132), + 133 => $expand!(133), + 134 => $expand!(134), + 135 => $expand!(135), + 136 => $expand!(136), + 137 => $expand!(137), + 138 => $expand!(138), + 139 => $expand!(139), + 140 => $expand!(140), + 141 => $expand!(141), + 142 => $expand!(142), + 143 => $expand!(143), + 144 => $expand!(144), + 145 => $expand!(145), + 146 => $expand!(146), + 147 => $expand!(147), + 148 => $expand!(148), + 149 => $expand!(149), + 150 => $expand!(150), + 151 => $expand!(151), + 152 => $expand!(152), + 153 => $expand!(153), + 154 => $expand!(154), + 155 => $expand!(155), + 156 => $expand!(156), + 157 => $expand!(157), + 158 => $expand!(158), + 159 => $expand!(159), + 160 => $expand!(160), + 161 => $expand!(161), + 162 => $expand!(162), + 163 => $expand!(163), + 164 => $expand!(164), + 165 => $expand!(165), + 166 => $expand!(166), + 167 => $expand!(167), + 168 => $expand!(168), + 169 => $expand!(169), + 170 => 
$expand!(170), + 171 => $expand!(171), + 172 => $expand!(172), + 173 => $expand!(173), + 174 => $expand!(174), + 175 => $expand!(175), + 176 => $expand!(176), + 177 => $expand!(177), + 178 => $expand!(178), + 179 => $expand!(179), + 180 => $expand!(180), + 181 => $expand!(181), + 182 => $expand!(182), + 183 => $expand!(183), + 184 => $expand!(184), + 185 => $expand!(185), + 186 => $expand!(186), + 187 => $expand!(187), + 188 => $expand!(188), + 189 => $expand!(189), + 190 => $expand!(190), + 191 => $expand!(191), + 192 => $expand!(192), + 193 => $expand!(193), + 194 => $expand!(194), + 195 => $expand!(195), + 196 => $expand!(196), + 197 => $expand!(197), + 198 => $expand!(198), + 199 => $expand!(199), + 200 => $expand!(200), + 201 => $expand!(201), + 202 => $expand!(202), + 203 => $expand!(203), + 204 => $expand!(204), + 205 => $expand!(205), + 206 => $expand!(206), + 207 => $expand!(207), + 208 => $expand!(208), + 209 => $expand!(209), + 210 => $expand!(210), + 211 => $expand!(211), + 212 => $expand!(212), + 213 => $expand!(213), + 214 => $expand!(214), + 215 => $expand!(215), + 216 => $expand!(216), + 217 => $expand!(217), + 218 => $expand!(218), + 219 => $expand!(219), + 220 => $expand!(220), + 221 => $expand!(221), + 222 => $expand!(222), + 223 => $expand!(223), + 224 => $expand!(224), + 225 => $expand!(225), + 226 => $expand!(226), + 227 => $expand!(227), + 228 => $expand!(228), + 229 => $expand!(229), + 230 => $expand!(230), + 231 => $expand!(231), + 232 => $expand!(232), + 233 => $expand!(233), + 234 => $expand!(234), + 235 => $expand!(235), + 236 => $expand!(236), + 237 => $expand!(237), + 238 => $expand!(238), + 239 => $expand!(239), + 240 => $expand!(240), + 241 => $expand!(241), + 242 => $expand!(242), + 243 => $expand!(243), + 244 => $expand!(244), + 245 => $expand!(245), + 246 => $expand!(246), + 247 => $expand!(247), + 248 => $expand!(248), + 249 => $expand!(249), + 250 => $expand!(250), + 251 => $expand!(251), + 252 => $expand!(252), + 253 => 
$expand!(253), + 254 => $expand!(254), + _ => $expand!(255), + } + }; +} \ No newline at end of file diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs new file mode 100644 index 00000000..a6c1eb52 --- /dev/null +++ b/src/neon/simd_llvm.rs @@ -0,0 +1,54 @@ +extern "platform-intrinsic" { + pub fn simd_eq(x: T, y: T) -> U; + pub fn simd_ne(x: T, y: T) -> U; + pub fn simd_lt(x: T, y: T) -> U; + pub fn simd_le(x: T, y: T) -> U; + pub fn simd_gt(x: T, y: T) -> U; + pub fn simd_ge(x: T, y: T) -> U; + + pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; + pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; + pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; + pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; + pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; + pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; + pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; + + pub fn simd_insert(x: T, idx: u32, val: U) -> T; + pub fn simd_extract(x: T, idx: u32) -> U; + + pub fn simd_cast(x: T) -> U; + + pub fn simd_add(x: T, y: T) -> T; + pub fn simd_sub(x: T, y: T) -> T; + pub fn simd_mul(x: T, y: T) -> T; + pub fn simd_div(x: T, y: T) -> T; + pub fn simd_shl(x: T, y: T) -> T; + pub fn simd_shr(x: T, y: T) -> T; + pub fn simd_and(x: T, y: T) -> T; + pub fn simd_or(x: T, y: T) -> T; + pub fn simd_xor(x: T, y: T) -> T; + + pub fn simd_reduce_add_unordered(x: T) -> U; + pub fn simd_reduce_mul_unordered(x: T) -> U; + pub fn simd_reduce_add_ordered(x: T, acc: U) -> U; + pub fn simd_reduce_mul_ordered(x: T, acc: U) -> U; + pub fn simd_reduce_min(x: T) -> U; + pub fn simd_reduce_max(x: T) -> U; + pub fn simd_reduce_min_nanless(x: T) -> U; + pub fn simd_reduce_max_nanless(x: T) -> U; + pub fn simd_reduce_and(x: T) -> U; + pub fn simd_reduce_or(x: T) -> U; + pub fn simd_reduce_xor(x: T) -> U; + pub fn simd_reduce_all(x: T) -> bool; + pub fn simd_reduce_any(x: T) -> bool; + + pub fn simd_select(m: M, a: T, b: T) -> T; + pub fn simd_select_bitmask(m: 
M, a: T, b: T) -> T; + + pub fn simd_fmin(a: T, b: T) -> T; + pub fn simd_fmax(a: T, b: T) -> T; + + pub fn simd_fsqrt(a: T) -> T; + pub fn simd_fma(a: T, b: T, c: T) -> T; +} \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs new file mode 100644 index 00000000..6e5f6b58 --- /dev/null +++ b/src/neon/stage1.rs @@ -0,0 +1,560 @@ +#![allow(dead_code)] + +//use crate::portability::*; +use crate::neon::intrinsics::*; +//use core::arch::aarch64::*; +//use crate::neon::utf8check::*; +use crate::*; +use std::mem; + +#[macro_use] +use crate::neon::simd::*; + +pub const SIMDJSON_PADDING: usize = mem::size_of::() * 4; + +#[derive(Debug)] +struct SimdInput { + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, +} + +fn fill_input(ptr: &[u8]) -> SimdInput { + unsafe { + #[allow(clippy::cast_ptr_alignment)] + SimdInput { + v0: mem::transmute(ptr.as_ptr() as *const uint8x16_t), + v1: mem::transmute(ptr.as_ptr().add(16) as *const uint8x16_t), + v2: mem::transmute(ptr.as_ptr().add(32) as *const uint8x16_t), + v3: mem::transmute(ptr.as_ptr().add(48) as *const uint8x16_t), + } + } +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn neon_movemask(input: uint8x16_t) -> u16 { + const bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + + let minput: uint8x16_t = vandq_u8(input, bit_mask); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + unsafe { + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) + } +} + + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { + const bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + let t0 = vandq_u8(p0, bit_mask); + let t1 = vandq_u8(p1, 
bit_mask); + let t2 = vandq_u8(p2, bit_mask); + let t3 = vandq_u8(p3, bit_mask); + let sum0 = vpaddq_u8(t0, t1); + let sum1 = vpaddq_u8(t2, t3); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + unsafe { + vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii_neon(si: &SimdInput) -> bool { + let high_bit: uint8x16_t = vdupq_n_u8(0x80); + let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); + let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); + let t3: uint8x16_t = vorrq_u8(t0, t1); + let t4: uint8x16_t = vandq_u8(t3, high_bit); + + unsafe { + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); + + vget_lane_u64(result, 0) == 0 + } +} + +struct utf8_checking_state { + has_error: int8x16_t, + previous: processed_utf_bytes, +} +// +//#[cfg_attr(not(feature = "no-inline"), inline(always))] +//unsafe fn check_utf8( +// input: &SimdInput, +// has_error: &mut uint8x16_t, +// previous: &mut utf8_checking_state, +//) { +// if check_ascii_neon(input) { +// // All bytes are ascii. Therefore the byte that was just before must be +// // ascii too. We only check the byte that was just before simd_input. Nines +// // are arbitrary values. 
+// const verror: int8x16_t = +// int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); +// state.has_error = +// vorrq_s8(vreinterpretq_s8_u8( +// vcgtq_s8(state.previous.carried_continuations, verror)), +// state.has_error); +// } else { +// // it is not ascii so we have to do heavy work +// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), +// &(state.previous), &(state.has_error)); +// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), +// &(state.previous), &(state.has_error)); +// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), +// &(state.previous), &(state.has_error)); +// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), +// &(state.previous), &(state.has_error)); +// } +//} + +// a straightforward comparison of a mask against input +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { + const MASK: uint8x16_t = vdupq_n_u8(m); + + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, MASK); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, MASK); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, MASK); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, MASK); + + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) +} + +// find all values less than or equal than the content of maxval (using unsigned arithmetic) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn unsigned_lteq_against_input(input: &SimdInput, maxval: u8) -> u64 { + const MASK: uint8x16_t = vdupq_n_u8(maxval); + + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, MASK); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, MASK); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, MASK); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, MASK); + + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) +} + +unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { + const EVEN_BITS: u64 = 
0x5555_5555_5555_5555; + const ODD_BITS: u64 = !EVEN_BITS; + + let bs_bits: u64 = cmp_mask_against_input(input, b'\\'); + let start_edges: u64 = bs_bits & !(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; + let even_starts: u64 = start_edges & even_start_mask; + let odd_starts: u64 = start_edges & !even_start_mask; + let even_carries: u64 = bs_bits.wrapping_add(even_starts); + + let mut odd_carries: u64 = 0; + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + let iter_ends_odd_backslash: bool = add_overflow(bs_bits, odd_starts, &mut odd_carries); + + odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; + let even_carry_ends: u64 = even_carries & !bs_bits; + let odd_carry_ends: u64 = odd_carries & !bs_bits; + let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; + let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; + let odd_ends: u64 = even_start_odd_end | odd_start_even_end; + odd_ends +} + +unsafe fn find_quote_mask_and_bits(input: &SimdInput, odd_ends: u64, + prev_iter_inside_quote: &u64, quote_bits: &mut u64, error_mask: &u64) -> u64 { + quote_bits = cmp_mask_against_input(input, b'"'); + quote_bits = &(quote_bits & !odd_ends); + let quote_mask: u64 = vmull_p64(-1, quote_bits); + + quote_mask ^= prev_iter_inside_quote; + + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). 
+ // https://tools.ietf.org/html/rfc8259 + let unescaped: u64 = unsigned_lteq_against_input(input, 0x1F); + error_mask |= quote_mask & unescaped; + + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. says this is fine code + prev_iter_inside_quote = quote_mask >> 63; + + quote_mask +} + +// +//#[cfg_attr(not(feature = "no-inline"), inline(always))] +//unsafe fn find_whitespace_and_structurals( +// input: &SimdInput, +// whitespace: &mut u64, +// structurals: &mut u64, +//) { +// // do a 'shufti' to detect structural JSON characters +// // they are +// // * `{` 0x7b +// // * `}` 0x7d +// // * `:` 0x3a +// // * `[` 0x5b +// // * `]` 0x5d +// // * `,` 0x2c +// // these go into the first 3 buckets of the comparison (1/2/4) +// +// // we are also interested in the four whitespace characters: +// // * space 0x20 +// // * linefeed 0x0a +// // * horizontal tab 0x09 +// // * carriage return 0x0d +// // these go into the next 2 buckets of the comparison (8/16) +// +// // TODO: const? +// let low_nibble_mask: __m128i = _mm_setr_epi8( +// 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, +// ); +// // TODO: const? 
+// let high_nibble_mask: __m128i = _mm_setr_epi8( +// 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, +// ); +// +// let structural_shufti_mask: __m128i = _mm_set1_epi8(0x7); +// let whitespace_shufti_mask: __m128i = _mm_set1_epi8(0x18); +// +// let v_v0: __m128i = _mm_and_si128( +// _mm_shuffle_epi8(low_nibble_mask, input.v0), +// _mm_shuffle_epi8( +// high_nibble_mask, +// _mm_and_si128(_mm_srli_epi32(input.v0, 4), _mm_set1_epi8(0x7f)), +// ), +// ); +// let v_v1: __m128i = _mm_and_si128( +// _mm_shuffle_epi8(low_nibble_mask, input.v1), +// _mm_shuffle_epi8( +// high_nibble_mask, +// _mm_and_si128(_mm_srli_epi32(input.v1, 4), _mm_set1_epi8(0x7f)), +// ), +// ); +// let v_v2: __m128i = _mm_and_si128( +// _mm_shuffle_epi8(low_nibble_mask, input.v2), +// _mm_shuffle_epi8( +// high_nibble_mask, +// _mm_and_si128(_mm_srli_epi32(input.v2, 4), _mm_set1_epi8(0x7f)), +// ), +// ); +// let v_v3: __m128i = _mm_and_si128( +// _mm_shuffle_epi8(low_nibble_mask, input.v3), +// _mm_shuffle_epi8( +// high_nibble_mask, +// _mm_and_si128(_mm_srli_epi32(input.v3, 4), _mm_set1_epi8(0x7f)), +// ), +// ); +// +// let tmp_v0: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v0, structural_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_v1: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v1, structural_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_v2: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v2, structural_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_v3: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v3, structural_shufti_mask), +// _mm_set1_epi8(0), +// ); +// +// let structural_res_0: u64 = u64::from(static_cast_u32!(_mm_movemask_epi8(tmp_v0))); +// let structural_res_1: u64 = _mm_movemask_epi8(tmp_v1) as u64; +// let structural_res_2: u64 = _mm_movemask_epi8(tmp_v2) as u64; +// let structural_res_3: u64 = _mm_movemask_epi8(tmp_v3) as u64; +// +// *structurals = !(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 
48)); +// +// let tmp_ws_v0: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v0, whitespace_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_ws_v1: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v1, whitespace_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_ws_v2: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v2, whitespace_shufti_mask), +// _mm_set1_epi8(0), +// ); +// let tmp_ws_v3: __m128i = _mm_cmpeq_epi8( +// _mm_and_si128(v_v3, whitespace_shufti_mask), +// _mm_set1_epi8(0), +// ); +// +// let ws_res_0: u64 = u64::from(static_cast_u32!(_mm_movemask_epi8(tmp_ws_v0))); +// let ws_res_1: u64 = _mm_movemask_epi8(tmp_ws_v1) as u64; +// let ws_res_2: u64 = _mm_movemask_epi8(tmp_ws_v2) as u64; +// let ws_res_3: u64 = _mm_movemask_epi8(tmp_ws_v3) as u64; +// +// *whitespace = !(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48)); +//} +// + +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +//TODO: usize was u32 here does this matter? +//#[cfg_attr(not(feature = "no-inline"), inline(always))] +//fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { +// let cnt: usize = hamming(bits) as usize; +// let mut l = base.len(); +// let idx_minus_64 = idx.wrapping_sub(64); +// let idx_64_v = unsafe { +// _mm_set_epi32( +// static_cast_i32!(idx_minus_64), +// static_cast_i32!(idx_minus_64), +// static_cast_i32!(idx_minus_64), +// static_cast_i32!(idx_minus_64), +// ) +// }; +// +// // We're doing some trickery here. 
+// // We reserve 64 extra entries, because we've at most 64 bit to set +// // then we trunctate the base to the next base (that we calcuate above) +// // We later indiscriminatory writre over the len we set but that's OK +// // since we ensure we reserve the needed space +// base.reserve(64); +// unsafe { +// base.set_len(l + cnt); +// } +// +// while bits != 0 { +// unsafe { +// let v0 = static_cast_i32!(trailingzeroes(bits)); +// bits &= bits.wrapping_sub(1); +// let v1 = static_cast_i32!(trailingzeroes(bits)); +// bits &= bits.wrapping_sub(1); +// let v2 = static_cast_i32!(trailingzeroes(bits)); +// bits &= bits.wrapping_sub(1); +// let v3 = static_cast_i32!(trailingzeroes(bits)); +// bits &= bits.wrapping_sub(1); +// +// let v: __m128i = _mm_set_epi32(v3, v2, v1, v0); +// let v: __m128i = _mm_add_epi32(idx_64_v, v); +// #[allow(clippy::cast_ptr_alignment)] +// _mm_storeu_si128(base.as_mut_ptr().add(l) as *mut __m128i, v); +// } +// l += 4; +// } +//} + +// +//// return a updated structural bit vector with quoted contents cleared out and +//// pseudo-structural characters added to the mask +//// updates prev_iter_ends_pseudo_pred which tells us whether the previous +//// iteration ended on a whitespace or a structural character (which means that +//// the next iteration +//// will have a pseudo-structural character at its start) +//#[cfg_attr(not(feature = "no-inline"), inline(always))] +//fn finalize_structurals( +// mut structurals: u64, +// whitespace: u64, +// quote_mask: u64, +// quote_bits: u64, +// prev_iter_ends_pseudo_pred: &mut u64, +//) -> u64 { +// // mask off anything inside quotes +// structurals &= !quote_mask; +// // add the real quote bits back into our bitmask as well, so we can +// // quickly traverse the strings we've spent all this trouble gathering +// structurals |= quote_bits; +// // Now, establish "pseudo-structural characters". 
These are non-whitespace +// // characters that are (a) outside quotes and (b) have a predecessor that's +// // either whitespace or a structural character. This means that subsequent +// // passes will get a chance to encounter the first character of every string +// // of non-whitespace and, if we're parsing an atom like true/false/null or a +// // number we can stop at the first whitespace or structural character +// // following it. +// +// // a qualified predecessor is something that can happen 1 position before an +// // psuedo-structural character +// let pseudo_pred: u64 = structurals | whitespace; +// +// let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred; +// *prev_iter_ends_pseudo_pred = pseudo_pred >> 63; +// let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask); +// structurals |= pseudo_structurals; +// +// // now, we've used our close quotes all we need to. So let's switch them off +// // they will be off in the quote mask and on in quote bits. +// structurals &= !(quote_bits & !quote_mask); +// structurals +//} +// + +//impl<'de> Deserializer<'de> { +// #[inline(never)] +// pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { +// let len = input.len(); +// // 6 is a heuristic number to estimate it turns out a rate of 1/6 structural caracters lears +// // almost never to relocations. +// let mut structural_indexes = Vec::with_capacity(len / 6); +// structural_indexes.push(0); // push extra root element +// +// let mut has_error: __m128i = _mm_setzero_si128(); +// let mut previous = AvxProcessedUtfBytes::default(); +// // we have padded the input out to 64 byte multiple with the remainder being +// // zeros +// +// // persistent state across loop +// // does the last iteration end with an odd-length sequence of backslashes? 
+// // either 0 or 1, but a 64-bit value +// let mut prev_iter_ends_odd_backslash: u64 = 0; +// // does the previous iteration end inside a double-quote pair? +// let mut prev_iter_inside_quote: u64 = 0; +// // either all zeros or all ones +// // does the previous iteration end on something that is a predecessor of a +// // pseudo-structural character - i.e. whitespace or a structural character +// // effectively the very first char is considered to follow "whitespace" for +// // the +// // purposes of pseudo-structural character detection so we initialize to 1 +// let mut prev_iter_ends_pseudo_pred: u64 = 1; +// +// // structurals are persistent state across loop as we flatten them on the +// // subsequent iteration into our array pointed to be base_ptr. +// // This is harmless on the first iteration as structurals==0 +// // and is done for performance reasons; we can hide some of the latency of the +// // expensive carryless multiply in the previous step with this work +// let mut structurals: u64 = 0; +// +// let lenminus64: usize = if len < 64 { 0 } else { len as usize - 64 }; +// let mut idx: usize = 0; +// let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20) +// +// while idx < lenminus64 { +// /* +// #ifndef _MSC_VER +// __builtin_prefetch(buf + idx + 128); +// #endif +// */ +// let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); +// check_utf8(&input, &mut has_error, &mut previous); +// // detect odd sequences of backslashes +// let odd_ends: u64 = +// find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); +// +// // detect insides of quote pairs ("quote_mask") and also our quote_bits +// // themselves +// let mut quote_bits: u64 = 0; +// let quote_mask: u64 = find_quote_mask_and_bits( +// &input, +// odd_ends, +// &mut prev_iter_inside_quote, +// &mut quote_bits, +// &mut error_mask, +// ); +// +// // take the previous iterations structural bits, not our current iteration, 
+// // and flatten +// flatten_bits(&mut structural_indexes, idx as u32, structurals); +// +// let mut whitespace: u64 = 0; +// find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); +// +// // fixup structurals to reflect quotes and add pseudo-structural characters +// structurals = finalize_structurals( +// structurals, +// whitespace, +// quote_mask, +// quote_bits, +// &mut prev_iter_ends_pseudo_pred, +// ); +// idx += 64; +// } +// +// // we use a giant copy-paste which is ugly. +// // but otherwise the string needs to be properly padded or else we +// // risk invalidating the UTF-8 checks. +// if idx < len { +// let mut tmpbuf: [u8; 64] = [0x20; 64]; +// tmpbuf +// .as_mut_ptr() +// .copy_from(input.as_ptr().add(idx), len as usize - idx); +// let input: SimdInput = fill_input(&tmpbuf); +// +// check_utf8(&input, &mut has_error, &mut previous); +// +// // detect odd sequences of backslashes +// let odd_ends: u64 = +// find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); +// +// // detect insides of quote pairs ("quote_mask") and also our quote_bits +// // themselves +// let mut quote_bits: u64 = 0; +// let quote_mask: u64 = find_quote_mask_and_bits( +// &input, +// odd_ends, +// &mut prev_iter_inside_quote, +// &mut quote_bits, +// &mut error_mask, +// ); +// +// // take the previous iterations structural bits, not our current iteration, +// // and flatten +// flatten_bits(&mut structural_indexes, idx as u32, structurals); +// +// let mut whitespace: u64 = 0; +// find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); +// +// // fixup structurals to reflect quotes and add pseudo-structural characters +// structurals = finalize_structurals( +// structurals, +// whitespace, +// quote_mask, +// quote_bits, +// &mut prev_iter_ends_pseudo_pred, +// ); +// idx += 64; +// } +// // This test isn't in upstream, for some reason the error mask is et for then. 
+// if prev_iter_inside_quote != 0 { +// return Err(ErrorType::Syntax); +// } +// // finally, flatten out the remaining structurals from the last iteration +// flatten_bits(&mut structural_indexes, idx as u32, structurals); +// +// // a valid JSON file cannot have zero structural indexes - we should have +// // found something (note that we compare to 1 as we always add the root!) +// if structural_indexes.len() == 1 { +// return Err(ErrorType::EOF); +// } +// +// if structural_indexes.last() > Some(&(len as u32)) { +// return Err(ErrorType::InternalError); +// } +// +// if error_mask != 0 { +// return Err(ErrorType::Syntax); +// } +// +// if _mm_testz_si128(has_error, has_error) != 0 { +// Ok(structural_indexes) +// } else { +// Err(ErrorType::InvalidUTF8) +// } +// } +//} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs new file mode 100644 index 00000000..04bf0640 --- /dev/null +++ b/src/neon/utf8check.rs @@ -0,0 +1,247 @@ +use crate::*; + +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +// all byte values must be no larger than 0xF4 + +/*****************************/ +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_byte_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { + unsafe { _mm_alignr_epi8(b, a, 15) } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_2bytes_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { + unsafe { _mm_alignr_epi8(b, a, 14) } +} + +// all byte values must be no larger than 0xF4 +#[cfg_attr(not(feature = "no-inline"), inline)] +fn 
avxcheck_smaller_than_0xf4(current_bytes: __m128i, has_error: &mut __m128i) { + // unsigned, saturates to 0 below max + *has_error = unsafe { + _mm_or_si128( + *has_error, + _mm_subs_epu8(current_bytes, _mm_set1_epi8(static_cast_i8!(0xF4u8))), + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avxcontinuation_lengths(high_nibbles: __m128i) -> __m128i { + unsafe { + _mm_shuffle_epi8( + _mm_setr_epi8( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ), + high_nibbles, + ) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avxcarry_continuations(initial_lengths: __m128i, previous_carries: __m128i) -> __m128i { + unsafe { + let right1: __m128i = _mm_subs_epu8( + push_last_byte_of_a_to_b(previous_carries, initial_lengths), + _mm_set1_epi8(1), + ); + let sum: __m128i = _mm_add_epi8(initial_lengths, right1); + let right2: __m128i = _mm_subs_epu8( + push_last_2bytes_of_a_to_b(previous_carries, sum), + _mm_set1_epi8(2), + ); + _mm_add_epi8(sum, right2) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avxcheck_continuations(initial_lengths: __m128i, carries: __m128i, has_error: &mut __m128i) { + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + unsafe { + let overunder: __m128i = _mm_cmpeq_epi8( + _mm_cmpgt_epi8(carries, initial_lengths), + _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()), + ); + + *has_error = _mm_or_si128(*has_error, overunder); + } +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avxcheck_first_continuation_max( + current_bytes: __m128i, + off1_current_bytes: __m128i, + has_error: &mut __m128i, +) { + unsafe { + 
let mask_ed: __m128i = _mm_cmpeq_epi8( + off1_current_bytes, + _mm_set1_epi8(static_cast_i8!(0xEDu8)), + ); + let mask_f4: __m128i = _mm_cmpeq_epi8( + off1_current_bytes, + _mm_set1_epi8(static_cast_i8!(0xF4u8)), + ); + + let badfollow_ed: __m128i = _mm_and_si128( + _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(static_cast_i8!(0x9Fu8))), + mask_ed, + ); + let badfollow_f4: __m128i = _mm_and_si128( + _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(static_cast_i8!(0x8Fu8))), + mask_f4, + ); + + *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollow_ed, badfollow_f4)); + } +} + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avxcheck_overlong( + current_bytes: __m128i, + off1_current_bytes: __m128i, + hibits: __m128i, + previous_hibits: __m128i, + has_error: &mut __m128i, +) { + unsafe { + let off1_hibits: __m128i = push_last_byte_of_a_to_b(previous_hibits, hibits); + let initial_mins: __m128i = _mm_shuffle_epi8( + _mm_setr_epi8( + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, // 10xx => false + static_cast_i8!(0xC2u8), + -128, // 110x + static_cast_i8!(0xE1u8), // 1110 + static_cast_i8!(0xF1u8), // 1111 + ), + off1_hibits, + ); + + let initial_under: __m128i = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); + + let second_mins: __m128i = _mm_shuffle_epi8( + _mm_setr_epi8( + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, + -128, // 10xx => false + 127, + 127, // 110x => true + static_cast_i8!(0xA0u8), // 1110 + static_cast_i8!(0x90u8), // 1111 + ), + off1_hibits, + ); + let second_under: __m128i = _mm_cmpgt_epi8(second_mins, current_bytes); + *has_error = _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); + } +} + +pub struct AvxProcessedUtfBytes { + rawbytes: __m128i, + high_nibbles: __m128i, + pub 
carried_continuations: __m128i, +} + +impl Default for AvxProcessedUtfBytes { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + unsafe { + AvxProcessedUtfBytes { + rawbytes: _mm_setzero_si128(), + high_nibbles: _mm_setzero_si128(), + carried_continuations: _mm_setzero_si128(), + } + } + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn avx_count_nibbles(bytes: __m128i, answer: &mut AvxProcessedUtfBytes) { + answer.rawbytes = bytes; + answer.high_nibbles = + unsafe { _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)) }; +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +#[cfg_attr(not(feature = "no-inline"), inline)] +pub fn avxcheck_utf8_bytes( + current_bytes: __m128i, + previous: &AvxProcessedUtfBytes, + has_error: &mut __m128i, +) -> AvxProcessedUtfBytes { + let mut pb = AvxProcessedUtfBytes::default(); + avx_count_nibbles(current_bytes, &mut pb); + + avxcheck_smaller_than_0xf4(current_bytes, has_error); + + let initial_lengths: __m128i = avxcontinuation_lengths(pb.high_nibbles); + + pb.carried_continuations = + avxcarry_continuations(initial_lengths, previous.carried_continuations); + + avxcheck_continuations(initial_lengths, pb.carried_continuations, has_error); + + let off1_current_bytes: __m128i = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); + avxcheck_first_continuation_max(current_bytes, off1_current_bytes, has_error); + + avxcheck_overlong( + current_bytes, + off1_current_bytes, + pb.high_nibbles, + previous.high_nibbles, + has_error, + ); + pb +} diff --git a/src/numberparse.rs b/src/numberparse.rs index 03880d56..425d16ab 100644 --- a/src/numberparse.rs +++ b/src/numberparse.rs @@ -1,10 +1,6 @@ use crate::charutils::*; use crate::unlikely; use crate::*; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; const POWER_OF_TEN: [f64; 617] = [ 1e-308, 1e-307, 1e-306, 1e-305, 
1e-304, 1e-303, 1e-302, 1e-301, 1e-300, 1e-299, 1e-298, 1e-297, @@ -133,26 +129,14 @@ pub enum Number { } #[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "aarch64")] fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { - unsafe { - // this actually computes *16* values so we are being wasteful. - let ascii0: __m128i = _mm_set1_epi8(b'0' as i8); - let mul_1_10: __m128i = - _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); - let mul_1_100: __m128i = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); - let mul_1_10000: __m128i = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); - // We know what we're doing right? :P - #[allow(clippy::cast_ptr_alignment)] - let input: __m128i = _mm_sub_epi8( - _mm_loadu_si128(chars.get_unchecked(0..16).as_ptr() as *const __m128i), - ascii0, - ); - let t1: __m128i = _mm_maddubs_epi16(input, mul_1_10); - let t2: __m128i = _mm_madd_epi16(t1, mul_1_100); - let t3: __m128i = _mm_packus_epi32(t2, t2); - let t4: __m128i = _mm_madd_epi16(t3, mul_1_10000); - _mm_cvtsi128_si32(t4) as u32 // only captures the sum of the first 8 digits, drop the rest - } + let val: u64 = unsafe { *(chars.as_ptr() as *const u64) }; + // memcpy(&val, chars, sizeof(u64)); + let val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; + let val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + + return ((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32) as u32; } impl<'de> Deserializer<'de> { diff --git a/src/portability.rs b/src/portability.rs new file mode 100644 index 00000000..69481a85 --- /dev/null +++ b/src/portability.rs @@ -0,0 +1,30 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool { + unsafe { _addcarry_u64(0, value1, value2, result) != 0 } +} + +//TODO: static? 
+ +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn hamming(input_num: u64) -> u32 { + unsafe { _popcnt64(input_num as i64) as u32 } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn hamming(input_num: u64) -> u32 { + unsafe { __popcnt(input_num as u32) + __popcnt((input_num >> 32) as u32) as u32 } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_arch = "x86_64")] +pub fn trailingzeroes(input_num: u64) -> u32 { + unsafe { _tzcnt_u64(input_num) as u32 } +} diff --git a/src/stage2.rs b/src/stage2.rs index aaa51ad8..f33a3877 100644 --- a/src/stage2.rs +++ b/src/stage2.rs @@ -1,9 +1,11 @@ #![allow(dead_code)] +use crate::charutils::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -use crate::charutils::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(target_feature = "sse4.2")] use crate::sse42::stage1::SIMDJSON_PADDING; +#[cfg(target_feature = "neon")] +use crate::neon::stage1::SIMDJSON_PADDING; use crate::{Deserializer, Error, ErrorType, Result}; #[cfg_attr(not(feature = "no-inline"), inline(always))] diff --git a/src/value/generator.rs b/src/value/generator.rs index 55824f57..1d21a833 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -101,91 +101,91 @@ pub trait BaseGenerator { let mut len = string.len(); let mut idx = 0; - unsafe { +// unsafe { // Looking at the table above the lower 5 bits are entirely // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. 
- if AVX2_PRESENT { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while len - idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 32; - } - } - } +// if AVX2_PRESENT { +// let zero = _mm256_set1_epi8(0); +// let lower_quote_range = _mm256_set1_epi8(0x1F as i8); +// let quote = _mm256_set1_epi8(b'"' as i8); +// let backslash = _mm256_set1_epi8(b'\\' as i8); +// while len - idx >= 32 { +// // Load 32 bytes of data; +// #[allow(clippy::cast_ptr_alignment)] +// let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); +// // Test the data against being backslash and quote. 
+// let bs_or_quote = _mm256_or_si256( +// _mm256_cmpeq_epi8(data, backslash), +// _mm256_cmpeq_epi8(data, quote), +// ); +// // Now mask the data with the quote range (0x1F). +// let in_quote_range = _mm256_and_si256(data, lower_quote_range); +// // then test of the data is unchanged. aka: xor it with the +// // Any field that was inside the quote range it will be zero +// // now. +// let is_unchanged = _mm256_xor_si256(data, in_quote_range); +// let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); +// let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); +// if quote_bits != 0 { +// let quote_dist = trailingzeroes(quote_bits as u64) as usize; +// stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); +// let ch = string[idx + quote_dist]; +// match ESCAPED[ch as usize] { +// b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), +// +// escape => stry!(self.write(&[b'\\', escape])), +// }; +// string = &string[idx + quote_dist + 1..]; +// idx = 0; +// len = string.len(); +// } else { +// idx += 32; +// } +// } +// } // The case where we have a 16+ byte block // we repeate the same logic as above but with // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while len - idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..idx])); - string = &string[idx..]; - } +// let zero = _mm_set1_epi8(0); +// let lower_quote_range = _mm_set1_epi8(0x1F as i8); +// let quote = _mm_set1_epi8(b'"' as i8); +// let backslash = _mm_set1_epi8(b'\\' as i8); +// while len - idx > 16 { +// // Load 16 bytes of data; +// #[allow(clippy::cast_ptr_alignment)] +// let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); +// // Test the data against being backslash and quote. +// let bs_or_quote = +// _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); +// // Now mask the data with the quote range (0x1F). +// let in_quote_range = _mm_and_si128(data, lower_quote_range); +// // then test of the data is unchanged. aka: xor it with the +// // Any field that was inside the quote range it will be zero +// // now. 
+// let is_unchanged = _mm_xor_si128(data, in_quote_range); +// let in_range = _mm_cmpeq_epi8(is_unchanged, zero); +// let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); +// if quote_bits != 0 { +// let quote_dist = trailingzeroes(quote_bits as u64) as usize; +// stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); +// let ch = string[idx + quote_dist]; +// match ESCAPED[ch as usize] { +// b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), +// +// escape => stry!(self.write(&[b'\\', escape])), +// }; +// string = &string[idx + quote_dist + 1..]; +// idx = 0; +// len = string.len(); +// } else { +// idx += 16; +// } +// } +// stry!(self.get_writer().write_all(&string[0..idx])); +// string = &string[idx..]; +// } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() { if ESCAPED[*ch as usize] > 0 { From 9e96fabe62757039e05faf7aabffca9734e33acb Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 9 Aug 2019 18:21:50 -0400 Subject: [PATCH 02/34] feat: progress on neon compilation --- src/neon/intrinsics.rs | 180 +++++++-- src/neon/mod.rs | 2 +- src/neon/simd.rs | 1 + src/neon/simd_llvm.rs | 84 ++--- src/neon/stage1.rs | 814 +++++++++++++++++++---------------------- src/neon/utf8check.rs | 287 ++++++--------- 6 files changed, 686 insertions(+), 682 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index a2bca414..065047b6 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -1,10 +1,10 @@ //use std::arch:: use crate::neon::simd_llvm; -use crate::neon::simd; use std::mem; use std::hint::unreachable_unchecked; +use core; #[allow(unused)] macro_rules! types { @@ -21,21 +21,44 @@ macro_rules! 
types { )*) } -pub type poly64_t = i64; - extern "C" { #[link_name = "llvm.aarch64.neon.pmull64"] fn vmull_p64_(a: i64, b: i64) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.vshrq"] + fn vshrq_n_u8_(a: uint8x16_t, b: u8) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.addp.v16i8"] + fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.addp.v16i8"] + fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.addp.v16i32"] + fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[link_name = "llvm.aarch64.neon.vextq.v16s8"] + fn vextq_s8_(a: int8x16_t, b: int8x16_t, n:u8) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.vtstq.v16u8"] + fn vtstq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.vtstq.v16s8"] + fn vtstq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[link_name = "llvm.ctpop.u64"] + fn ctpop_u64_(a: u64) -> u32; + #[link_name = "llvm.ctlz.u64"] + fn ctlz_u64_(a: u64) -> u32; + #[link_name = "llvm.cttz.u64"] + fn cttz_u64_(a: u64) -> u32; } -//poly128_t vmull_p64 (poly64_t a, poly64_t b) #[inline] -#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(pmull))] -pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { +pub unsafe fn vmull_p64(a: i64, b: i64) -> uint8x16_t { mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) } + +#[inline] +pub unsafe fn vshrq_n_u8(a: uint8x16_t, b: u8) -> uint8x16_t { + // FIXME? + vshrq_n_u8_(a, b) +} + types! { /// ARM-specific 64-bit wide vector of eight packed `i8`. pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); @@ -47,8 +70,6 @@ types! { pub struct int16x4_t(i16, i16, i16, i16); /// ARM-specific 64-bit wide vector of four packed `u16`. pub struct uint16x4_t(u16, u16, u16, u16); - // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. - // pub struct float16x4_t(f16, f16, f16, f16); /// ARM-specific 64-bit wide vector of four packed `u16`. 
pub struct poly16x4_t(u16, u16, u16, u16); /// ARM-specific 64-bit wide vector of two packed `i32`. @@ -61,7 +82,6 @@ types! { pub struct int64x1_t(i64); /// ARM-specific 64-bit wide vector of one packed `u64`. pub struct uint64x1_t(u64); - /// ARM-specific 128-bit wide vector of sixteen packed `i8`. pub struct int8x16_t( i8, i8 ,i8, i8, i8, i8 ,i8, i8, @@ -81,8 +101,6 @@ types! { pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); /// ARM-specific 128-bit wide vector of eight packed `u16`. pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); - // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`. - // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16); /// ARM-specific 128-bit wide vector of eight packed `u16`. pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); /// ARM-specific 128-bit wide vector of four packed `i32`. @@ -95,8 +113,6 @@ types! { pub struct int64x2_t(i64, i64); /// ARM-specific 128-bit wide vector of two packed `u64`. pub struct uint64x2_t(u64, u64); - /// ARM-specific 128-bit wide vector of one packed `i128` - pub struct poly128_t(i128); } impl uint8x16_t { @@ -111,6 +127,33 @@ impl int8x16_t { } } +impl int32x4_t { + pub fn new(a:i32,b:i32,c:i32,d:i32) -> int32x4_t { + int32x4_t(a, b, c, d) + } +} + +#[inline] +pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { + let (carry, did_carry) = a.overflowing_add(b); + + if did_carry { + *out = carry; + }; + + did_carry +} + +#[inline] +pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { + *(addr as *const int8x16_t) +} + +#[inline] +pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { + *(addr as *const uint8x16_t) +} + macro_rules! aarch64_simd_2 { ($name:ident, $type:ty, $simd_fn:ident, $intr:ident) => { #[inline] @@ -136,6 +179,8 @@ aarch64_simd_ceq!(vceqq_u64, uint64x2_t); aarch64_simd_ceq!(vceq_p64, uint64x1_t); aarch64_simd_ceq!(vceqq_p64, uint64x2_t); +aarch64_simd_ceq!(vceqq_s8, int8x16_t); + macro_rules! 
aarch64_simd_cgt { ($name:ident, $type:ty) => { /// Compare signed Greater than (vector) @@ -209,20 +254,58 @@ aarch64_simd_cle!(vcleq_s64, int64x2_t); aarch64_simd_cleu!(vcle_u64, uint64x1_t); aarch64_simd_cleu!(vcleq_u64, uint64x2_t); +aarch64_simd_cleu!(vcgtq_s8, int8x16_t); + +#[inline] +pub fn vdupq_n_s8(a:i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zeroi8x16() -> int8x16_t { + int8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} +#[inline] pub fn vdupq_n_u8(a:u8) -> uint8x16_t { uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } -pub fn zero8x16() -> uint8x16_t { +#[inline] +pub fn vmovq_n_u8(a:u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zerou8x16() -> uint8x16_t { uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) } -pub unsafe fn vpaddq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_add(a, b) } +#[inline] +pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vpaddq_u8_(a, b) +} +#[inline] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vaddq_s8_(a, b) +} + +#[inline] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + vaddq_s32_(a, b) +} + +#[inline] pub unsafe fn vandq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } + +#[inline] pub unsafe fn vorrq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } + +#[inline] pub unsafe fn vandq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_and(a, b) } + +#[inline] pub unsafe fn vorrq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } macro_rules! arm_reinterpret { @@ -257,31 +340,56 @@ macro_rules! 
arm_vget_lane { }; } -arm_vget_lane!(vgetq_lane_u16, u16, uint8x16_t, 7); +arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); +pub unsafe fn vextq_s8(a:int8x16_t, b:int8x16_t, n:u8) -> int8x16_t { + vextq_s8_(a, b, n) +} + +#[inline] pub fn vqmovn_u64(a:uint64x2_t) -> uint32x2_t { uint32x2_t(a.0 as u32, a.1 as u32) } -//fn vaddq_s8() -> u16 { 0 } -//fn vceqq_s8() -> u16 { 0 } -//fn vceqq_u8() -> u16 { 0 } -//fn vcgtq_s8() -> u16 { 0 } -//fn vcleq_u8() -> u16 { 0 } -//fn vdupq_n_s8() -> u16 { 0 } -//fn vextq_s8() -> u16 { 0 } -//fn vget_lane_u64() -> u16 { 0 } -//fn vgetq_lane_u32() -> u16 { 0 } -//fn vgetq_lane_u64() -> u16 { 0 } -//fn vld1q_s8() -> u16 { 0 } -//fn vld1q_u8() -> u16 { 0 } -//fn vmovq_n_u8() -> u16 { 0 } -//fn vmull_p64() -> u16 { 0 } -//fn vorrq_s8() -> u16 { 0 } -//fn vqsubq_u8() -> u16 { 0 } -//fn vqtbl1q_s8() -> u16 { 0 } -//fn vqtbl1q_u8() -> u16 { 0 } -//fn vshrq_n_u8() -> u16 { 0 } -//fn vst1q_u8() -> u16 { 0 } \ No newline at end of file +#[inline] +pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + // FIXME? 
+ simd_llvm::simd_sub(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vtstq_u8_(a, b) +} + +#[inline] +pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vtstq_s8_(a, b) +} + +#[inline] +pub unsafe fn hamming(a:u64) -> u32 { + ctpop_u64_(a) +} + +#[inline] +pub unsafe fn trailingzeroes(a:u64) -> u32 { + cttz_u64_(a) +} + +#[inline] +pub unsafe fn vst1q_u32(addr: *mut u8, val : uint32x4_t) { + std::ptr::write(addr as *mut uint32x4_t, val) +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 6d23b075..c1a87d98 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -1,6 +1,6 @@ //pub mod deser; pub mod stage1; -//pub mod utf8check; +pub mod utf8check; mod simd; mod simd_llvm; mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs index ecf5a4cb..544bc0d8 100644 --- a/src/neon/simd.rs +++ b/src/neon/simd.rs @@ -1,4 +1,5 @@ #![allow(non_camel_case_types)] +#![allow(unused)] use crate::neon::simd_llvm; diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index a6c1eb52..892ae59c 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -1,54 +1,54 @@ extern "platform-intrinsic" { pub fn simd_eq(x: T, y: T) -> U; - pub fn simd_ne(x: T, y: T) -> U; +// pub fn simd_ne(x: T, y: T) -> U; pub fn simd_lt(x: T, y: T) -> U; pub fn simd_le(x: T, y: T) -> U; pub fn simd_gt(x: T, y: T) -> U; pub fn simd_ge(x: T, y: T) -> U; - - pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; - pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; - pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; - pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; - pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; - pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; - pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; - - pub fn simd_insert(x: T, idx: u32, val: U) -> T; +// +// pub fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; +// 
pub fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; +// pub fn simd_shuffle8(x: T, y: T, idx: [u32; 8]) -> U; +// pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; +// pub fn simd_shuffle32(x: T, y: T, idx: [u32; 32]) -> U; +// pub fn simd_shuffle64(x: T, y: T, idx: [u32; 64]) -> U; +// pub fn simd_shuffle128(x: T, y: T, idx: [u32; 128]) -> U; +// +// pub fn simd_insert(x: T, idx: u32, val: U) -> T; pub fn simd_extract(x: T, idx: u32) -> U; - - pub fn simd_cast(x: T) -> U; - +// +// pub fn simd_cast(x: T) -> U; +// pub fn simd_add(x: T, y: T) -> T; pub fn simd_sub(x: T, y: T) -> T; - pub fn simd_mul(x: T, y: T) -> T; - pub fn simd_div(x: T, y: T) -> T; - pub fn simd_shl(x: T, y: T) -> T; - pub fn simd_shr(x: T, y: T) -> T; +// pub fn simd_mul(x: T, y: T) -> T; +// pub fn simd_div(x: T, y: T) -> T; +// pub fn simd_shl(x: T, y: T) -> T; +// pub fn simd_shr(x: T, y: T) -> T; pub fn simd_and(x: T, y: T) -> T; pub fn simd_or(x: T, y: T) -> T; - pub fn simd_xor(x: T, y: T) -> T; - - pub fn simd_reduce_add_unordered(x: T) -> U; - pub fn simd_reduce_mul_unordered(x: T) -> U; - pub fn simd_reduce_add_ordered(x: T, acc: U) -> U; - pub fn simd_reduce_mul_ordered(x: T, acc: U) -> U; - pub fn simd_reduce_min(x: T) -> U; - pub fn simd_reduce_max(x: T) -> U; - pub fn simd_reduce_min_nanless(x: T) -> U; - pub fn simd_reduce_max_nanless(x: T) -> U; - pub fn simd_reduce_and(x: T) -> U; - pub fn simd_reduce_or(x: T) -> U; - pub fn simd_reduce_xor(x: T) -> U; - pub fn simd_reduce_all(x: T) -> bool; - pub fn simd_reduce_any(x: T) -> bool; - - pub fn simd_select(m: M, a: T, b: T) -> T; - pub fn simd_select_bitmask(m: M, a: T, b: T) -> T; - - pub fn simd_fmin(a: T, b: T) -> T; - pub fn simd_fmax(a: T, b: T) -> T; - - pub fn simd_fsqrt(a: T) -> T; - pub fn simd_fma(a: T, b: T, c: T) -> T; +// pub fn simd_xor(x: T, y: T) -> T; +// +// pub fn simd_reduce_add_unordered(x: T) -> U; +// pub fn simd_reduce_mul_unordered(x: T) -> U; +// pub fn simd_reduce_add_ordered(x: T, acc: U) -> U; 
+// pub fn simd_reduce_mul_ordered(x: T, acc: U) -> U; +// pub fn simd_reduce_min(x: T) -> U; +// pub fn simd_reduce_max(x: T) -> U; +// pub fn simd_reduce_min_nanless(x: T) -> U; +// pub fn simd_reduce_max_nanless(x: T) -> U; +// pub fn simd_reduce_and(x: T) -> U; +// pub fn simd_reduce_or(x: T) -> U; +// pub fn simd_reduce_xor(x: T) -> U; +// pub fn simd_reduce_all(x: T) -> bool; +// pub fn simd_reduce_any(x: T) -> bool; +// +// pub fn simd_select(m: M, a: T, b: T) -> T; +// pub fn simd_select_bitmask(m: M, a: T, b: T) -> T; +// +// pub fn simd_fmin(a: T, b: T) -> T; +// pub fn simd_fmax(a: T, b: T) -> T; +// +// pub fn simd_fsqrt(a: T) -> T; +// pub fn simd_fma(a: T, b: T, c: T) -> T; } \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 6e5f6b58..fc4daabc 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -1,15 +1,10 @@ #![allow(dead_code)] -//use crate::portability::*; use crate::neon::intrinsics::*; -//use core::arch::aarch64::*; -//use crate::neon::utf8check::*; +use crate::neon::utf8check::*; use crate::*; use std::mem; -#[macro_use] -use crate::neon::simd::*; - pub const SIMDJSON_PADDING: usize = mem::size_of::() * 4; #[derive(Debug)] @@ -24,34 +19,31 @@ fn fill_input(ptr: &[u8]) -> SimdInput { unsafe { #[allow(clippy::cast_ptr_alignment)] SimdInput { - v0: mem::transmute(ptr.as_ptr() as *const uint8x16_t), - v1: mem::transmute(ptr.as_ptr().add(16) as *const uint8x16_t), - v2: mem::transmute(ptr.as_ptr().add(32) as *const uint8x16_t), - v3: mem::transmute(ptr.as_ptr().add(48) as *const uint8x16_t), + v0: vld1q_u8(ptr.as_ptr() as *const u8), + v1: vld1q_u8(ptr.as_ptr().add(16) as *const u8), + v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), + v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), } } } #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn neon_movemask(input: uint8x16_t) -> u16 { - const bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 
0x8, 0x10, 0x20, 0x40, 0x80); + let bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); let minput: uint8x16_t = vandq_u8(input, bit_mask); let tmp: uint8x16_t = vpaddq_u8(minput, minput); let tmp = vpaddq_u8(tmp, tmp); let tmp = vpaddq_u8(tmp, tmp); - unsafe { - vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) - } + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) } - #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { - const bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + let bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); let t0 = vandq_u8(p0, bit_mask); let t1 = vandq_u8(p1, bit_mask); let t2 = vandq_u8(p2, bit_mask); @@ -60,8 +52,26 @@ unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: let sum1 = vpaddq_u8(t2, t3); let sum0 = vpaddq_u8(sum0, sum1); let sum0 = vpaddq_u8(sum0, sum0); - unsafe { - vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) + + vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) +} + +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + vgetq_lane_u64(vreinterpretq_u64_u8(vmull_p64(-1, quote_bits as i64)), 0) +} + +struct Utf8CheckingState { + has_error: int8x16_t, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: vdupq_n_s8(0), + previous:ProcessedUtfBytes::default() + } } } @@ -73,58 +83,54 @@ unsafe fn check_ascii_neon(si: &SimdInput) -> bool { let t3: uint8x16_t = vorrq_u8(t0, t1); let t4: uint8x16_t = vandq_u8(t3, high_bit); - unsafe { - let v64: uint64x2_t = vreinterpretq_u64_u8(t4); - let v32: uint32x2_t = vqmovn_u64(v64); - let 
result: uint64x1_t = vreinterpret_u64_u32(v32); + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); - vget_lane_u64(result, 0) == 0 - } + vget_lane_u64(result, 0) == 0 } -struct utf8_checking_state { - has_error: int8x16_t, - previous: processed_utf_bytes, +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_utf8( + input: &SimdInput, + state: &mut Utf8CheckingState, +) { + if check_ascii_neon(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let VERROR: int8x16_t = + int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); + state.has_error = + vorrq_s8(vcgtq_s8(state.previous.carried_continuations, VERROR), + state.has_error); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), + &(state.previous), &mut (state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), + &(state.previous), &mut (state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), + &(state.previous), &mut (state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), + &(state.previous), &mut (state.has_error)); + } } -// -//#[cfg_attr(not(feature = "no-inline"), inline(always))] -//unsafe fn check_utf8( -// input: &SimdInput, -// has_error: &mut uint8x16_t, -// previous: &mut utf8_checking_state, -//) { -// if check_ascii_neon(input) { -// // All bytes are ascii. Therefore the byte that was just before must be -// // ascii too. We only check the byte that was just before simd_input. Nines -// // are arbitrary values. 
-// const verror: int8x16_t = -// int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); -// state.has_error = -// vorrq_s8(vreinterpretq_s8_u8( -// vcgtq_s8(state.previous.carried_continuations, verror)), -// state.has_error); -// } else { -// // it is not ascii so we have to do heavy work -// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), -// &(state.previous), &(state.has_error)); -// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), -// &(state.previous), &(state.has_error)); -// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), -// &(state.previous), &(state.has_error)); -// state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), -// &(state.previous), &(state.has_error)); -// } -//} // a straightforward comparison of a mask against input #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { - const MASK: uint8x16_t = vdupq_n_u8(m); - - let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, MASK); - let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, MASK); - let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, MASK); - let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, MASK); + macro_rules! 
call { + ($imm8:expr) => { + vdupq_n_u8($imm8) + }; + }; + let mask: uint8x16_t = constify_imm8!(m, call); + + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) } @@ -132,16 +138,66 @@ unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { // find all values less than or equal than the content of maxval (using unsigned arithmetic) #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn unsigned_lteq_against_input(input: &SimdInput, maxval: u8) -> u64 { - const MASK: uint8x16_t = vdupq_n_u8(maxval); - - let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, MASK); - let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, MASK); - let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, MASK); - let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, MASK); + macro_rules! 
call { + ($imm8:expr) => { + vdupq_n_u8($imm8) + }; + }; + let mask: uint8x16_t = constify_imm8!(maxval, call); + + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, mask); neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) } +unsafe fn find_whitespace_and_structurals(input: &SimdInput, whitespace: &mut u64, structurals: &mut u64) { + let low_nibble_mask: uint8x16_t = uint8x16_t::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); + let high_nibble_mask: uint8x16_t = uint8x16_t::new(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); + let whitespace_shufti_mask: uint8x16_t = vmovq_n_u8(0x18); + let low_nib_and_mask: uint8x16_t = vmovq_n_u8(0xf); + + let nib_0_lo: uint8x16_t = vandq_u8(input.v0, low_nib_and_mask); + let nib_0_hi: uint8x16_t = vshrq_n_u8(input.v0, 4); + let shuf_0_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_0_lo); + let shuf_0_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_0_hi); + let v_0: uint8x16_t = vandq_u8(shuf_0_lo, shuf_0_hi); + + let nib_1_lo: uint8x16_t = vandq_u8(input.v1, low_nib_and_mask); + let nib_1_hi: uint8x16_t = vshrq_n_u8(input.v1, 4); + let shuf_1_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_1_lo); + let shuf_1_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_1_hi); + let v_1: uint8x16_t = vandq_u8(shuf_1_lo, shuf_1_hi); + + let nib_2_lo: uint8x16_t = vandq_u8(input.v2, low_nib_and_mask); + let nib_2_hi: uint8x16_t = vshrq_n_u8(input.v2, 4); + let shuf_2_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_2_lo); + let shuf_2_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_2_hi); + let v_2: uint8x16_t = vandq_u8(shuf_2_lo, shuf_2_hi); + + let nib_3_lo: uint8x16_t = vandq_u8(input.v3, low_nib_and_mask); + let nib_3_hi: uint8x16_t = vshrq_n_u8(input.v3, 4); + let shuf_3_lo: uint8x16_t = 
vqtbl1q_u8(low_nibble_mask, nib_3_lo); + let shuf_3_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_3_hi); + let v_3: uint8x16_t = vandq_u8(shuf_3_lo, shuf_3_hi); + + let tmp_0: uint8x16_t = vtstq_u8(v_0, structural_shufti_mask); + let tmp_1: uint8x16_t = vtstq_u8(v_1, structural_shufti_mask); + let tmp_2: uint8x16_t = vtstq_u8(v_2, structural_shufti_mask); + let tmp_3: uint8x16_t = vtstq_u8(v_3, structural_shufti_mask); + *structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + + let tmp_ws_0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); + let tmp_ws_1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); + let tmp_ws_2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); + let tmp_ws_3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); + *whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); +} + + unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { const EVEN_BITS: u64 = 0x5555_5555_5555_5555; const ODD_BITS: u64 = !EVEN_BITS; @@ -162,8 +218,8 @@ unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_bac let iter_ends_odd_backslash: bool = add_overflow(bs_bits, odd_starts, &mut odd_carries); odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end - // if we had an odd-numbered run at the - // end of the previous iteration + // if we had an odd-numbered run at the + // end of the previous iteration *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; let even_carry_ends: u64 = even_carries & !bs_bits; let odd_carry_ends: u64 = odd_carries & !bs_bits; @@ -173,13 +229,24 @@ unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_bac odd_ends } +//template <> +//really_inline ErrorValues check_utf8_errors( +// utf8_checking_state &state) { +// uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); +// uint32x2_t v32 = vqmovn_u64(v64); +// uint64x1_t result = 
vreinterpret_u64_u32(v32); +// return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR +// : simdjson::SUCCESS; +//} + unsafe fn find_quote_mask_and_bits(input: &SimdInput, odd_ends: u64, - prev_iter_inside_quote: &u64, quote_bits: &mut u64, error_mask: &u64) -> u64 { - quote_bits = cmp_mask_against_input(input, b'"'); - quote_bits = &(quote_bits & !odd_ends); - let quote_mask: u64 = vmull_p64(-1, quote_bits); + prev_iter_inside_quote: &mut u64, quote_bits: &mut u64, error_mask: &mut u64) -> u64 { + *quote_bits = cmp_mask_against_input(input, b'"'); + *quote_bits &= !odd_ends; + + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); - quote_mask ^= prev_iter_inside_quote; + quote_mask ^= *prev_iter_inside_quote; // All Unicode characters may be placed within the // quotation marks, except for the characters that MUST be escaped: @@ -187,374 +254,255 @@ unsafe fn find_quote_mask_and_bits(input: &SimdInput, odd_ends: u64, //through U+001F). // https://tools.ietf.org/html/rfc8259 let unescaped: u64 = unsigned_lteq_against_input(input, 0x1F); - error_mask |= quote_mask & unescaped; + *error_mask |= quote_mask & unescaped; // right shift of a signed value expected to be well-defined and standard // compliant as of C++20, // John Regher from Utah U. 
says this is fine code - prev_iter_inside_quote = quote_mask >> 63; + *prev_iter_inside_quote = quote_mask >> 63; quote_mask } -// -//#[cfg_attr(not(feature = "no-inline"), inline(always))] -//unsafe fn find_whitespace_and_structurals( -// input: &SimdInput, -// whitespace: &mut u64, -// structurals: &mut u64, -//) { -// // do a 'shufti' to detect structural JSON characters -// // they are -// // * `{` 0x7b -// // * `}` 0x7d -// // * `:` 0x3a -// // * `[` 0x5b -// // * `]` 0x5d -// // * `,` 0x2c -// // these go into the first 3 buckets of the comparison (1/2/4) -// -// // we are also interested in the four whitespace characters: -// // * space 0x20 -// // * linefeed 0x0a -// // * horizontal tab 0x09 -// // * carriage return 0x0d -// // these go into the next 2 buckets of the comparison (8/16) -// -// // TODO: const? -// let low_nibble_mask: __m128i = _mm_setr_epi8( -// 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, -// ); -// // TODO: const? -// let high_nibble_mask: __m128i = _mm_setr_epi8( -// 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, -// ); -// -// let structural_shufti_mask: __m128i = _mm_set1_epi8(0x7); -// let whitespace_shufti_mask: __m128i = _mm_set1_epi8(0x18); -// -// let v_v0: __m128i = _mm_and_si128( -// _mm_shuffle_epi8(low_nibble_mask, input.v0), -// _mm_shuffle_epi8( -// high_nibble_mask, -// _mm_and_si128(_mm_srli_epi32(input.v0, 4), _mm_set1_epi8(0x7f)), -// ), -// ); -// let v_v1: __m128i = _mm_and_si128( -// _mm_shuffle_epi8(low_nibble_mask, input.v1), -// _mm_shuffle_epi8( -// high_nibble_mask, -// _mm_and_si128(_mm_srli_epi32(input.v1, 4), _mm_set1_epi8(0x7f)), -// ), -// ); -// let v_v2: __m128i = _mm_and_si128( -// _mm_shuffle_epi8(low_nibble_mask, input.v2), -// _mm_shuffle_epi8( -// high_nibble_mask, -// _mm_and_si128(_mm_srli_epi32(input.v2, 4), _mm_set1_epi8(0x7f)), -// ), -// ); -// let v_v3: __m128i = _mm_and_si128( -// _mm_shuffle_epi8(low_nibble_mask, input.v3), -// _mm_shuffle_epi8( -// high_nibble_mask, -// 
_mm_and_si128(_mm_srli_epi32(input.v3, 4), _mm_set1_epi8(0x7f)), -// ), -// ); -// -// let tmp_v0: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v0, structural_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_v1: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v1, structural_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_v2: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v2, structural_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_v3: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v3, structural_shufti_mask), -// _mm_set1_epi8(0), -// ); -// -// let structural_res_0: u64 = u64::from(static_cast_u32!(_mm_movemask_epi8(tmp_v0))); -// let structural_res_1: u64 = _mm_movemask_epi8(tmp_v1) as u64; -// let structural_res_2: u64 = _mm_movemask_epi8(tmp_v2) as u64; -// let structural_res_3: u64 = _mm_movemask_epi8(tmp_v3) as u64; -// -// *structurals = !(structural_res_0 | (structural_res_1 << 16) | (structural_res_2 << 32) | (structural_res_3 << 48)); -// -// let tmp_ws_v0: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v0, whitespace_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_ws_v1: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v1, whitespace_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_ws_v2: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v2, whitespace_shufti_mask), -// _mm_set1_epi8(0), -// ); -// let tmp_ws_v3: __m128i = _mm_cmpeq_epi8( -// _mm_and_si128(v_v3, whitespace_shufti_mask), -// _mm_set1_epi8(0), -// ); -// -// let ws_res_0: u64 = u64::from(static_cast_u32!(_mm_movemask_epi8(tmp_ws_v0))); -// let ws_res_1: u64 = _mm_movemask_epi8(tmp_ws_v1) as u64; -// let ws_res_2: u64 = _mm_movemask_epi8(tmp_ws_v2) as u64; -// let ws_res_3: u64 = _mm_movemask_epi8(tmp_ws_v3) as u64; -// -// *whitespace = !(ws_res_0 | (ws_res_1 << 16) | (ws_res_2 << 32) | (ws_res_3 << 48)); -//} -// - // flatten out values in 'bits' assuming that they are are to have values of idx // plus their position in the bitvector, and store these indexes at // 
base_ptr[base] incrementing base as we go // will potentially store extra values beyond end of valid bits, so base_ptr // needs to be large enough to handle this //TODO: usize was u32 here does this matter? -//#[cfg_attr(not(feature = "no-inline"), inline(always))] -//fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { -// let cnt: usize = hamming(bits) as usize; -// let mut l = base.len(); -// let idx_minus_64 = idx.wrapping_sub(64); -// let idx_64_v = unsafe { -// _mm_set_epi32( -// static_cast_i32!(idx_minus_64), -// static_cast_i32!(idx_minus_64), -// static_cast_i32!(idx_minus_64), -// static_cast_i32!(idx_minus_64), -// ) -// }; -// -// // We're doing some trickery here. -// // We reserve 64 extra entries, because we've at most 64 bit to set -// // then we trunctate the base to the next base (that we calcuate above) -// // We later indiscriminatory writre over the len we set but that's OK -// // since we ensure we reserve the needed space -// base.reserve(64); -// unsafe { -// base.set_len(l + cnt); -// } -// -// while bits != 0 { -// unsafe { -// let v0 = static_cast_i32!(trailingzeroes(bits)); -// bits &= bits.wrapping_sub(1); -// let v1 = static_cast_i32!(trailingzeroes(bits)); -// bits &= bits.wrapping_sub(1); -// let v2 = static_cast_i32!(trailingzeroes(bits)); -// bits &= bits.wrapping_sub(1); -// let v3 = static_cast_i32!(trailingzeroes(bits)); -// bits &= bits.wrapping_sub(1); -// -// let v: __m128i = _mm_set_epi32(v3, v2, v1, v0); -// let v: __m128i = _mm_add_epi32(idx_64_v, v); -// #[allow(clippy::cast_ptr_alignment)] -// _mm_storeu_si128(base.as_mut_ptr().add(l) as *mut __m128i, v); -// } -// l += 4; -// } -//} +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { + let cnt: usize = hamming(bits) as usize; + let mut l = base.len(); + let idx_minus_64 = idx.wrapping_sub(64); + let idx_64_v = int32x4_t::new( + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), 
+ static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + ); + + // We're doing some trickery here. + // We reserve 64 extra entries, because we've at most 64 bit to set + // then we trunctate the base to the next base (that we calcuate above) + // We later indiscriminatory writre over the len we set but that's OK + // since we ensure we reserve the needed space + base.reserve(64); + base.set_len(l + cnt); + + while bits != 0 { + let v0 : i32 = static_cast_i32!(trailingzeroes(bits)); + bits &= bits.wrapping_sub(1); + let v1 : i32 = static_cast_i32!(trailingzeroes(bits)); + bits &= bits.wrapping_sub(1); + let v2 : i32 = static_cast_i32!(trailingzeroes(bits)); + bits &= bits.wrapping_sub(1); + let v3 : i32 = static_cast_i32!(trailingzeroes(bits)); + bits &= bits.wrapping_sub(1); + + let v: int32x4_t = int32x4_t::new(v3, v2, v1, v0); + let v: int32x4_t = vaddq_s32(idx_64_v, v); + + std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v); + l += 4; + } +} -// -//// return a updated structural bit vector with quoted contents cleared out and -//// pseudo-structural characters added to the mask -//// updates prev_iter_ends_pseudo_pred which tells us whether the previous -//// iteration ended on a whitespace or a structural character (which means that -//// the next iteration -//// will have a pseudo-structural character at its start) -//#[cfg_attr(not(feature = "no-inline"), inline(always))] -//fn finalize_structurals( -// mut structurals: u64, -// whitespace: u64, -// quote_mask: u64, -// quote_bits: u64, -// prev_iter_ends_pseudo_pred: &mut u64, -//) -> u64 { -// // mask off anything inside quotes -// structurals &= !quote_mask; -// // add the real quote bits back into our bitmask as well, so we can -// // quickly traverse the strings we've spent all this trouble gathering -// structurals |= quote_bits; -// // Now, establish "pseudo-structural characters". 
These are non-whitespace -// // characters that are (a) outside quotes and (b) have a predecessor that's -// // either whitespace or a structural character. This means that subsequent -// // passes will get a chance to encounter the first character of every string -// // of non-whitespace and, if we're parsing an atom like true/false/null or a -// // number we can stop at the first whitespace or structural character -// // following it. -// -// // a qualified predecessor is something that can happen 1 position before an -// // psuedo-structural character -// let pseudo_pred: u64 = structurals | whitespace; -// -// let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred; -// *prev_iter_ends_pseudo_pred = pseudo_pred >> 63; -// let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask); -// structurals |= pseudo_structurals; -// -// // now, we've used our close quotes all we need to. So let's switch them off -// // they will be off in the quote mask and on in quote bits. -// structurals &= !(quote_bits & !quote_mask); -// structurals -//} -// - -//impl<'de> Deserializer<'de> { -// #[inline(never)] -// pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { -// let len = input.len(); -// // 6 is a heuristic number to estimate it turns out a rate of 1/6 structural caracters lears -// // almost never to relocations. -// let mut structural_indexes = Vec::with_capacity(len / 6); -// structural_indexes.push(0); // push extra root element -// -// let mut has_error: __m128i = _mm_setzero_si128(); -// let mut previous = AvxProcessedUtfBytes::default(); -// // we have padded the input out to 64 byte multiple with the remainder being -// // zeros -// -// // persistent state across loop -// // does the last iteration end with an odd-length sequence of backslashes? 
-// // either 0 or 1, but a 64-bit value -// let mut prev_iter_ends_odd_backslash: u64 = 0; -// // does the previous iteration end inside a double-quote pair? -// let mut prev_iter_inside_quote: u64 = 0; -// // either all zeros or all ones -// // does the previous iteration end on something that is a predecessor of a -// // pseudo-structural character - i.e. whitespace or a structural character -// // effectively the very first char is considered to follow "whitespace" for -// // the -// // purposes of pseudo-structural character detection so we initialize to 1 -// let mut prev_iter_ends_pseudo_pred: u64 = 1; -// -// // structurals are persistent state across loop as we flatten them on the -// // subsequent iteration into our array pointed to be base_ptr. -// // This is harmless on the first iteration as structurals==0 -// // and is done for performance reasons; we can hide some of the latency of the -// // expensive carryless multiply in the previous step with this work -// let mut structurals: u64 = 0; -// -// let lenminus64: usize = if len < 64 { 0 } else { len as usize - 64 }; -// let mut idx: usize = 0; -// let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20) -// -// while idx < lenminus64 { -// /* -// #ifndef _MSC_VER -// __builtin_prefetch(buf + idx + 128); -// #endif -// */ -// let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); -// check_utf8(&input, &mut has_error, &mut previous); -// // detect odd sequences of backslashes -// let odd_ends: u64 = -// find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); -// -// // detect insides of quote pairs ("quote_mask") and also our quote_bits -// // themselves -// let mut quote_bits: u64 = 0; -// let quote_mask: u64 = find_quote_mask_and_bits( -// &input, -// odd_ends, -// &mut prev_iter_inside_quote, -// &mut quote_bits, -// &mut error_mask, -// ); -// -// // take the previous iterations structural bits, not our current iteration, 
-// // and flatten -// flatten_bits(&mut structural_indexes, idx as u32, structurals); -// -// let mut whitespace: u64 = 0; -// find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); -// -// // fixup structurals to reflect quotes and add pseudo-structural characters -// structurals = finalize_structurals( -// structurals, -// whitespace, -// quote_mask, -// quote_bits, -// &mut prev_iter_ends_pseudo_pred, -// ); -// idx += 64; -// } -// -// // we use a giant copy-paste which is ugly. -// // but otherwise the string needs to be properly padded or else we -// // risk invalidating the UTF-8 checks. -// if idx < len { -// let mut tmpbuf: [u8; 64] = [0x20; 64]; -// tmpbuf -// .as_mut_ptr() -// .copy_from(input.as_ptr().add(idx), len as usize - idx); -// let input: SimdInput = fill_input(&tmpbuf); -// -// check_utf8(&input, &mut has_error, &mut previous); -// -// // detect odd sequences of backslashes -// let odd_ends: u64 = -// find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); -// -// // detect insides of quote pairs ("quote_mask") and also our quote_bits -// // themselves -// let mut quote_bits: u64 = 0; -// let quote_mask: u64 = find_quote_mask_and_bits( -// &input, -// odd_ends, -// &mut prev_iter_inside_quote, -// &mut quote_bits, -// &mut error_mask, -// ); -// -// // take the previous iterations structural bits, not our current iteration, -// // and flatten -// flatten_bits(&mut structural_indexes, idx as u32, structurals); -// -// let mut whitespace: u64 = 0; -// find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); -// -// // fixup structurals to reflect quotes and add pseudo-structural characters -// structurals = finalize_structurals( -// structurals, -// whitespace, -// quote_mask, -// quote_bits, -// &mut prev_iter_ends_pseudo_pred, -// ); -// idx += 64; -// } -// // This test isn't in upstream, for some reason the error mask is et for then. 
-// if prev_iter_inside_quote != 0 { -// return Err(ErrorType::Syntax); -// } -// // finally, flatten out the remaining structurals from the last iteration -// flatten_bits(&mut structural_indexes, idx as u32, structurals); -// -// // a valid JSON file cannot have zero structural indexes - we should have -// // found something (note that we compare to 1 as we always add the root!) -// if structural_indexes.len() == 1 { -// return Err(ErrorType::EOF); -// } -// -// if structural_indexes.last() > Some(&(len as u32)) { -// return Err(ErrorType::InternalError); -// } -// -// if error_mask != 0 { -// return Err(ErrorType::Syntax); -// } -// -// if _mm_testz_si128(has_error, has_error) != 0 { -// Ok(structural_indexes) -// } else { -// Err(ErrorType::InvalidUTF8) -// } -// } -//} + +// return a updated structural bit vector with quoted contents cleared out and +// pseudo-structural characters added to the mask +// updates prev_iter_ends_pseudo_pred which tells us whether the previous +// iteration ended on a whitespace or a structural character (which means that +// the next iteration +// will have a pseudo-structural character at its start) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn finalize_structurals( + mut structurals: u64, + whitespace: u64, + quote_mask: u64, + quote_bits: u64, + prev_iter_ends_pseudo_pred: &mut u64, +) -> u64 { + // mask off anything inside quotes + structurals &= !quote_mask; + // add the real quote bits back into our bitmask as well, so we can + // quickly traverse the strings we've spent all this trouble gathering + structurals |= quote_bits; + // Now, establish "pseudo-structural characters". These are non-whitespace + // characters that are (a) outside quotes and (b) have a predecessor that's + // either whitespace or a structural character. 
 This means that subsequent
+    // passes will get a chance to encounter the first character of every string
+    // of non-whitespace and, if we're parsing an atom like true/false/null or a
+    // number we can stop at the first whitespace or structural character
+    // following it.
+
+    // a qualified predecessor is something that can happen 1 position before a
+    // pseudo-structural character
+    let pseudo_pred: u64 = structurals | whitespace;
+
+    let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred;
+    *prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
+    let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask);
+    structurals |= pseudo_structurals;
+
+    // now, we've used our close quotes all we need to. So let's switch them off
+    // they will be off in the quote mask and on in quote bits.
+    structurals &= !(quote_bits & !quote_mask);
+    structurals
+}
+
+impl<'de> Deserializer<'de> {
+    #[inline(never)]
+    pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result<Vec<u32>, ErrorType> {
+        let len = input.len();
+        // 6 is a heuristic number to estimate; it turns out a rate of 1/6 structural characters leads
+        // almost never to relocations.
+        let mut structural_indexes = Vec::with_capacity(len / 6);
+        structural_indexes.push(0); // push extra root element
+
+        // let mut has_error: uint8x16_t = vdupq_n_u8(0);
+        let mut previous = Utf8CheckingState::default();
+        // we have padded the input out to 64 byte multiple with the remainder being
+        // zeros
+
+        // persistent state across loop
+        // does the last iteration end with an odd-length sequence of backslashes?
+        // either 0 or 1, but a 64-bit value
+        let mut prev_iter_ends_odd_backslash: u64 = 0;
+        // does the previous iteration end inside a double-quote pair?
+        let mut prev_iter_inside_quote: u64 = 0;
+        // either all zeros or all ones
+        // does the previous iteration end on something that is a predecessor of a
+        // pseudo-structural character - i.e.
whitespace or a structural character + // effectively the very first char is considered to follow "whitespace" for + // the + // purposes of pseudo-structural character detection so we initialize to 1 + let mut prev_iter_ends_pseudo_pred: u64 = 1; + + // structurals are persistent state across loop as we flatten them on the + // subsequent iteration into our array pointed to be base_ptr. + // This is harmless on the first iteration as structurals==0 + // and is done for performance reasons; we can hide some of the latency of the + // expensive carryless multiply in the previous step with this work + let mut structurals: u64 = 0; + + let lenminus64: usize = if len < 64 { 0 } else { len as usize - 64 }; + let mut idx: usize = 0; + let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20) + + while idx < lenminus64 { + /* + #ifndef _MSC_VER + __builtin_prefetch(buf + idx + 128); + #endif + */ + let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); + check_utf8(&input, &mut previous); + // detect odd sequences of backslashes + let odd_ends: u64 = + find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); + + // detect insides of quote pairs ("quote_mask") and also our quote_bits + // themselves + let mut quote_bits: u64 = 0; + let quote_mask: u64 = find_quote_mask_and_bits( + &input, + odd_ends, + &mut prev_iter_inside_quote, + &mut quote_bits, + &mut error_mask, + ); + + // take the previous iterations structural bits, not our current iteration, + // and flatten + flatten_bits(&mut structural_indexes, idx as u32, structurals); + + let mut whitespace: u64 = 0; + find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals); + + // fixup structurals to reflect quotes and add pseudo-structural characters + structurals = finalize_structurals( + structurals, + whitespace, + quote_mask, + quote_bits, + &mut prev_iter_ends_pseudo_pred, + ); + idx += 64; + } + + // we use a giant 
 copy-paste which is ugly.
+        // but otherwise the string needs to be properly padded or else we
+        // risk invalidating the UTF-8 checks.
+        if idx < len {
+            let mut tmpbuf: [u8; 64] = [0x20; 64];
+            tmpbuf
+                .as_mut_ptr()
+                .copy_from(input.as_ptr().add(idx), len as usize - idx);
+            let input: SimdInput = fill_input(&tmpbuf);
+
+            check_utf8(&input, &mut previous);
+
+            // detect odd sequences of backslashes
+            let odd_ends: u64 =
+                find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash);
+
+            // detect insides of quote pairs ("quote_mask") and also our quote_bits
+            // themselves
+            let mut quote_bits: u64 = 0;
+            let quote_mask: u64 = find_quote_mask_and_bits(
+                &input,
+                odd_ends,
+                &mut prev_iter_inside_quote,
+                &mut quote_bits,
+                &mut error_mask,
+            );
+
+            // take the previous iterations structural bits, not our current iteration,
+            // and flatten
+            flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+            let mut whitespace: u64 = 0;
+            find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals);
+
+            // fixup structurals to reflect quotes and add pseudo-structural characters
+            structurals = finalize_structurals(
+                structurals,
+                whitespace,
+                quote_mask,
+                quote_bits,
+                &mut prev_iter_ends_pseudo_pred,
+            );
+            idx += 64;
+        }
+
+        // This test isn't in upstream, for some reason the error mask is set for then.
+        if prev_iter_inside_quote != 0 {
+            return Err(ErrorType::Syntax);
+        }
+        // finally, flatten out the remaining structurals from the last iteration
+        flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+        // a valid JSON file cannot have zero structural indexes - we should have
+        // found something (note that we compare to 1 as we always add the root!)
+ if structural_indexes.len() == 1 { + return Err(ErrorType::EOF); + } + + if structural_indexes.last() > Some(&(len as u32)) { + return Err(ErrorType::InternalError); + } + + if error_mask != 0 { + return Err(ErrorType::Syntax); + } + + let err: i128 = mem::transmute(vtstq_s8(previous.has_error, previous.has_error)); + + if err != 0 { + Ok(structural_indexes) + } else { + Err(ErrorType::InvalidUTF8) + } + } +} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 04bf0640..de190f46 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -1,4 +1,6 @@ -use crate::*; + +use crate::neon::intrinsics::*; +use std::mem; /* * legal utf-8 byte sequence @@ -17,108 +19,78 @@ use crate::*; * */ -// all byte values must be no larger than 0xF4 - -/*****************************/ -#[cfg_attr(not(feature = "no-inline"), inline)] -fn push_last_byte_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { - unsafe { _mm_alignr_epi8(b, a, 15) } -} - -#[cfg_attr(not(feature = "no-inline"), inline)] -fn push_last_2bytes_of_a_to_b(a: __m128i, b: __m128i) -> __m128i { - unsafe { _mm_alignr_epi8(b, a, 14) } -} - // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_smaller_than_0xf4(current_bytes: __m128i, has_error: &mut __m128i) { +unsafe fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { // unsigned, saturates to 0 below max - *has_error = unsafe { - _mm_or_si128( + *has_error = + vorrq_s8( *has_error, - _mm_subs_epu8(current_bytes, _mm_set1_epi8(static_cast_i8!(0xF4u8))), - ) - }; + vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcontinuation_lengths(high_nibbles: __m128i) -> __m128i { - unsafe { - _mm_shuffle_epi8( - _mm_setr_epi8( - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - ), 
- high_nibbles, - ) - } +unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + let NIBBLES : int8x16_t = int8x16_t::new( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ); + + vqtbl1q_s8(NIBBLES, vreinterpretq_u8_s8(high_nibbles)) } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcarry_continuations(initial_lengths: __m128i, previous_carries: __m128i) -> __m128i { - unsafe { - let right1: __m128i = _mm_subs_epu8( - push_last_byte_of_a_to_b(previous_carries, initial_lengths), - _mm_set1_epi8(1), - ); - let sum: __m128i = _mm_add_epi8(initial_lengths, right1); - let right2: __m128i = _mm_subs_epu8( - push_last_2bytes_of_a_to_b(previous_carries, sum), - _mm_set1_epi8(2), - ); - _mm_add_epi8(sum, right2) - } +unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + let right1 : int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), + vdupq_n_u8(1))); + let sum : int8x16_t = vaddq_s8(initial_lengths, right1); + + let right2 : int8x16_t = vreinterpretq_s8_u8( + vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), + vdupq_n_u8(2))); + + vaddq_s8(sum, right2) } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_continuations(initial_lengths: __m128i, carries: __m128i, has_error: &mut __m128i) { +unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) - unsafe { - let overunder: __m128i = _mm_cmpeq_epi8( - _mm_cmpgt_epi8(carries, initial_lengths), - _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()), - ); - *has_error = _mm_or_si128(*has_error, overunder); - } + // FIXME: was vceqq_u8 ? 
+ let overunder : int8x16_t = vceqq_s8( + vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0))); + + *has_error = vorrq_s8(*has_error, overunder); } // when 0xED is found, next byte must be no larger than 0x9F // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_first_continuation_max( - current_bytes: __m128i, - off1_current_bytes: __m128i, - has_error: &mut __m128i, +unsafe fn check_first_continuation_max( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + has_error: &mut int8x16_t, ) { - unsafe { - let mask_ed: __m128i = _mm_cmpeq_epi8( - off1_current_bytes, - _mm_set1_epi8(static_cast_i8!(0xEDu8)), - ); - let mask_f4: __m128i = _mm_cmpeq_epi8( - off1_current_bytes, - _mm_set1_epi8(static_cast_i8!(0xF4u8)), - ); - - let badfollow_ed: __m128i = _mm_and_si128( - _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(static_cast_i8!(0x9Fu8))), - mask_ed, - ); - let badfollow_f4: __m128i = _mm_and_si128( - _mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(static_cast_i8!(0x8Fu8))), - mask_f4, - ); - - *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollow_ed, badfollow_f4)); - } + let maskED : int8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED)); + let maskF4 : int8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4)); + + // FIXME: was vandq_u8? 
+ let badfollowED : int8x16_t = + vandq_s8(vcgtq_s8(current_bytes, vdupq_n_s8(0x9F)), maskED); + let badfollowF4 : int8x16_t = + vandq_s8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4); + + *has_error = vorrq_s8( + *has_error, vorrq_s8(badfollowED, badfollowF4)); } // map off1_hibits => error condition @@ -128,120 +100,95 @@ fn avxcheck_first_continuation_max( // F => < F1 && < 90 // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] -fn avxcheck_overlong( - current_bytes: __m128i, - off1_current_bytes: __m128i, - hibits: __m128i, - previous_hibits: __m128i, - has_error: &mut __m128i, +unsafe fn check_overlong( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + hibits: int8x16_t, + previous_hibits: int8x16_t, + has_error: &mut int8x16_t, ) { - unsafe { - let off1_hibits: __m128i = push_last_byte_of_a_to_b(previous_hibits, hibits); - let initial_mins: __m128i = _mm_shuffle_epi8( - _mm_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - static_cast_i8!(0xC2u8), - -128, // 110x - static_cast_i8!(0xE1u8), // 1110 - static_cast_i8!(0xF1u8), // 1111 - ), - off1_hibits, - ); - - let initial_under: __m128i = _mm_cmpgt_epi8(initial_mins, off1_current_bytes); - - let second_mins: __m128i = _mm_shuffle_epi8( - _mm_setr_epi8( - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, - -128, // 10xx => false - 127, - 127, // 110x => true - static_cast_i8!(0xA0u8), // 1110 - static_cast_i8!(0x90u8), // 1111 - ), - off1_hibits, - ); - let second_under: __m128i = _mm_cmpgt_epi8(second_mins, current_bytes); - *has_error = _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under)); - } + let _initial_mins : int8x16_t = int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 0xC2, -128, // 110x + 0xE1, // 1110 + 0xF1, + ); + + let _second_mins : int8x16_t = int8x16_t::new( + -128, 
-128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + 0xA0, // 1110 + 0x90, + ); + + let off1_hibits : int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); + let initial_mins : int8x16_t = + vqtbl1q_s8(_initial_mins, vreinterpretq_u8_s8(off1_hibits)); + + let initial_under : int8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + + let second_mins : int8x16_t = + vqtbl1q_s8(_second_mins, vreinterpretq_u8_s8(off1_hibits)); + + let second_under : int8x16_t = vcgtq_s8(second_mins, current_bytes); + + *has_error = vorrq_s8( + *has_error, vandq_s8(initial_under, second_under)); } -pub struct AvxProcessedUtfBytes { - rawbytes: __m128i, - high_nibbles: __m128i, - pub carried_continuations: __m128i, +pub struct ProcessedUtfBytes { + rawbytes: int8x16_t, + high_nibbles: int8x16_t, + pub carried_continuations: int8x16_t, } -impl Default for AvxProcessedUtfBytes { +impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { - unsafe { - AvxProcessedUtfBytes { - rawbytes: _mm_setzero_si128(), - high_nibbles: _mm_setzero_si128(), - carried_continuations: _mm_setzero_si128(), - } + ProcessedUtfBytes { + rawbytes: vdupq_n_s8(0x00), + high_nibbles: vdupq_n_s8(0x00), + carried_continuations: vdupq_n_s8(0x00), } } } #[cfg_attr(not(feature = "no-inline"), inline)] -fn avx_count_nibbles(bytes: __m128i, answer: &mut AvxProcessedUtfBytes) { +unsafe fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = - unsafe { _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F)) }; + answer.high_nibbles = vreinterpretq_s8_u8(mem::transmute(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4))); } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated #[cfg_attr(not(feature = "no-inline"), inline)] -pub fn avxcheck_utf8_bytes( - current_bytes: __m128i, - previous: 
&AvxProcessedUtfBytes, - has_error: &mut __m128i, -) -> AvxProcessedUtfBytes { - let mut pb = AvxProcessedUtfBytes::default(); - avx_count_nibbles(current_bytes, &mut pb); +pub fn check_utf8_bytes( + current_bytes: int8x16_t, + previous: &ProcessedUtfBytes, + has_error: &mut int8x16_t, +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + unsafe { + count_nibbles(current_bytes, &mut pb); - avxcheck_smaller_than_0xf4(current_bytes, has_error); + check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: __m128i = avxcontinuation_lengths(pb.high_nibbles); + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); - pb.carried_continuations = - avxcarry_continuations(initial_lengths, previous.carried_continuations); + pb.carried_continuations = + carry_continuations(initial_lengths, previous.carried_continuations); - avxcheck_continuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); - let off1_current_bytes: __m128i = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); - avxcheck_first_continuation_max(current_bytes, off1_current_bytes, has_error); + let off1_current_bytes : int8x16_t = + vextq_s8(previous.rawbytes, pb.rawbytes, 16 - 1); - avxcheck_overlong( - current_bytes, - off1_current_bytes, - pb.high_nibbles, - previous.high_nibbles, - has_error, - ); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); + + check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles, + previous.high_nibbles, has_error); + } pb } From 05fd4f3c65bff1e401fd30e39eb1edc9d77b6d02 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 9 Aug 2019 18:56:26 -0400 Subject: [PATCH 03/34] feat: partial implementation of deserializer (broken, needs movemask) --- src/lib.rs | 262 +++++++++++++++--------------- src/neon/deser.rs | 400 +++++++++++++++++++++++----------------------- src/neon/mod.rs | 2 +- 3 files changed, 
332 insertions(+), 332 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3973b7a5..d2d9f10e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,16 +79,16 @@ //! let v: Value = simd_json::serde::from_slice(&mut d).unwrap(); //! ``` -//#[cfg(feature = "serde_impl")] -//extern crate serde as serde_ext; -//#[cfg(feature = "serde_impl")] -//pub mod serde; +#[cfg(feature = "serde_impl")] +extern crate serde as serde_ext; +#[cfg(feature = "serde_impl")] +pub mod serde; mod charutils; #[macro_use] mod macros; mod error; -//mod numberparse; +mod numberparse; mod parsedjson; mod stringparse; @@ -109,19 +109,19 @@ use crate::sse42::stage1::SIMDJSON_PADDING; //#[cfg(target_feature = "neon")] mod neon; //#[cfg(target_feature = "neon")] -//pub use crate::neon::deser::*; +pub use crate::neon::deser::*; //#[cfg(target_feature = "neon")] -//use crate::neon::stage1::SIMDJSON_PADDING; +use crate::neon::stage1::SIMDJSON_PADDING; -//mod stage2; -//pub mod value; +mod stage2; +pub mod value; -//use crate::numberparse::Number; +use crate::numberparse::Number; //use std::mem; //use std::str; pub use crate::error::{Error, ErrorType}; -//pub use crate::value::*; +pub use crate::value::*; pub type Result = std::result::Result; @@ -137,126 +137,126 @@ pub struct Deserializer<'de> { str_offset: usize, iidx: usize, } -// -//impl<'de> Deserializer<'de> { -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn error(&self, error: ErrorType) -> Error { -// Error::new(self.idx, self.iidx, self.c() as char, error) -// } -// // By convention, `Deserializer` constructors are named like `from_xyz`. -// // That way basic use cases are satisfied by something like -// // `serde_json::from_str(...)` while advanced use cases that require a -// // deserializer can make one with `serde_json::Deserializer::from_str(...)`. -// pub fn from_slice(input: &'de mut [u8]) -> Result { -// // We have to pick an initial size of the structural indexes. 
-// // 6 is a heuristic that seems to work well for the benchmark -// // data and limit re-allocation frequency. -// -// let len = input.len(); -// -// let buf_start: usize = input.as_ptr() as *const () as usize; -// let needs_relocation = (buf_start + input.len()) % page_size::get() < SIMDJSON_PADDING; -// -// let s1_result: std::result::Result, ErrorType> = if needs_relocation { -// let mut data: Vec = Vec::with_capacity(len + SIMDJSON_PADDING); -// unsafe { -// data.set_len(len + 1); -// data.as_mut_slice() -// .get_unchecked_mut(0..len) -// .clone_from_slice(input); -// *(data.get_unchecked_mut(len)) = 0; -// data.set_len(len); -// Deserializer::find_structural_bits(&data) -// } -// } else { -// unsafe { Deserializer::find_structural_bits(input) } -// }; -// let structural_indexes = match s1_result { -// Ok(i) => i, -// Err(t) => { -// return Err(Error::generic(t)); -// } -// }; -// -// let counts = Deserializer::validate(input, &structural_indexes)?; -// -// let strings = Vec::with_capacity(len + SIMDJSON_PADDING); -// -// Ok(Deserializer { -// counts, -// structural_indexes, -// input, -// idx: 0, -// strings, -// str_offset: 0, -// iidx: 0, -// }) -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn skip(&mut self) { -// self.idx += 1; -// self.iidx = unsafe { *self.structural_indexes.get_unchecked(self.idx) as usize }; -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn c(&self) -> u8 { -// unsafe { -// *self -// .input -// .get_unchecked(*self.structural_indexes.get_unchecked(self.idx) as usize) -// } -// } -// -// // pull out the check so we don't need to -// // stry every time -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn next_(&mut self) -> u8 { -// unsafe { -// self.idx += 1; -// self.iidx = *self.structural_indexes.get_unchecked(self.idx) as usize; -// *self.input.get_unchecked(self.iidx) -// } -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn 
count_elements(&self) -> usize { -// unsafe { *self.counts.get_unchecked(self.idx) } -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn parse_number_root(&mut self, minus: bool) -> Result { -// let input = unsafe { &self.input.get_unchecked(self.iidx..) }; -// let len = input.len(); -// let mut copy = vec![0u8; len + SIMDJSON_PADDING]; -// copy[len] = 0; -// unsafe { -// copy.as_mut_ptr().copy_from(input.as_ptr(), len); -// }; -// self.parse_number_int(©, minus) -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn parse_number(&mut self, minus: bool) -> Result { -// let input = unsafe { &self.input.get_unchecked(self.iidx..) }; -// let len = input.len(); -// if len < SIMDJSON_PADDING { -// let mut copy = vec![0u8; len + SIMDJSON_PADDING]; -// unsafe { -// copy.as_mut_ptr().copy_from(input.as_ptr(), len); -// }; -// self.parse_number_int(©, minus) -// } else { -// self.parse_number_int(input, minus) -// } -// } -// -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// fn parse_number_(&mut self, minus: bool) -> Result { -// let input = unsafe { &self.input.get_unchecked(self.iidx..) }; -// self.parse_number_int(input, minus) -// } -//} + +impl<'de> Deserializer<'de> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn error(&self, error: ErrorType) -> Error { + Error::new(self.idx, self.iidx, self.c() as char, error) + } + // By convention, `Deserializer` constructors are named like `from_xyz`. + // That way basic use cases are satisfied by something like + // `serde_json::from_str(...)` while advanced use cases that require a + // deserializer can make one with `serde_json::Deserializer::from_str(...)`. + pub fn from_slice(input: &'de mut [u8]) -> Result { + // We have to pick an initial size of the structural indexes. + // 6 is a heuristic that seems to work well for the benchmark + // data and limit re-allocation frequency. 
+ + let len = input.len(); + + let buf_start: usize = input.as_ptr() as *const () as usize; + let needs_relocation = (buf_start + input.len()) % page_size::get() < SIMDJSON_PADDING; + + let s1_result: std::result::Result, ErrorType> = if needs_relocation { + let mut data: Vec = Vec::with_capacity(len + SIMDJSON_PADDING); + unsafe { + data.set_len(len + 1); + data.as_mut_slice() + .get_unchecked_mut(0..len) + .clone_from_slice(input); + *(data.get_unchecked_mut(len)) = 0; + data.set_len(len); + Deserializer::find_structural_bits(&data) + } + } else { + unsafe { Deserializer::find_structural_bits(input) } + }; + let structural_indexes = match s1_result { + Ok(i) => i, + Err(t) => { + return Err(Error::generic(t)); + } + }; + + let counts = Deserializer::validate(input, &structural_indexes)?; + + let strings = Vec::with_capacity(len + SIMDJSON_PADDING); + + Ok(Deserializer { + counts, + structural_indexes, + input, + idx: 0, + strings, + str_offset: 0, + iidx: 0, + }) + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn skip(&mut self) { + self.idx += 1; + self.iidx = unsafe { *self.structural_indexes.get_unchecked(self.idx) as usize }; + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn c(&self) -> u8 { + unsafe { + *self + .input + .get_unchecked(*self.structural_indexes.get_unchecked(self.idx) as usize) + } + } + + // pull out the check so we don't need to + // stry every time + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn next_(&mut self) -> u8 { + unsafe { + self.idx += 1; + self.iidx = *self.structural_indexes.get_unchecked(self.idx) as usize; + *self.input.get_unchecked(self.iidx) + } + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn count_elements(&self) -> usize { + unsafe { *self.counts.get_unchecked(self.idx) } + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn parse_number_root(&mut self, minus: bool) -> Result { + let input = unsafe { 
&self.input.get_unchecked(self.iidx..) }; + let len = input.len(); + let mut copy = vec![0u8; len + SIMDJSON_PADDING]; + copy[len] = 0; + unsafe { + copy.as_mut_ptr().copy_from(input.as_ptr(), len); + }; + self.parse_number_int(©, minus) + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn parse_number(&mut self, minus: bool) -> Result { + let input = unsafe { &self.input.get_unchecked(self.iidx..) }; + let len = input.len(); + if len < SIMDJSON_PADDING { + let mut copy = vec![0u8; len + SIMDJSON_PADDING]; + unsafe { + copy.as_mut_ptr().copy_from(input.as_ptr(), len); + }; + self.parse_number_int(©, minus) + } else { + self.parse_number_int(input, minus) + } + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + fn parse_number_(&mut self, minus: bool) -> Result { + let input = unsafe { &self.input.get_unchecked(self.iidx..) }; + self.parse_number_int(input, minus) + } +} // //#[cfg(test)] //mod tests { diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 790eb1b6..9cc5ca13 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -1,204 +1,204 @@ -// -//use std::mem; -// -//pub use crate::error::{Error, ErrorType}; -//pub use crate::Deserializer; -//pub use crate::Result; -//pub use crate::neon::intrinsics::*; -////pub use crate::neon::utf8check::*; -//pub use crate::stringparse::*; -// -////use crate::portability::trailingzeroes; -// -// -//impl<'de> Deserializer<'de> { -// #[cfg_attr(not(feature = "no-inline"), inline(always))] -// pub fn parse_str_(&mut self) -> Result<&'de str> { -// // Add 1 to skip the initial " -// let idx = self.iidx + 1; -// let mut padding = [0u8; 32]; -// //let mut read: usize = 0; -// -// // we include the terminal '"' so we know where to end -// // This is safe since we check sub's lenght in the range access above and only -// // create sub sliced form sub to `sub.len()`. -// -// let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) 
}; -// let mut src_i: usize = 0; -// let mut len = src_i; -// loop { -// let v: __m128i = if src.len() >= src_i + 16 { -// // This is safe since we ensure src is at least 16 wide -// #[allow(clippy::cast_ptr_alignment)] +use std::mem; + +pub use crate::error::{Error, ErrorType}; +pub use crate::Deserializer; +pub use crate::Result; +pub use crate::neon::intrinsics::*; +pub use crate::neon::utf8check::*; +pub use crate::stringparse::*; + +use crate::neon::intrinsics::*; + + +impl<'de> Deserializer<'de> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + pub fn parse_str_(&mut self) -> Result<&'de str> { + // Add 1 to skip the initial " + let idx = self.iidx + 1; + let mut padding = [0u8; 32]; + //let mut read: usize = 0; + + // we include the terminal '"' so we know where to end + // This is safe since we check sub's lenght in the range access above and only + // create sub sliced form sub to `sub.len()`. + + + // + // FIXME - this section is somewhat questionable -- needs a complete proofread!!! + // + + let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) 
}; + let mut src_i: usize = 0; + let mut len = src_i; + loop { + let v: int8x16_t = if src.len() >= src_i + 16 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + *(src.as_ptr().add(src_i) as *const int8x16_t) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + *(padding.as_ptr() as *const int8x16_t) + } + }; + + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let bs_bits: u32 = 0; // FIXME: vceqq_s8(v, vdupq_n_s8(b'\\' as i8)); + let quote_mask = 0; // FIXME: unsafe { vceqq_s8(v, vdupq_n_s8(b'"' as i8)) }; + let quote_bits = 0; // FIXME: unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the point, accounting for the fact that we have a NULl termination + + len += quote_dist as usize; + unsafe { + let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // Move to the 'bad' character + let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); + len += bs_dist as usize; + src_i += bs_dist as usize; + break; + } else { + // they are the same. 
Since they can't co-occur, it means we encountered + // neither. + src_i += 16; + len += 16; + } + } + + let mut dst_i: usize = 0; + let dst: &mut [u8] = &mut self.strings; + + loop { + let v: int8x16_t = if src.len() >= src_i + 16 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + *(src.as_ptr().add(src_i) as *const int8x16_t) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + *(padding.as_ptr() as *const int8x16_t) + } + }; + + #[allow(clippy::cast_ptr_alignment)] + unsafe { + *(dst.as_mut_ptr().add(dst_i) as *mut int8x16_t) = v; + }; + + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let bs_bits: u32 = 0; // FIXME // unsafe { -// _mm_loadu_si128(src.as_ptr().add(src_i) as *const __m128i) -// } -// } else { -// unsafe { -// padding -// .get_unchecked_mut(..src.len() - src_i) -// .clone_from_slice(src.get_unchecked(src_i..)); -// // This is safe since we ensure src is at least 32 wide -// #[allow(clippy::cast_ptr_alignment)] -// _mm_loadu_si128(padding.as_ptr() as *const __m128i) -// } -// }; -// -// // store to dest unconditionally - we can overwrite the bits we don't like -// // later -// let bs_bits: u32 = unsafe { -// static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( +// static_cast_u32!(_mm_movemask_epi8(vceqq_s8( // v, -// _mm_set1_epi8(b'\\' as i8) +// vdupq_n_s8(b'\\' as i8) // ))) // }; -// let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; -// let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; -// if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { -// // we encountered quotes first. Move dst to point to quotes and exit -// // find out where the quote is... 
-// let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; -// -// /////////////////////// -// // Above, check for overflow in case someone has a crazy string (>=4GB?) -// // But only add the overflow check when the document itself exceeds 4GB -// // Currently unneeded because we refuse to parse docs larger or equal to 4GB. -// //////////////////////// -// -// // we advance the point, accounting for the fact that we have a NULl termination -// -// len += quote_dist as usize; -// unsafe { -// let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; -// return Ok(&*v); -// } -// -// // we compare the pointers since we care if they are 'at the same spot' -// // not if they are the same value -// } -// if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { -// // Move to the 'bad' character -// let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); -// len += bs_dist as usize; -// src_i += bs_dist as usize; -// break; -// } else { -// // they are the same. Since they can't co-occur, it means we encountered -// // neither. 
-// src_i += 16; -// len += 16; -// } -// } -// -// let mut dst_i: usize = 0; -// let dst: &mut [u8] = &mut self.strings; -// -// loop { -// let v: __m128i = if src.len() >= src_i + 16 { -// // This is safe since we ensure src is at least 16 wide -// #[allow(clippy::cast_ptr_alignment)] -// unsafe { -// _mm_loadu_si128(src.as_ptr().add(src_i) as *const __m128i) -// } -// } else { -// unsafe { -// padding -// .get_unchecked_mut(..src.len() - src_i) -// .clone_from_slice(src.get_unchecked(src_i..)); -// // This is safe since we ensure src is at least 16 wide -// #[allow(clippy::cast_ptr_alignment)] -// _mm_loadu_si128(padding.as_ptr() as *const __m128i) -// } -// }; -// -// #[allow(clippy::cast_ptr_alignment)] -// unsafe { -// _mm_storeu_si128(dst.as_mut_ptr().add(dst_i) as *mut __m128i, v) -// }; -// -// // store to dest unconditionally - we can overwrite the bits we don't like -// // later -// let bs_bits: u32 = unsafe { -// static_cast_u32!(_mm_movemask_epi8(_mm_cmpeq_epi8( -// v, -// _mm_set1_epi8(b'\\' as i8) -// ))) -// }; -// let quote_mask = unsafe { _mm_cmpeq_epi8(v, _mm_set1_epi8(b'"' as i8)) }; -// let quote_bits = unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; -// if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { -// // we encountered quotes first. Move dst to point to quotes and exit -// // find out where the quote is... -// let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; -// -// /////////////////////// -// // Above, check for overflow in case someone has a crazy string (>=4GB?) -// // But only add the overflow check when the document itself exceeds 4GB -// // Currently unneeded because we refuse to parse docs larger or equal to 4GB. 
-// //////////////////////// -// -// // we advance the point, accounting for the fact that we have a NULl termination -// -// dst_i += quote_dist as usize; -// unsafe { -// self.input -// .get_unchecked_mut(idx + len..idx + len + dst_i) -// .clone_from_slice(&self.strings.get_unchecked(..dst_i)); -// let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] -// as *const str; -// self.str_offset += dst_i as usize; -// return Ok(&*v); -// } -// -// // we compare the pointers since we care if they are 'at the same spot' -// // not if they are the same value -// } -// if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { -// // find out where the backspace is -// let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); -// let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; -// // we encountered backslash first. Handle backslash -// if escape_char == b'u' { -// // move src/dst up to the start; they will be further adjusted -// // within the unicode codepoint handling code. -// src_i += bs_dist as usize; -// dst_i += bs_dist as usize; -// let (o, s) = if let Ok(r) = -// handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { -// dst.get_unchecked_mut(dst_i..) -// }) { -// r -// } else { -// return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); -// }; -// if o == 0 { -// return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); -// }; -// // We moved o steps forword at the destiation and 6 on the source -// src_i += s; -// dst_i += o; -// } else { -// // simple 1:1 conversion. Will eat bs_dist+2 characters in input and -// // write bs_dist+1 characters to output -// // note this may reach beyond the part of the buffer we've actually -// // seen. 
I think this is ok -// let escape_result: u8 = -// unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; -// if escape_result == 0 { -// return Err(self.error(ErrorType::InvalidEscape)); -// } -// unsafe { -// *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; -// } -// src_i += bs_dist as usize + 2; -// dst_i += bs_dist as usize + 1; -// } -// } else { -// // they are the same. Since they can't co-occur, it means we encountered -// // neither. -// src_i += 16; -// dst_i += 16; -// } -// } -// } -//} + let quote_mask = 0; // FIXME: unsafe { vceqq_s8(v, vdupq_n_s8(b'"' as i8)) }; + let quote_bits = 0; // FIXME: unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the point, accounting for the fact that we have a NULl termination + + dst_i += quote_dist as usize; + unsafe { + self.input + .get_unchecked_mut(idx + len..idx + len + dst_i) + .clone_from_slice(&self.strings.get_unchecked(..dst_i)); + let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] + as *const str; + self.str_offset += dst_i as usize; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // find out where the backspace is + let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); + let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; + // we encountered backslash first. 
Handle backslash + if escape_char == b'u' { + // move src/dst up to the start; they will be further adjusted + // within the unicode codepoint handling code. + src_i += bs_dist as usize; + dst_i += bs_dist as usize; + let (o, s) = if let Ok(r) = + handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { + dst.get_unchecked_mut(dst_i..) + }) { + r + } else { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + if o == 0 { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + // We moved o steps forword at the destiation and 6 on the source + src_i += s; + dst_i += o; + } else { + // simple 1:1 conversion. Will eat bs_dist+2 characters in input and + // write bs_dist+1 characters to output + // note this may reach beyond the part of the buffer we've actually + // seen. I think this is ok + let escape_result: u8 = + unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; + if escape_result == 0 { + return Err(self.error(ErrorType::InvalidEscape)); + } + unsafe { + *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; + } + src_i += bs_dist as usize + 2; + dst_i += bs_dist as usize + 1; + } + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. 
+ src_i += 16; + dst_i += 16; + } + } + } +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index c1a87d98..1b979e5a 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -1,4 +1,4 @@ -//pub mod deser; +pub mod deser; pub mod stage1; pub mod utf8check; mod simd; From 086e74593dc91d501986c30166cb9cecc793b94a Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 9 Aug 2019 18:58:02 -0400 Subject: [PATCH 04/34] feat: re-enable tests (still broken) --- src/lib.rs | 1902 ++++++++++++++++++++++++++-------------------------- 1 file changed, 951 insertions(+), 951 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d2d9f10e..c032bd0d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,954 +257,954 @@ impl<'de> Deserializer<'de> { self.parse_number_int(input, minus) } } -// -//#[cfg(test)] -//mod tests { -// use super::serde::from_slice; -// use super::{ -// owned::to_value, owned::Map, owned::Value, to_borrowed_value, to_owned_value, Deserializer, -// }; -// use halfbrown::HashMap; -// use proptest::prelude::*; -// use serde::Deserialize; -// use serde_json; -// -// #[test] -// fn count1() { -// let mut d = String::from("[]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let simd = Deserializer::from_slice(&mut d).expect(""); -// assert_eq!(simd.counts[1], 0); -// } -// -// #[test] -// fn count2() { -// let mut d = String::from("[1]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let simd = Deserializer::from_slice(&mut d).expect(""); -// assert_eq!(simd.counts[1], 1); -// } -// -// #[test] -// fn count3() { -// let mut d = String::from("[1,2]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let simd = Deserializer::from_slice(&mut d).expect(""); -// assert_eq!(simd.counts[1], 2); -// } -// -// #[test] -// fn count4() { -// let mut d = String::from(" [ 1 , [ 3 ] , 2 ]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let simd = Deserializer::from_slice(&mut d).expect(""); -// assert_eq!(simd.counts[1], 3); -// assert_eq!(simd.counts[4], 1); -// } -// -// 
#[test] -// fn count5() { -// let mut d = String::from("[[],null,null]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let simd = Deserializer::from_slice(&mut d).expect(""); -// assert_eq!(simd.counts[1], 3); -// assert_eq!(simd.counts[2], 0); -// } -// -// #[test] -// fn empty() { -// let mut d = String::from(""); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_simd = from_slice::(&mut d); -// let v_serde = serde_json::from_slice::(d); -// assert!(v_simd.is_err()); -// assert!(v_serde.is_err()); -// } -// -// #[test] -// fn bool_true() { -// let mut d = String::from("true"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(true))); -// } -// -// #[test] -// fn bool_false() { -// let mut d = String::from("false"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(false))); -// //assert!(false) -// } -// -// #[test] -// fn union() { -// let mut d = String::from("null"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::Null)); -// } -// -// #[test] -// fn int() { -// let mut d = String::from("42"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = 
unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(42))); -// } -// -// #[test] -// fn zero() { -// let mut d = String::from("0"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(0))); -// } -// -// #[test] -// fn one() { -// let mut d = String::from("1"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(1))); -// } -// -// #[test] -// fn minus_one() { -// let mut d = String::from("-1"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(-1))); -// } -// -// #[test] -// fn float() { -// let mut d = String::from("23.0"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!(to_value(&mut d1), Ok(Value::from(23.0))); -// } -// -// #[test] -// fn 
string() { -// let mut d = String::from(r#""snot""#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(to_value(&mut d1), Ok(Value::from("snot"))); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn lonely_quote() { -// let mut d = String::from(r#"""#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde = serde_json::from_slice::(d).is_err(); -// let v_simd = from_slice::(&mut d).is_err(); -// assert!(v_simd); -// assert!(v_serde); -// } -// -// #[test] -// fn lonely_quote1() { -// let mut d = String::from(r#"["]"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde = serde_json::from_slice::(d).is_err(); -// let v_simd = from_slice::(&mut d).is_err(); -// assert!(v_simd); -// assert!(v_serde); -// } -// #[test] -// fn lonely_quote2() { -// let mut d = String::from(r#"[1, "]"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde = serde_json::from_slice::(d).is_err(); -// let v_simd = from_slice::(&mut d).is_err(); -// assert!(v_simd); -// assert!(v_serde); -// } -// -// #[test] -// fn lonely_quote3() { -// let mut d = String::from(r#"{": 1}"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde = serde_json::from_slice::(d).is_err(); -// let v_simd = from_slice::(&mut d).is_err(); -// assert!(v_simd); -// assert!(v_serde); -// } -// -// #[test] -// fn empty_string() { -// let mut d = String::from(r#""""#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(to_value(&mut d1), Ok(Value::from(""))); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn 
empty_array() { -// let mut d = String::from(r#"[]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); -// assert_eq!(to_value(&mut d1), Ok(Value::Array(vec![]))); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn malformed_array() { -// let mut d = String::from(r#"[["#); -// let mut d1 = d.clone(); -// let mut d2 = d.clone(); -// let mut d = unsafe { d.as_bytes_mut() }; -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d2 = unsafe { d2.as_bytes_mut() }; -// let v_serde: Result = serde_json::from_slice(d); -// let v_simd_ov = to_owned_value(&mut d); -// let v_simd_bv = to_borrowed_value(&mut d1); -// let v_simd: Result = from_slice(&mut d2); -// assert!(v_simd_ov.is_err()); -// assert!(v_simd_bv.is_err()); -// assert!(v_simd.is_err()); -// assert!(v_serde.is_err()); -// } -// -// #[test] -// fn double_array() { -// let mut d = String::from(r#"[[]]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![Value::Array(vec![])])) -// ); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn null_null_array() { -// let mut d = String::from(r#"[[],null,null]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![ -// 
Value::Array(vec![]), -// Value::Null, -// Value::Null, -// ])) -// ); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn one_element_array() { -// let mut d = String::from(r#"["snot"]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![Value::from("snot")])) -// ); -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn two_element_array() { -// let mut d = String::from(r#"["snot", "badger"]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![ -// Value::from("snot"), -// Value::from("badger") -// ])) -// ); -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn list() { -// let mut d = String::from(r#"[42, 23.0, "snot badger"]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![ -// Value::from(42), -// Value::from(23.0), -// Value::from("snot badger") -// ])) -// ); -// } -// -// #[test] -// fn nested_list1() { -// let mut d = String::from(r#"[42, [23.0, "snot"], "bad", "ger"]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![ -// Value::from(42), -// 
Value::Array(vec![Value::from(23.0), Value::from("snot")]), -// Value::from("bad"), -// Value::from("ger") -// ])) -// ); -// -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn nested_list2() { -// let mut d = String::from(r#"[42, [23.0, "snot"], {"bad": "ger"}]"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn utf8() { -// let mut d = String::from(r#""\u000e""#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, "\u{e}"); -// // NOTE: serde is broken for this -// //assert_eq!(v_serde, "\u{e}"); -// //assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn unicode() { -// let mut d = String::from(r#""¡\"""#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn odd_array() { -// let mut d = String::from("[{},null]"); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![Value::Object(Map::new()), Value::Null])) -// ); -// } -// -// #[test] -// fn map2() { -// let mut d = String::from(r#"[{"\u0000":null}]"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: 
serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn null() { -// let mut d = String::from(r#"null"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!(to_value(&mut d1), Ok(Value::Null)); -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// #[test] -// fn null_null() { -// let mut d = String::from(r#"[null, null]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![Value::Null, Value::Null,])) -// ); -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn nested_null() { -// let mut d = String::from(r#"[[null, null]]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// assert_eq!( -// to_value(&mut d1), -// Ok(Value::Array(vec![Value::Array(vec![ -// Value::Null, -// Value::Null, -// ])])) -// ); -// -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// } -// -// #[test] -// fn nestednested_null() { -// let mut d = String::from(r#"[[[null, null]]]"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// assert_eq!( -// to_value(&mut d1), -// 
Ok(Value::Array(vec![Value::Array(vec![Value::Array(vec![ -// Value::Null, -// Value::Null, -// ])])])) -// ); -// } -// -// #[test] -// fn odd_array2() { -// let mut d = String::from("[[\"\\u0000\\\"\"]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn odd_array3() { -// let mut d = String::from("[{\"\\u0000\\u0000\":null}]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn odd_array4() { -// let mut d = String::from("[{\"\\u0000𐀀a\":null}]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn float1() { -// let mut d = String::from("2.3250706903316115e307"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// // We ignore this since serde is less percise on this test -// #[ignore] -// #[test] -// fn float2() { -// let mut d = String::from("-4.5512678569607477e306"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn map0() { -// let mut d = String::from(r#"{"snot": "badger"}"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = 
unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// let mut h = Map::new(); -// h.insert("snot".into(), Value::from("badger")); -// assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); -// } -// -// #[test] -// fn map1() { -// let mut d = String::from(r#"{"snot": "badger", "badger": "snot"}"#); -// let mut d1 = d.clone(); -// let mut d1 = unsafe { d1.as_bytes_mut() }; -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); -// let v_simd: serde_json::Value = from_slice(&mut d).expect(""); -// assert_eq!(v_simd, v_serde); -// let mut h = Map::new(); -// h.insert("snot".into(), Value::from("badger")); -// h.insert("badger".into(), Value::from("snot")); -// assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); -// } -// -// #[test] -// fn tpl1() { -// let mut d = String::from("[-65.613616999999977, 43.420273000000009]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: (f32, f32) = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: (f32, f32) = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn tpl2() { -// let mut d = String::from("[[-65.613616999999977, 43.420273000000009]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn tpl3() { -// let mut d = String::from( -// "[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]", -// ); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); -// 
assert_eq!(v_simd, v_serde) -// } -// #[test] -// fn tpl4() { -// let mut d = String::from("[[[-65.613616999999977,43.420273000000009]]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// #[test] -// fn tpl5() { -// let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn tpl6() { -// let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn tpl7() { -// let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[derive(Deserialize, PartialEq, Debug)] -// struct Obj { -// a: u64, -// b: u64, -// } -// -// #[derive(Deserialize, PartialEq, Debug)] -// struct Obj1 { -// a: Obj, -// } -// -// #[test] -// fn obj() { -// let mut d = String::from(r#"{"a": 1, "b":1}"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Obj = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Obj = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn obj2() { -// let mut d = 
-// String::from(r#"{"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn obj3() { -// let mut d = String::from( -// r#"{"c": {"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}}"#, -// ); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: HashMap> = -// serde_json::from_slice(d).expect("serde_json"); -// let v_simd: HashMap> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn obj4() { -// let mut d = String::from(r#"{"c": {"a": {"a": 1, "b":1}}}"#); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn vecvec() { -// let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]], [[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn crazy_string() { -// // there is unicode in here! 
-// let d = "\"𐀀𐀀 𐀀𐀀0 𐀀A\\u00000A0 A \\u000b\""; -// let mut d = String::from(d); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// #[test] -// fn event() { -// #[derive(Deserialize, Debug, PartialEq)] -// #[serde(deny_unknown_fields, rename_all = "camelCase")] -// pub struct CitmCatalog { -// pub area_names: HashMap, -// pub audience_sub_category_names: HashMap, -// pub block_names: HashMap, -// pub events: HashMap, -// } -// pub type Id = u32; -// #[derive(Deserialize, Debug, PartialEq)] -// #[serde(deny_unknown_fields, rename_all = "camelCase")] -// pub struct Event { -// pub description: (), -// pub id: Id, -// pub logo: Option, -// pub name: String, -// pub sub_topic_ids: Vec, -// pub subject_code: (), -// pub subtitle: (), -// pub topic_ids: Vec, -// } -// -// let mut d = String::from( -// r#" -//{ -// "areaNames": { -// "205705993": "Arrière-scène central", -// "205705994": "1er balcon central", -// "205705995": "2ème balcon bergerie cour", -// "205705996": "2ème balcon bergerie jardin", -// "205705998": "1er balcon bergerie jardin", -// "205705999": "1er balcon bergerie cour", -// "205706000": "Arrière-scène jardin", -// "205706001": "Arrière-scène cour", -// "205706002": "2ème balcon jardin", -// "205706003": "2ème balcon cour", -// "205706004": "2ème Balcon central", -// "205706005": "1er balcon jardin", -// "205706006": "1er balcon cour", -// "205706007": "Orchestre central", -// "205706008": "Orchestre jardin", -// "205706009": "Orchestre cour", -// "342752287": "Zone physique secrète" -// }, -// "audienceSubCategoryNames": { -// "337100890": "Abonné" -// }, -// "blockNames": {}, -// "events": { -// "138586341": { -// "description": null, -// "id": 138586341, -// "logo": null, -// "name": "30th Anniversary Tour", -// "subTopicIds": [ -// 337184269, -// 
337184283 -// ], -// "subjectCode": null, -// "subtitle": null, -// "topicIds": [ -// 324846099, -// 107888604 -// ] -// }, -// "138586345": { -// "description": null, -// "id": 138586345, -// "logo": "/images/UE0AAAAACEKo6QAAAAZDSVRN", -// "name": "Berliner Philharmoniker", -// "subTopicIds": [ -// 337184268, -// 337184283, -// 337184275 -// ], -// "subjectCode": null, -// "subtitle": null, -// "topicIds": [ -// 324846099, -// 107888604, -// 324846100 -// ] -// } -// } -//} -//"#, -// ); -// let mut d = unsafe { d.as_bytes_mut() }; -// let v_serde: CitmCatalog = serde_json::from_slice(d).expect("serde_json"); -// let v_simd: CitmCatalog = from_slice(&mut d).expect("simd_json"); -// assert_eq!(v_simd, v_serde) -// } -// -// // How much do we care about this, it's within the same range and -// // based on floating point math inprecisions during parsing. -// // Is this a real issue worth improving? -// #[test] -// fn silly_float1() { -// let v = Value::from(3.0901448042322017e305); -// let s = v.to_string(); -// dbg!(&s); -// let mut bytes = s.as_bytes().to_vec(); -// let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); -// assert_eq!(v, parsed); -// } -// -// #[test] -// #[ignore] -// fn silly_float2() { -// let v = Value::from(-6.990585694841803e305); -// let s = v.to_string(); -// dbg!(&s); -// let mut bytes = s.as_bytes().to_vec(); -// let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); -// assert_eq!(v, parsed); -// } -// -// //6.576692109929364e305 -// fn arb_json() -> BoxedStrategy { -// let leaf = prop_oneof![ -// Just(Value::Null), -// any::().prop_map(Value::Bool), -// // (-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // The float parsing of simd and serde are too different -// any::().prop_map(|i| json!(i)), -// ".*".prop_map(Value::from), -// ]; -// leaf.prop_recursive( -// 8, // 8 levels deep -// 256, // Shoot for maximum size of 256 nodes -// 10, // We put up to 10 items per collection -// 
|inner| { -// prop_oneof![ -// // Take the inner strategy and make the two recursive cases. -// prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), -// prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), -// ] -// }, -// ) -// .prop_map(|v| serde_json::to_string(&v).expect("").to_string()) -// .boxed() -// } -// -// fn arb_json_value() -> BoxedStrategy { -// let leaf = prop_oneof![ -// Just(Value::Null), -// any::().prop_map(Value::Bool), -// //(-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // damn you float! -// any::().prop_map(|i| json!(i)), -// ".*".prop_map(Value::from), -// ]; -// leaf.prop_recursive( -// 8, // 8 levels deep -// 256, // Shoot for maximum size of 256 nodes -// 10, // We put up to 10 items per collection -// |inner| { -// prop_oneof![ -// // Take the inner strategy and make the two recursive cases. -// prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), -// prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), -// ] -// }, -// ) -// .boxed() -// } -// -// proptest! { -// #![proptest_config(ProptestConfig { -// // Setting both fork and timeout is redundant since timeout implies -// // fork, but both are shown for clarity. -// fork: true, -// .. ProptestConfig::default() -// })] -// -// #[test] -// fn prop_json_encode_decode(val in arb_json_value()) { -// let mut encoded: Vec = Vec::new(); -// let _ = val.write(&mut encoded); -// println!("{}", String::from_utf8(encoded.clone()).unwrap()); -// let res = to_owned_value(&mut encoded).unwrap(); -// assert_eq!(val, res); -// } -// -// } -// proptest! { -// #![proptest_config(ProptestConfig { -// // Setting both fork and timeout is redundant since timeout implies -// // fork, but both are shown for clarity. -// fork: true, -// .. 
ProptestConfig::default() -// })] -// -// #[test] -// fn prop_json(d in arb_json()) { -// if let Ok(v_serde) = serde_json::from_slice::(&d.as_bytes()) { -// let mut d1 = d.clone(); -// let d1 = unsafe{ d1.as_bytes_mut()}; -// let v_simd_serde: serde_json::Value = from_slice(d1).expect(""); -// // We add our own encoder in here. -// let mut d2 = v_simd_serde.to_string(); -// let d2 = unsafe{ d2.as_bytes_mut()}; -// let mut d3 = d.clone(); -// let d3 = unsafe{ d3.as_bytes_mut()}; -// assert_eq!(v_simd_serde, v_serde); -// let v_simd_owned = to_owned_value(d2); -// assert!(v_simd_owned.is_ok()); -// let v_simd_borrowed = to_borrowed_value(d3); -// dbg!(&v_simd_borrowed); -// assert!(v_simd_borrowed.is_ok()); -// assert_eq!(v_simd_owned.unwrap(), super::OwnedValue::from(v_simd_borrowed.unwrap())); -// } -// -// } -// -// } -// -// fn arb_junk() -> BoxedStrategy> { -// prop::collection::vec(any::(), 0..(1024 * 8)).boxed() -// } -// proptest! { -// #![proptest_config(ProptestConfig { -// // Setting both fork and timeout is redundant since timeout implies -// // fork, but both are shown for clarity. -// fork: true, -// .. ProptestConfig::default() -// })] -// #[test] -// fn prop_junk(d in arb_junk()) { -// let mut d1 = d.clone(); -// let mut d2 = d.clone(); -// let mut d3 = d.clone(); -// -// let _ = from_slice::(&mut d1); -// let _ = to_borrowed_value(&mut d2); -// let _ = to_owned_value(&mut d3); -// -// } -// } -// -// proptest! { -// #![proptest_config(ProptestConfig { -// // Setting both fork and timeout is redundant since timeout implies -// // fork, but both are shown for clarity. -// fork: true, -// .. 
ProptestConfig::default() -// })] -// -// #[test] -// fn prop_string(d in "\\PC*") { -// let mut d1 = d.clone(); -// let mut d1 = unsafe{ d1.as_bytes_mut()}; -// let mut d2 = d.clone(); -// let mut d2 = unsafe{ d2.as_bytes_mut()}; -// let mut d3 = d.clone(); -// let mut d3 = unsafe{ d3.as_bytes_mut()}; -// let _ = from_slice::(&mut d1); -// let _ = to_borrowed_value(&mut d2); -// let _ = to_owned_value(&mut d3); -// -// } -// } -//} + +#[cfg(test)] +mod tests { + use super::serde::from_slice; + use super::{ + owned::to_value, owned::Map, owned::Value, to_borrowed_value, to_owned_value, Deserializer, + }; + use halfbrown::HashMap; + use proptest::prelude::*; + use serde::Deserialize; + use serde_json; + + #[test] + fn count1() { + let mut d = String::from("[]"); + let mut d = unsafe { d.as_bytes_mut() }; + let simd = Deserializer::from_slice(&mut d).expect(""); + assert_eq!(simd.counts[1], 0); + } + + #[test] + fn count2() { + let mut d = String::from("[1]"); + let mut d = unsafe { d.as_bytes_mut() }; + let simd = Deserializer::from_slice(&mut d).expect(""); + assert_eq!(simd.counts[1], 1); + } + + #[test] + fn count3() { + let mut d = String::from("[1,2]"); + let mut d = unsafe { d.as_bytes_mut() }; + let simd = Deserializer::from_slice(&mut d).expect(""); + assert_eq!(simd.counts[1], 2); + } + + #[test] + fn count4() { + let mut d = String::from(" [ 1 , [ 3 ] , 2 ]"); + let mut d = unsafe { d.as_bytes_mut() }; + let simd = Deserializer::from_slice(&mut d).expect(""); + assert_eq!(simd.counts[1], 3); + assert_eq!(simd.counts[4], 1); + } + + #[test] + fn count5() { + let mut d = String::from("[[],null,null]"); + let mut d = unsafe { d.as_bytes_mut() }; + let simd = Deserializer::from_slice(&mut d).expect(""); + assert_eq!(simd.counts[1], 3); + assert_eq!(simd.counts[2], 0); + } + + #[test] + fn empty() { + let mut d = String::from(""); + let mut d = unsafe { d.as_bytes_mut() }; + let v_simd = from_slice::(&mut d); + let v_serde = serde_json::from_slice::(d); + 
assert!(v_simd.is_err()); + assert!(v_serde.is_err()); + } + + #[test] + fn bool_true() { + let mut d = String::from("true"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(true))); + } + + #[test] + fn bool_false() { + let mut d = String::from("false"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(false))); + //assert!(false) + } + + #[test] + fn union() { + let mut d = String::from("null"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::Null)); + } + + #[test] + fn int() { + let mut d = String::from("42"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(42))); + } + + #[test] + fn zero() { + let mut d = String::from("0"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut 
d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(0))); + } + + #[test] + fn one() { + let mut d = String::from("1"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(1))); + } + + #[test] + fn minus_one() { + let mut d = String::from("-1"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(-1))); + } + + #[test] + fn float() { + let mut d = String::from("23.0"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!(to_value(&mut d1), Ok(Value::from(23.0))); + } + + #[test] + fn string() { + let mut d = String::from(r#""snot""#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(to_value(&mut d1), Ok(Value::from("snot"))); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn lonely_quote() { + let mut d = String::from(r#"""#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde = serde_json::from_slice::(d).is_err(); + let v_simd = from_slice::(&mut d).is_err(); + assert!(v_simd); + assert!(v_serde); + } + + #[test] + fn 
lonely_quote1() { + let mut d = String::from(r#"["]"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde = serde_json::from_slice::(d).is_err(); + let v_simd = from_slice::(&mut d).is_err(); + assert!(v_simd); + assert!(v_serde); + } + #[test] + fn lonely_quote2() { + let mut d = String::from(r#"[1, "]"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde = serde_json::from_slice::(d).is_err(); + let v_simd = from_slice::(&mut d).is_err(); + assert!(v_simd); + assert!(v_serde); + } + + #[test] + fn lonely_quote3() { + let mut d = String::from(r#"{": 1}"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde = serde_json::from_slice::(d).is_err(); + let v_simd = from_slice::(&mut d).is_err(); + assert!(v_simd); + assert!(v_serde); + } + + #[test] + fn empty_string() { + let mut d = String::from(r#""""#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(to_value(&mut d1), Ok(Value::from(""))); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn empty_array() { + let mut d = String::from(r#"[]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); + assert_eq!(to_value(&mut d1), Ok(Value::Array(vec![]))); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn malformed_array() { + let mut d = String::from(r#"[["#); + let mut d1 = d.clone(); + let mut d2 = d.clone(); + let mut d = unsafe { d.as_bytes_mut() }; + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d2 = unsafe { d2.as_bytes_mut() }; + let v_serde: Result = serde_json::from_slice(d); + let v_simd_ov = to_owned_value(&mut d); + let v_simd_bv = 
to_borrowed_value(&mut d1); + let v_simd: Result = from_slice(&mut d2); + assert!(v_simd_ov.is_err()); + assert!(v_simd_bv.is_err()); + assert!(v_simd.is_err()); + assert!(v_serde.is_err()); + } + + #[test] + fn double_array() { + let mut d = String::from(r#"[[]]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![Value::Array(vec![])])) + ); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn null_null_array() { + let mut d = String::from(r#"[[],null,null]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("parse_serde"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("parse_simd"); + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![ + Value::Array(vec![]), + Value::Null, + Value::Null, + ])) + ); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn one_element_array() { + let mut d = String::from(r#"["snot"]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![Value::from("snot")])) + ); + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn two_element_array() { + let mut d = String::from(r#"["snot", "badger"]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![ + Value::from("snot"), + Value::from("badger") + ])) + ); + let v_serde: serde_json::Value = 
serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn list() { + let mut d = String::from(r#"[42, 23.0, "snot badger"]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![ + Value::from(42), + Value::from(23.0), + Value::from("snot badger") + ])) + ); + } + + #[test] + fn nested_list1() { + let mut d = String::from(r#"[42, [23.0, "snot"], "bad", "ger"]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![ + Value::from(42), + Value::Array(vec![Value::from(23.0), Value::from("snot")]), + Value::from("bad"), + Value::from("ger") + ])) + ); + + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn nested_list2() { + let mut d = String::from(r#"[42, [23.0, "snot"], {"bad": "ger"}]"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn utf8() { + let mut d = String::from(r#""\u000e""#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, "\u{e}"); + // NOTE: serde is broken for this + //assert_eq!(v_serde, "\u{e}"); + //assert_eq!(v_simd, v_serde) + } + + #[test] + fn unicode() { + let mut d = String::from(r#""¡\"""#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: 
serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn odd_array() { + let mut d = String::from("[{},null]"); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![Value::Object(Map::new()), Value::Null])) + ); + } + + #[test] + fn map2() { + let mut d = String::from(r#"[{"\u0000":null}]"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn null() { + let mut d = String::from(r#"null"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!(to_value(&mut d1), Ok(Value::Null)); + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + #[test] + fn null_null() { + let mut d = String::from(r#"[null, null]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![Value::Null, Value::Null,])) + ); + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn nested_null() { + let mut d = String::from(r#"[[null, null]]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + assert_eq!( + to_value(&mut 
d1), + Ok(Value::Array(vec![Value::Array(vec![ + Value::Null, + Value::Null, + ])])) + ); + + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + } + + #[test] + fn nestednested_null() { + let mut d = String::from(r#"[[[null, null]]]"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + assert_eq!( + to_value(&mut d1), + Ok(Value::Array(vec![Value::Array(vec![Value::Array(vec![ + Value::Null, + Value::Null, + ])])])) + ); + } + + #[test] + fn odd_array2() { + let mut d = String::from("[[\"\\u0000\\\"\"]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn odd_array3() { + let mut d = String::from("[{\"\\u0000\\u0000\":null}]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn odd_array4() { + let mut d = String::from("[{\"\\u0000𐀀a\":null}]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn float1() { + let mut d = String::from("2.3250706903316115e307"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) 
+ } + + // We ignore this since serde is less percise on this test + #[ignore] + #[test] + fn float2() { + let mut d = String::from("-4.5512678569607477e306"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn map0() { + let mut d = String::from(r#"{"snot": "badger"}"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + let mut h = Map::new(); + h.insert("snot".into(), Value::from("badger")); + assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); + } + + #[test] + fn map1() { + let mut d = String::from(r#"{"snot": "badger", "badger": "snot"}"#); + let mut d1 = d.clone(); + let mut d1 = unsafe { d1.as_bytes_mut() }; + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect(""); + let v_simd: serde_json::Value = from_slice(&mut d).expect(""); + assert_eq!(v_simd, v_serde); + let mut h = Map::new(); + h.insert("snot".into(), Value::from("badger")); + h.insert("badger".into(), Value::from("snot")); + assert_eq!(to_value(&mut d1), Ok(Value::Object(h))); + } + + #[test] + fn tpl1() { + let mut d = String::from("[-65.613616999999977, 43.420273000000009]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: (f32, f32) = serde_json::from_slice(d).expect("serde_json"); + let v_simd: (f32, f32) = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn tpl2() { + let mut d = String::from("[[-65.613616999999977, 43.420273000000009]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); 
+ let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn tpl3() { + let mut d = String::from( + "[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]", + ); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec<(f32, f32)> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec<(f32, f32)> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + #[test] + fn tpl4() { + let mut d = String::from("[[[-65.613616999999977,43.420273000000009]]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + #[test] + fn tpl5() { + let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn tpl6() { + let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn tpl7() { + let mut d = String::from("[[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec>> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec>> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[derive(Deserialize, PartialEq, Debug)] + struct Obj { + a: u64, + b: u64, + } + + #[derive(Deserialize, PartialEq, Debug)] + struct Obj1 { 
+ a: Obj, + } + + #[test] + fn obj() { + let mut d = String::from(r#"{"a": 1, "b":1}"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Obj = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Obj = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn obj2() { + let mut d = + String::from(r#"{"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); + let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn obj3() { + let mut d = String::from( + r#"{"c": {"a": {"a": 1, "b":1}, "b": {"a": 1, "b":1}, "c": {"a": 1, "b": 1}}}"#, + ); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: HashMap> = + serde_json::from_slice(d).expect("serde_json"); + let v_simd: HashMap> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn obj4() { + let mut d = String::from(r#"{"c": {"a": {"a": 1, "b":1}}}"#); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: HashMap = serde_json::from_slice(d).expect("serde_json"); + let v_simd: HashMap = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn vecvec() { + let mut d = String::from("[[[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]], [[-65.613616999999977,43.420273000000009], [-65.613616999999977,43.420273000000009]]]"); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: Vec> = serde_json::from_slice(d).expect("serde_json"); + let v_simd: Vec> = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn crazy_string() { + // there is unicode in here! 
+ let d = "\"𐀀𐀀 𐀀𐀀0 𐀀A\\u00000A0 A \\u000b\""; + let mut d = String::from(d); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: serde_json::Value = serde_json::from_slice(d).expect("serde_json"); + let v_simd: serde_json::Value = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + #[test] + fn event() { + #[derive(Deserialize, Debug, PartialEq)] + #[serde(deny_unknown_fields, rename_all = "camelCase")] + pub struct CitmCatalog { + pub area_names: HashMap, + pub audience_sub_category_names: HashMap, + pub block_names: HashMap, + pub events: HashMap, + } + pub type Id = u32; + #[derive(Deserialize, Debug, PartialEq)] + #[serde(deny_unknown_fields, rename_all = "camelCase")] + pub struct Event { + pub description: (), + pub id: Id, + pub logo: Option, + pub name: String, + pub sub_topic_ids: Vec, + pub subject_code: (), + pub subtitle: (), + pub topic_ids: Vec, + } + + let mut d = String::from( + r#" +{ + "areaNames": { + "205705993": "Arrière-scène central", + "205705994": "1er balcon central", + "205705995": "2ème balcon bergerie cour", + "205705996": "2ème balcon bergerie jardin", + "205705998": "1er balcon bergerie jardin", + "205705999": "1er balcon bergerie cour", + "205706000": "Arrière-scène jardin", + "205706001": "Arrière-scène cour", + "205706002": "2ème balcon jardin", + "205706003": "2ème balcon cour", + "205706004": "2ème Balcon central", + "205706005": "1er balcon jardin", + "205706006": "1er balcon cour", + "205706007": "Orchestre central", + "205706008": "Orchestre jardin", + "205706009": "Orchestre cour", + "342752287": "Zone physique secrète" + }, + "audienceSubCategoryNames": { + "337100890": "Abonné" + }, + "blockNames": {}, + "events": { + "138586341": { + "description": null, + "id": 138586341, + "logo": null, + "name": "30th Anniversary Tour", + "subTopicIds": [ + 337184269, + 337184283 + ], + "subjectCode": null, + "subtitle": null, + "topicIds": [ + 324846099, + 107888604 + ] + }, + "138586345": { + 
"description": null, + "id": 138586345, + "logo": "/images/UE0AAAAACEKo6QAAAAZDSVRN", + "name": "Berliner Philharmoniker", + "subTopicIds": [ + 337184268, + 337184283, + 337184275 + ], + "subjectCode": null, + "subtitle": null, + "topicIds": [ + 324846099, + 107888604, + 324846100 + ] + } + } +} +"#, + ); + let mut d = unsafe { d.as_bytes_mut() }; + let v_serde: CitmCatalog = serde_json::from_slice(d).expect("serde_json"); + let v_simd: CitmCatalog = from_slice(&mut d).expect("simd_json"); + assert_eq!(v_simd, v_serde) + } + + // How much do we care about this, it's within the same range and + // based on floating point math inprecisions during parsing. + // Is this a real issue worth improving? + #[test] + fn silly_float1() { + let v = Value::from(3.0901448042322017e305); + let s = v.to_string(); + dbg!(&s); + let mut bytes = s.as_bytes().to_vec(); + let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); + assert_eq!(v, parsed); + } + + #[test] + #[ignore] + fn silly_float2() { + let v = Value::from(-6.990585694841803e305); + let s = v.to_string(); + dbg!(&s); + let mut bytes = s.as_bytes().to_vec(); + let parsed = to_owned_value(&mut bytes).expect("failed to parse gernated float"); + assert_eq!(v, parsed); + } + + //6.576692109929364e305 + fn arb_json() -> BoxedStrategy { + let leaf = prop_oneof![ + Just(Value::Null), + any::().prop_map(Value::Bool), + // (-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // The float parsing of simd and serde are too different + any::().prop_map(|i| json!(i)), + ".*".prop_map(Value::from), + ]; + leaf.prop_recursive( + 8, // 8 levels deep + 256, // Shoot for maximum size of 256 nodes + 10, // We put up to 10 items per collection + |inner| { + prop_oneof![ + // Take the inner strategy and make the two recursive cases. 
+ prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), + prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), + ] + }, + ) + .prop_map(|v| serde_json::to_string(&v).expect("").to_string()) + .boxed() + } + + fn arb_json_value() -> BoxedStrategy { + let leaf = prop_oneof![ + Just(Value::Null), + any::().prop_map(Value::Bool), + //(-1.0e306f64..1.0e306f64).prop_map(|f| json!(f)), // damn you float! + any::().prop_map(|i| json!(i)), + ".*".prop_map(Value::from), + ]; + leaf.prop_recursive( + 8, // 8 levels deep + 256, // Shoot for maximum size of 256 nodes + 10, // We put up to 10 items per collection + |inner| { + prop_oneof![ + // Take the inner strategy and make the two recursive cases. + prop::collection::vec(inner.clone(), 0..10).prop_map(|v| json!(v)), + prop::collection::hash_map(".*", inner, 0..10).prop_map(|m| json!(m)), + ] + }, + ) + .boxed() + } + + proptest! { + #![proptest_config(ProptestConfig { + // Setting both fork and timeout is redundant since timeout implies + // fork, but both are shown for clarity. + fork: true, + .. ProptestConfig::default() + })] + + #[test] + fn prop_json_encode_decode(val in arb_json_value()) { + let mut encoded: Vec = Vec::new(); + let _ = val.write(&mut encoded); + println!("{}", String::from_utf8(encoded.clone()).unwrap()); + let res = to_owned_value(&mut encoded).unwrap(); + assert_eq!(val, res); + } + + } + proptest! { + #![proptest_config(ProptestConfig { + // Setting both fork and timeout is redundant since timeout implies + // fork, but both are shown for clarity. + fork: true, + .. ProptestConfig::default() + })] + + #[test] + fn prop_json(d in arb_json()) { + if let Ok(v_serde) = serde_json::from_slice::(&d.as_bytes()) { + let mut d1 = d.clone(); + let d1 = unsafe{ d1.as_bytes_mut()}; + let v_simd_serde: serde_json::Value = from_slice(d1).expect(""); + // We add our own encoder in here. 
+ let mut d2 = v_simd_serde.to_string(); + let d2 = unsafe{ d2.as_bytes_mut()}; + let mut d3 = d.clone(); + let d3 = unsafe{ d3.as_bytes_mut()}; + assert_eq!(v_simd_serde, v_serde); + let v_simd_owned = to_owned_value(d2); + assert!(v_simd_owned.is_ok()); + let v_simd_borrowed = to_borrowed_value(d3); + dbg!(&v_simd_borrowed); + assert!(v_simd_borrowed.is_ok()); + assert_eq!(v_simd_owned.unwrap(), super::OwnedValue::from(v_simd_borrowed.unwrap())); + } + + } + + } + + fn arb_junk() -> BoxedStrategy> { + prop::collection::vec(any::(), 0..(1024 * 8)).boxed() + } + proptest! { + #![proptest_config(ProptestConfig { + // Setting both fork and timeout is redundant since timeout implies + // fork, but both are shown for clarity. + fork: true, + .. ProptestConfig::default() + })] + #[test] + fn prop_junk(d in arb_junk()) { + let mut d1 = d.clone(); + let mut d2 = d.clone(); + let mut d3 = d.clone(); + + let _ = from_slice::(&mut d1); + let _ = to_borrowed_value(&mut d2); + let _ = to_owned_value(&mut d3); + + } + } + + proptest! { + #![proptest_config(ProptestConfig { + // Setting both fork and timeout is redundant since timeout implies + // fork, but both are shown for clarity. + fork: true, + .. 
ProptestConfig::default() + })] + + #[test] + fn prop_string(d in "\\PC*") { + let mut d1 = d.clone(); + let mut d1 = unsafe{ d1.as_bytes_mut()}; + let mut d2 = d.clone(); + let mut d2 = unsafe{ d2.as_bytes_mut()}; + let mut d3 = d.clone(); + let mut d3 = unsafe{ d3.as_bytes_mut()}; + let _ = from_slice::(&mut d1); + let _ = to_borrowed_value(&mut d2); + let _ = to_owned_value(&mut d3); + + } + } +} From b9b39007f295761fbc3b8de73c70bff55abe9487 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 9 Aug 2019 19:07:35 -0400 Subject: [PATCH 05/34] feat: remove core_arch unused dependency --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index cae31f52..399f7c9d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,6 @@ halfbrown = "0.1" page_size = "0.4" itoa = "0.4" ryu = "1" -core_arch = "0.1.5" # serde compatibilty serde = { version = "1", features = ["derive"], optional = true} From 1c989cf5023e7badb4d78332e18493231d4e0488 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Sat, 10 Aug 2019 08:08:14 -0400 Subject: [PATCH 06/34] feat: update string parse --- src/neon/deser.rs | 144 ++++++++++++----------------------------- src/neon/intrinsics.rs | 10 ++- src/stringparse.rs | 6 ++ src/value/generator.rs | 7 +- 4 files changed, 61 insertions(+), 106 deletions(-) diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 9cc5ca13..b194dde3 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -1,131 +1,73 @@ -use std::mem; +//use std::mem; pub use crate::error::{Error, ErrorType}; pub use crate::Deserializer; pub use crate::Result; +pub use crate::neon::stage1::SIMDJSON_PADDING; pub use crate::neon::intrinsics::*; pub use crate::neon::utf8check::*; pub use crate::stringparse::*; -use crate::neon::intrinsics::*; +pub use crate::neon::intrinsics::*; +unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dst: &mut [u8]) -> ParseStringHelper { + // this can read up to 31 bytes beyond the buffer size, but we require + // 
SIMDJSON_PADDING of padding + let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); + let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); + vst1q_u8(dst.as_mut_ptr(), v0); + vst1q_u8(dst.as_mut_ptr().add(16), v1); + + let bs_mask : uint8x16_t = vmovq_n_u8('\\' as u8); + let qt_mask : uint8x16_t = vmovq_n_u8('"' as u8); + + let bit_mask = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + + let cmp_bs_0 : uint8x16_t = vceqq_u8(v0, bs_mask); + let cmp_bs_1 : uint8x16_t = vceqq_u8(v1, bs_mask); + let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, qt_mask); + let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, qt_mask); + + let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); + let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); + let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); + let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); + + let sum0 : uint8x16_t = vpaddq_u8(cmp_bs_0, cmp_bs_1); + let sum1 : uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + ParseStringHelper { + bs_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits + quote_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + } +} impl<'de> Deserializer<'de> { #[cfg_attr(not(feature = "no-inline"), inline(always))] pub fn parse_str_(&mut self) -> Result<&'de str> { // Add 1 to skip the initial " let idx = self.iidx + 1; - let mut padding = [0u8; 32]; +// let padding = [0u8; 32]; //let mut read: usize = 0; // we include the terminal '"' so we know where to end // This is safe since we check sub's lenght in the range access above and only // create sub sliced form sub to `sub.len()`. - - // - // FIXME - this section is somewhat questionable -- needs a complete proofread!!! - // - let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) 
}; let mut src_i: usize = 0; - let mut len = src_i; - loop { - let v: int8x16_t = if src.len() >= src_i + 16 { - // This is safe since we ensure src is at least 16 wide - #[allow(clippy::cast_ptr_alignment)] - unsafe { - *(src.as_ptr().add(src_i) as *const int8x16_t) - } - } else { - unsafe { - padding - .get_unchecked_mut(..src.len() - src_i) - .clone_from_slice(src.get_unchecked(src_i..)); - // This is safe since we ensure src is at least 16 wide - #[allow(clippy::cast_ptr_alignment)] - *(padding.as_ptr() as *const int8x16_t) - } - }; - - // store to dest unconditionally - we can overwrite the bits we don't like - // later - let bs_bits: u32 = 0; // FIXME: vceqq_s8(v, vdupq_n_s8(b'\\' as i8)); - let quote_mask = 0; // FIXME: unsafe { vceqq_s8(v, vdupq_n_s8(b'"' as i8)) }; - let quote_bits = 0; // FIXME: unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; - if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { - // we encountered quotes first. Move dst to point to quotes and exit - // find out where the quote is... - let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; - - /////////////////////// - // Above, check for overflow in case someone has a crazy string (>=4GB?) - // But only add the overflow check when the document itself exceeds 4GB - // Currently unneeded because we refuse to parse docs larger or equal to 4GB. - //////////////////////// - - // we advance the point, accounting for the fact that we have a NULl termination - - len += quote_dist as usize; - unsafe { - let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; - return Ok(&*v); - } - - // we compare the pointers since we care if they are 'at the same spot' - // not if they are the same value - } - if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { - // Move to the 'bad' character - let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); - len += bs_dist as usize; - src_i += bs_dist as usize; - break; - } else { - // they are the same. 
Since they can't co-occur, it means we encountered - // neither. - src_i += 16; - len += 16; - } - } - + let len = src_i; let mut dst_i: usize = 0; let dst: &mut [u8] = &mut self.strings; loop { - let v: int8x16_t = if src.len() >= src_i + 16 { - // This is safe since we ensure src is at least 16 wide - #[allow(clippy::cast_ptr_alignment)] - unsafe { - *(src.as_ptr().add(src_i) as *const int8x16_t) - } - } else { - unsafe { - padding - .get_unchecked_mut(..src.len() - src_i) - .clone_from_slice(src.get_unchecked(src_i..)); - // This is safe since we ensure src is at least 16 wide - #[allow(clippy::cast_ptr_alignment)] - *(padding.as_ptr() as *const int8x16_t) - } - }; - - #[allow(clippy::cast_ptr_alignment)] - unsafe { - *(dst.as_mut_ptr().add(dst_i) as *mut int8x16_t) = v; - }; - // store to dest unconditionally - we can overwrite the bits we don't like // later - let bs_bits: u32 = 0; // FIXME -// unsafe { -// static_cast_u32!(_mm_movemask_epi8(vceqq_s8( -// v, -// vdupq_n_s8(b'\\' as i8) -// ))) -// }; - let quote_mask = 0; // FIXME: unsafe { vceqq_s8(v, vdupq_n_s8(b'"' as i8)) }; - let quote_bits = 0; // FIXME: unsafe { static_cast_u32!(_mm_movemask_epi8(quote_mask)) }; + let ParseStringHelper {bs_bits, quote_bits} = unsafe { find_bs_bits_and_quote_bits(src, dst) }; + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... @@ -196,8 +138,8 @@ impl<'de> Deserializer<'de> { } else { // they are the same. Since they can't co-occur, it means we encountered // neither. 
- src_i += 16; - dst_i += 16; + src_i += 32; + dst_i += 32; } } } diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 065047b6..094e9f7f 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -154,6 +154,11 @@ pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { *(addr as *const uint8x16_t) } +#[inline] +pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { + std::ptr::write(addr as *mut uint8x16_t, val); +} + macro_rules! aarch64_simd_2 { ($name:ident, $type:ty, $simd_fn:ident, $intr:ident) => { #[inline] @@ -341,6 +346,7 @@ macro_rules! arm_vget_lane { } arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); +arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); @@ -385,8 +391,8 @@ pub unsafe fn hamming(a:u64) -> u32 { } #[inline] -pub unsafe fn trailingzeroes(a:u64) -> u32 { - cttz_u64_(a) +pub fn trailingzeroes(a:u64) -> u32 { + unsafe { cttz_u64_(a) } } #[inline] diff --git a/src/stringparse.rs b/src/stringparse.rs index 0ff8a078..9a7e3b8f 100644 --- a/src/stringparse.rs +++ b/src/stringparse.rs @@ -73,3 +73,9 @@ pub fn handle_unicode_codepoint( let offset: usize = codepoint_to_utf8(code_point, dst_ptr); Ok((offset, src_offset)) } + +// Holds backslashes and quotes locations. 
+pub struct ParseStringHelper { + pub bs_bits: u32, + pub quote_bits: u32, +} diff --git a/src/value/generator.rs b/src/value/generator.rs index 1d21a833..bb054fc0 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -97,9 +97,10 @@ pub trait BaseGenerator { #[inline(always)] fn write_string(&mut self, string: &str) -> io::Result<()> { stry!(self.write_char(b'"')); - let mut string = string.as_bytes(); - let mut len = string.len(); - let mut idx = 0; + let string = string.as_bytes(); +// let mut string = string.as_bytes(); +// let mut len = string.len(); +// let mut idx = 0; // unsafe { // Looking at the table above the lower 5 bits are entirely From 592607091a67732262a6bea655dde9d532218123 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Sat, 10 Aug 2019 09:08:34 -0400 Subject: [PATCH 07/34] feat: additional tweaks, not breaks during linking --- src/neon/intrinsics.rs | 94 +++++++++++++++++++------------ src/neon/simd_llvm.rs | 2 +- src/neon/stage1.rs | 22 ++++---- src/neon/utf8check.rs | 125 ++++++++++++++++++++--------------------- src/value/generator.rs | 8 +-- 5 files changed, 137 insertions(+), 114 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 094e9f7f..3db115b8 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -21,42 +21,48 @@ macro_rules! 
types { )*) } +#[allow(non_camel_case_types)] +type poly128 = [u8; 16]; + extern "C" { #[link_name = "llvm.aarch64.neon.pmull64"] - fn vmull_p64_(a: i64, b: i64) -> int8x16_t; + fn vmull_p64_(a: i64, b: i64) -> poly128; #[link_name = "llvm.aarch64.neon.vshrq"] - fn vshrq_n_u8_(a: uint8x16_t, b: u8) -> uint8x16_t; + fn vshrq_n_u8_(a: poly128, b: u8) -> poly128; #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + fn vpaddq_u8_(a: poly128, b: poly128) -> poly128; + #[link_name = "llvm.aarch64.neon.addp.v16u8"] + fn vaddq_u8_(a: poly128, b: poly128) -> poly128; #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + fn vaddq_s8_(a: poly128, b: poly128) -> poly128; #[link_name = "llvm.aarch64.neon.addp.v16i32"] - fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + fn vaddq_s32_(a: poly128, b: poly128) -> poly128; + #[link_name = "llvm.aarch64.neon.vextq.v16u8"] + fn vextq_u8_(a: poly128, b: poly128, n: u8) -> poly128; #[link_name = "llvm.aarch64.neon.vextq.v16s8"] - fn vextq_s8_(a: int8x16_t, b: int8x16_t, n:u8) -> int8x16_t; + fn vextq_s8_(a: poly128, b: poly128, n: u8) -> poly128; #[link_name = "llvm.aarch64.neon.vtstq.v16u8"] - fn vtstq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + fn vtstq_u8_(a: poly128, b: poly128) -> poly128; #[link_name = "llvm.aarch64.neon.vtstq.v16s8"] - fn vtstq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + fn vtstq_s8_(a: poly128, b: poly128) -> poly128; #[link_name = "llvm.ctpop.u64"] fn ctpop_u64_(a: u64) -> u32; - #[link_name = "llvm.ctlz.u64"] - fn ctlz_u64_(a: u64) -> u32; + // #[link_name = "llvm.ctlz.u64"] +// fn ctlz_u64_(a: u64) -> u32; #[link_name = "llvm.cttz.u64"] fn cttz_u64_(a: u64) -> u32; } #[inline] -#[cfg_attr(test, assert_instr(pmull))] -pub unsafe fn vmull_p64(a: i64, b: i64) -> uint8x16_t { - mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) +pub unsafe fn vmull_p64(a: i64, b: i64) -> 
int8x16_t { + mem::transmute(vmull_p64_(a, b)) } #[inline] pub unsafe fn vshrq_n_u8(a: uint8x16_t, b: u8) -> uint8x16_t { // FIXME? - vshrq_n_u8_(a, b) + mem::transmute(vshrq_n_u8_(mem::transmute(a), mem::transmute(b))) } types! { @@ -116,19 +122,19 @@ types! { } impl uint8x16_t { - pub fn new(a:u8,b:u8,c:u8,d:u8,e:u8,f:u8,g:u8,h:u8,i:u8,j:u8,k:u8,l:u8,m:u8,n:u8,o:u8,p:u8) -> uint8x16_t { + pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } } impl int8x16_t { - pub fn new(a:i8,b:i8,c:i8,d:i8,e:i8,f:i8,g:i8,h:i8,i:i8,j:i8,k:i8,l:i8,m:i8,n:i8,o:i8,p:i8) -> int8x16_t { + pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } } impl int32x4_t { - pub fn new(a:i32,b:i32,c:i32,d:i32) -> int32x4_t { + pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { int32x4_t(a, b, c, d) } } @@ -199,6 +205,8 @@ macro_rules! 
aarch64_simd_cgtu { }; } +aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); + aarch64_simd_cgt!(vcgt_s64, int64x1_t); aarch64_simd_cgt!(vcgtq_s64, int64x2_t); aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); @@ -262,7 +270,7 @@ aarch64_simd_cleu!(vcleq_u64, uint64x2_t); aarch64_simd_cleu!(vcgtq_s8, int8x16_t); #[inline] -pub fn vdupq_n_s8(a:i8) -> int8x16_t { +pub fn vdupq_n_s8(a: i8) -> int8x16_t { int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } @@ -272,12 +280,12 @@ pub fn zeroi8x16() -> int8x16_t { } #[inline] -pub fn vdupq_n_u8(a:u8) -> uint8x16_t { +pub fn vdupq_n_u8(a: u8) -> uint8x16_t { uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } #[inline] -pub fn vmovq_n_u8(a:u8) -> uint8x16_t { +pub fn vmovq_n_u8(a: u8) -> uint8x16_t { uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } @@ -288,30 +296,35 @@ pub fn zerou8x16() -> uint8x16_t { #[inline] pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - vpaddq_u8_(a, b) + mem::transmute(vpaddq_u8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) } #[inline] pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - vaddq_s8_(a, b) + mem::transmute(vaddq_s8_(mem::transmute(a), mem::transmute(b))) } #[inline] pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - vaddq_s32_(a, b) + mem::transmute(vaddq_s32_(mem::transmute(a), mem::transmute(b))) } #[inline] -pub unsafe fn vandq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } +pub unsafe fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } #[inline] -pub unsafe fn vorrq_u8(a:uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } +pub unsafe fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } #[inline] -pub unsafe fn vandq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t 
{ simd_llvm::simd_and(a, b) } +pub unsafe fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_and(a, b) } #[inline] -pub unsafe fn vorrq_s8(a:int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } +pub unsafe fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } macro_rules! arm_reinterpret { ($name:ident, $from:ty, $to:ty) => { @@ -332,6 +345,8 @@ arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); +arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); + macro_rules! arm_vget_lane { ($name:ident, $to:ty, $from:ty, $lanes:literal) => { #[inline] @@ -350,12 +365,21 @@ arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); -pub unsafe fn vextq_s8(a:int8x16_t, b:int8x16_t, n:u8) -> int8x16_t { - vextq_s8_(a, b, n) +arm_vget_lane!(vgetq_lane_s16, i16, int16x8_t, 7); +arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); +arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); +arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); + +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + mem::transmute(vextq_u8_(mem::transmute(a), mem::transmute(b), n)) +} + +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + mem::transmute(vextq_s8_(mem::transmute(a), mem::transmute(b), n)) } #[inline] -pub fn vqmovn_u64(a:uint64x2_t) -> uint32x2_t { +pub fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { uint32x2_t(a.0 as u32, a.1 as u32) } @@ -377,25 +401,25 @@ pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[inline] pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - vtstq_u8_(a, b) + mem::transmute(vtstq_u8_(mem::transmute(a), mem::transmute(b))) } #[inline] pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> 
int8x16_t { - vtstq_s8_(a, b) + mem::transmute(vtstq_s8_(mem::transmute(a), mem::transmute(b))) } #[inline] -pub unsafe fn hamming(a:u64) -> u32 { +pub unsafe fn hamming(a: u64) -> u32 { ctpop_u64_(a) } #[inline] -pub fn trailingzeroes(a:u64) -> u32 { +pub fn trailingzeroes(a: u64) -> u32 { unsafe { cttz_u64_(a) } } #[inline] -pub unsafe fn vst1q_u32(addr: *mut u8, val : uint32x4_t) { +pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { std::ptr::write(addr as *mut uint32x4_t, val) } diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index 892ae59c..d91ba422 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -19,7 +19,7 @@ extern "platform-intrinsic" { // // pub fn simd_cast(x: T) -> U; // - pub fn simd_add(x: T, y: T) -> T; +// pub fn simd_add(x: T, y: T) -> T; pub fn simd_sub(x: T, y: T) -> T; // pub fn simd_mul(x: T, y: T) -> T; // pub fn simd_div(x: T, y: T) -> T; diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index fc4daabc..b2cdbaf5 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -57,11 +57,11 @@ unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: } unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { - vgetq_lane_u64(vreinterpretq_u64_u8(vmull_p64(-1, quote_bits as i64)), 0) + vgetq_lane_s64(vreinterpretq_s64_s8(vmull_p64(-1, quote_bits as i64)), 0) as u64 } struct Utf8CheckingState { - has_error: int8x16_t, + has_error: uint8x16_t, previous: ProcessedUtfBytes, } @@ -69,7 +69,7 @@ impl Default for Utf8CheckingState { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { Utf8CheckingState { - has_error: vdupq_n_s8(0), + has_error: vdupq_n_u8(0), previous:ProcessedUtfBytes::default() } } @@ -99,20 +99,20 @@ unsafe fn check_utf8( // All bytes are ascii. Therefore the byte that was just before must be // ascii too. We only check the byte that was just before simd_input. Nines // are arbitrary values. 
- let VERROR: int8x16_t = - int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); + let verror: uint8x16_t = + uint8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); state.has_error = - vorrq_s8(vcgtq_s8(state.previous.carried_continuations, VERROR), + vorrq_u8(vcgtq_u8(state.previous.carried_continuations, verror), state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), + state.previous = check_utf8_bytes(input.v0, &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), + state.previous = check_utf8_bytes(input.v1, &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), + state.previous = check_utf8_bytes(input.v2, &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), + state.previous = check_utf8_bytes(input.v3, &(state.previous), &mut (state.has_error)); } } @@ -497,7 +497,7 @@ impl<'de> Deserializer<'de> { return Err(ErrorType::Syntax); } - let err: i128 = mem::transmute(vtstq_s8(previous.has_error, previous.has_error)); + let err: i128 = mem::transmute(vtstq_u8(previous.has_error, previous.has_error)); if err != 0 { Ok(structural_indexes) diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index de190f46..10c6b7c9 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -1,6 +1,6 @@ use crate::neon::intrinsics::*; -use std::mem; +//use std::mem; /* * legal utf-8 byte sequence @@ -21,18 +21,17 @@ use std::mem; // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { +unsafe fn check_smaller_than_0xf4(current_bytes: uint8x16_t, has_error: &mut uint8x16_t) { // unsigned, saturates to 0 below max *has_error = - vorrq_s8( + vorrq_u8( *has_error, - 
vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); + vqsubq_u8(current_bytes, vdupq_n_u8(0xF4))); } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { - let NIBBLES : int8x16_t = int8x16_t::new( +unsafe fn continuation_lengths(high_nibbles: uint8x16_t) -> uint8x16_t { + let nibbles : uint8x16_t = uint8x16_t::new( 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) 0, 0, 0, 0, // 10xx (continuation) 2, 2, // 110x @@ -40,35 +39,35 @@ unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { 4, // 1111, next should be 0 (not checked here) ); - vqtbl1q_s8(NIBBLES, vreinterpretq_u8_s8(high_nibbles)) + vqtbl1q_u8(nibbles, high_nibbles) } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { - let right1 : int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), - vdupq_n_u8(1))); - let sum : int8x16_t = vaddq_s8(initial_lengths, right1); +unsafe fn carry_continuations(initial_lengths: uint8x16_t, previous_carries: uint8x16_t) -> uint8x16_t { + let right1 : uint8x16_t = vqsubq_u8( + vextq_u8(previous_carries, initial_lengths, 16 - 1), + vdupq_n_u8(1)); + let sum : uint8x16_t = vaddq_u8(initial_lengths, right1); - let right2 : int8x16_t = vreinterpretq_s8_u8( - vqsubq_u8(vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), - vdupq_n_u8(2))); + let right2 : uint8x16_t = vqsubq_u8( + vextq_u8(previous_carries, sum, 16 - 2), + vdupq_n_u8(2)); - vaddq_s8(sum, right2) + vaddq_u8(sum, right2) } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { +unsafe fn check_continuations(initial_lengths: uint8x16_t, carries: uint8x16_t, has_error: &mut uint8x16_t) { // overlap || underlap // carry > length && length 
> 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) // FIXME: was vceqq_u8 ? - let overunder : int8x16_t = vceqq_s8( - vcgtq_s8(carries, initial_lengths), - vcgtq_s8(initial_lengths, vdupq_n_s8(0))); + let overunder : uint8x16_t = vceqq_u8( + vcgtq_u8(carries, initial_lengths), + vcgtq_u8(initial_lengths, vdupq_n_u8(0))); - *has_error = vorrq_s8(*has_error, overunder); + *has_error = vorrq_u8(*has_error, overunder); } // when 0xED is found, next byte must be no larger than 0x9F @@ -76,21 +75,21 @@ unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, ha // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn check_first_continuation_max( - current_bytes: int8x16_t, - off1_current_bytes: int8x16_t, - has_error: &mut int8x16_t, + current_bytes: uint8x16_t, + off1_current_bytes: uint8x16_t, + has_error: &mut uint8x16_t, ) { - let maskED : int8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xED)); - let maskF4 : int8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(0xF4)); + let mask_ed : uint8x16_t = vceqq_u8(off1_current_bytes, vdupq_n_u8(0xED)); + let mask_f4 : uint8x16_t = vceqq_u8(off1_current_bytes, vdupq_n_u8(0xF4)); // FIXME: was vandq_u8? 
- let badfollowED : int8x16_t = - vandq_s8(vcgtq_s8(current_bytes, vdupq_n_s8(0x9F)), maskED); - let badfollowF4 : int8x16_t = - vandq_s8(vcgtq_s8(current_bytes, vdupq_n_s8(0x8F)), maskF4); + let badfollow_ed : uint8x16_t = + vandq_u8(vcgtq_u8(current_bytes, vdupq_n_u8(0x9F)), mask_ed); + let badfollow_f4 : uint8x16_t = + vandq_u8(vcgtq_u8(current_bytes, vdupq_n_u8(0x8F)), mask_f4); - *has_error = vorrq_s8( - *has_error, vorrq_s8(badfollowED, badfollowF4)); + *has_error = vorrq_u8( + *has_error, vorrq_u8(badfollow_ed, badfollow_f4)); } // map off1_hibits => error condition @@ -101,73 +100,73 @@ unsafe fn check_first_continuation_max( // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn check_overlong( - current_bytes: int8x16_t, - off1_current_bytes: int8x16_t, - hibits: int8x16_t, - previous_hibits: int8x16_t, - has_error: &mut int8x16_t, + current_bytes: uint8x16_t, + off1_current_bytes: uint8x16_t, + hibits: uint8x16_t, + previous_hibits: uint8x16_t, + has_error: &mut uint8x16_t, ) { let _initial_mins : int8x16_t = int8x16_t::new( -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false - 0xC2, -128, // 110x - 0xE1, // 1110 - 0xF1, + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, ); let _second_mins : int8x16_t = int8x16_t::new( -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 127, 127, // 110x => true - 0xA0, // 1110 - 0x90, + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, ); - let off1_hibits : int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); + let off1_hibits : uint8x16_t = vextq_u8(previous_hibits, hibits, 16 - 1); let initial_mins : int8x16_t = - vqtbl1q_s8(_initial_mins, vreinterpretq_u8_s8(off1_hibits)); + vqtbl1q_s8(_initial_mins, off1_hibits); - let initial_under : int8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + let initial_under : uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(initial_mins, 
vreinterpretq_s8_u8(off1_current_bytes))); let second_mins : int8x16_t = - vqtbl1q_s8(_second_mins, vreinterpretq_u8_s8(off1_hibits)); + vqtbl1q_s8(_second_mins, off1_hibits); - let second_under : int8x16_t = vcgtq_s8(second_mins, current_bytes); + let second_under : uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(second_mins, vreinterpretq_s8_u8(current_bytes))); - *has_error = vorrq_s8( - *has_error, vandq_s8(initial_under, second_under)); + *has_error = vorrq_u8( + *has_error, vandq_u8(initial_under, second_under)); } pub struct ProcessedUtfBytes { - rawbytes: int8x16_t, - high_nibbles: int8x16_t, - pub carried_continuations: int8x16_t, + rawbytes: uint8x16_t, + high_nibbles: uint8x16_t, + pub carried_continuations: uint8x16_t, } impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { ProcessedUtfBytes { - rawbytes: vdupq_n_s8(0x00), - high_nibbles: vdupq_n_s8(0x00), - carried_continuations: vdupq_n_s8(0x00), + rawbytes: vdupq_n_u8(0x00), + high_nibbles: vdupq_n_u8(0x00), + carried_continuations: vdupq_n_u8(0x00), } } } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { +unsafe fn count_nibbles(bytes: uint8x16_t, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = vreinterpretq_s8_u8(mem::transmute(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4))); + answer.high_nibbles = vshrq_n_u8(bytes, 4); } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated #[cfg_attr(not(feature = "no-inline"), inline)] pub fn check_utf8_bytes( - current_bytes: int8x16_t, + current_bytes: uint8x16_t, previous: &ProcessedUtfBytes, - has_error: &mut int8x16_t, + has_error: &mut uint8x16_t, ) -> ProcessedUtfBytes { let mut pb = ProcessedUtfBytes::default(); unsafe { @@ -175,15 +174,15 @@ pub fn check_utf8_bytes( check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: int8x16_t = 
continuation_lengths(pb.high_nibbles); + let initial_lengths: uint8x16_t = continuation_lengths(pb.high_nibbles); pb.carried_continuations = carry_continuations(initial_lengths, previous.carried_continuations); check_continuations(initial_lengths, pb.carried_continuations, has_error); - let off1_current_bytes : int8x16_t = - vextq_s8(previous.rawbytes, pb.rawbytes, 16 - 1); + let off1_current_bytes : uint8x16_t = + vextq_u8(previous.rawbytes, pb.rawbytes, 16 - 1); check_first_continuation_max(current_bytes, off1_current_bytes, has_error); diff --git a/src/value/generator.rs b/src/value/generator.rs index bb054fc0..1e0e4baf 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -14,10 +14,10 @@ use std::io::Write; use std::marker::PhantomData; use std::ptr; -#[cfg(target_feature = "avx2")] -const AVX2_PRESENT : bool = true; -#[cfg(not(target_feature = "avx2"))] -const AVX2_PRESENT : bool = false; +//#[cfg(target_feature = "avx2")] +//const AVX2_PRESENT : bool = true; +//#[cfg(not(target_feature = "avx2"))] +//const AVX2_PRESENT : bool = false; const QU: u8 = b'"'; const BS: u8 = b'\\'; From 62a3f93579f06e633d69351ff83cda9bfd6250f2 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Sat, 10 Aug 2019 09:37:32 -0400 Subject: [PATCH 08/34] feat: fixing intrinsics (maybe) --- src/neon/intrinsics.rs | 45 +++++++++++++++++++++++------------------- src/neon/stage1.rs | 2 +- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 3db115b8..1520e5b0 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -22,40 +22,43 @@ macro_rules! 
types { } #[allow(non_camel_case_types)] -type poly128 = [u8; 16]; +pub type poly64_t = i64; +//#[allow(non_camel_case_types)] +//type poly128_t = [u8; 16]; +#[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.aarch64.neon.pmull64"] - fn vmull_p64_(a: i64, b: i64) -> poly128; + fn vmull_p64_(a: i64, b: i64) -> int8x16_t; #[link_name = "llvm.aarch64.neon.vshrq"] - fn vshrq_n_u8_(a: poly128, b: u8) -> poly128; + fn vshrq_n_u8_(a: poly128_t, b: u8) -> poly128_t; #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vpaddq_u8_(a: poly128, b: poly128) -> poly128; + fn vpaddq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.aarch64.neon.addp.v16u8"] - fn vaddq_u8_(a: poly128, b: poly128) -> poly128; + fn vaddq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vaddq_s8_(a: poly128, b: poly128) -> poly128; + fn vaddq_s8_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.aarch64.neon.addp.v16i32"] - fn vaddq_s32_(a: poly128, b: poly128) -> poly128; + fn vaddq_s32_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.aarch64.neon.vextq.v16u8"] - fn vextq_u8_(a: poly128, b: poly128, n: u8) -> poly128; + fn vextq_u8_(a: poly128_t, b: poly128_t, n: u8) -> poly128_t; #[link_name = "llvm.aarch64.neon.vextq.v16s8"] - fn vextq_s8_(a: poly128, b: poly128, n: u8) -> poly128; + fn vextq_s8_(a: poly128_t, b: poly128_t, n: u8) -> poly128_t; #[link_name = "llvm.aarch64.neon.vtstq.v16u8"] - fn vtstq_u8_(a: poly128, b: poly128) -> poly128; + fn vtstq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.aarch64.neon.vtstq.v16s8"] - fn vtstq_s8_(a: poly128, b: poly128) -> poly128; - #[link_name = "llvm.ctpop.u64"] - fn ctpop_u64_(a: u64) -> u32; + fn vtstq_s8_(a: poly128_t, b: poly128_t) -> poly128_t; + #[link_name = "llvm.ctpop.i64"] + fn ctpop_s64_(a: i64) -> i64; // #[link_name = "llvm.ctlz.u64"] -// fn ctlz_u64_(a: u64) -> u32; - #[link_name = "llvm.cttz.u64"] - fn cttz_u64_(a: u64) -> 
u32; + // fn ctlz_u64_(a: u64) -> u32; + #[link_name = "llvm.cttz.i64"] + fn cttz_u64_(a: i64) -> i64; } #[inline] -pub unsafe fn vmull_p64(a: i64, b: i64) -> int8x16_t { - mem::transmute(vmull_p64_(a, b)) +pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { + mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) } @@ -119,6 +122,8 @@ types! { pub struct int64x2_t(i64, i64); /// ARM-specific 128-bit wide vector of two packed `u64`. pub struct uint64x2_t(u64, u64); + /// ARM-specific 128-bit wide vector of one packed `i128`. + pub struct poly128_t(i128); // FIXME: check this! } impl uint8x16_t { @@ -411,12 +416,12 @@ pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[inline] pub unsafe fn hamming(a: u64) -> u32 { - ctpop_u64_(a) + ctpop_s64_(a as i64) as u32 } #[inline] pub fn trailingzeroes(a: u64) -> u32 { - unsafe { cttz_u64_(a) } + unsafe { cttz_u64_(a as i64) as u32 } } #[inline] diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index b2cdbaf5..0e3818d0 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -57,7 +57,7 @@ unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: } unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { - vgetq_lane_s64(vreinterpretq_s64_s8(vmull_p64(-1, quote_bits as i64)), 0) as u64 + vgetq_lane_u64(vreinterpretq_u64_u8(mem::transmute(vmull_p64(-1, quote_bits as i64))), 0) } struct Utf8CheckingState { From bca6cbab67e6444c78644c1ac169121f2f71c3b1 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Sat, 10 Aug 2019 11:35:06 -0400 Subject: [PATCH 09/34] feat: temp stub replacements for intrinsics (still broken) --- src/neon/intrinsics.rs | 217 +++++++++++++++++++++++++++++++++++------ src/neon/simd_llvm.rs | 2 +- 2 files changed, 187 insertions(+), 32 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 1520e5b0..b14fd4c0 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -30,32 +30,124 @@ pub type poly64_t = i64; 
extern "C" { #[link_name = "llvm.aarch64.neon.pmull64"] fn vmull_p64_(a: i64, b: i64) -> int8x16_t; - #[link_name = "llvm.aarch64.neon.vshrq"] - fn vshrq_n_u8_(a: poly128_t, b: u8) -> poly128_t; - #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vpaddq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; - #[link_name = "llvm.aarch64.neon.addp.v16u8"] - fn vaddq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; - #[link_name = "llvm.aarch64.neon.addp.v16i8"] - fn vaddq_s8_(a: poly128_t, b: poly128_t) -> poly128_t; - #[link_name = "llvm.aarch64.neon.addp.v16i32"] - fn vaddq_s32_(a: poly128_t, b: poly128_t) -> poly128_t; - #[link_name = "llvm.aarch64.neon.vextq.v16u8"] - fn vextq_u8_(a: poly128_t, b: poly128_t, n: u8) -> poly128_t; - #[link_name = "llvm.aarch64.neon.vextq.v16s8"] - fn vextq_s8_(a: poly128_t, b: poly128_t, n: u8) -> poly128_t; - #[link_name = "llvm.aarch64.neon.vtstq.v16u8"] - fn vtstq_u8_(a: poly128_t, b: poly128_t) -> poly128_t; - #[link_name = "llvm.aarch64.neon.vtstq.v16s8"] - fn vtstq_s8_(a: poly128_t, b: poly128_t) -> poly128_t; #[link_name = "llvm.ctpop.i64"] fn ctpop_s64_(a: i64) -> i64; - // #[link_name = "llvm.ctlz.u64"] - // fn ctlz_u64_(a: u64) -> u32; #[link_name = "llvm.cttz.i64"] fn cttz_u64_(a: i64) -> i64; } +//unsafe fn vpaddq_u8_(_a: poly128_t, _b: poly128_t) -> poly128_t { mem::transmute(vdupq_n_u8(0)) } + +unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } +unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } +unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } + +unsafe fn vextq_u8_(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + match n { + 0 => b, + 1 => uint8x16_t( + a.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 2 => uint8x16_t( + a.0, a.1, b.2, b.3, b.4, b.5, b.6, b.7, 
b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 3 => uint8x16_t( + a.0, a.1, a.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 4 => uint8x16_t( + a.0, a.1, a.2, a.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 5 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 6 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 7 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 8 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 9 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 10 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 11 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, b.11, b.12, b.13, b.14, b.15, + ), + 12 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, b.12, b.13, b.14, b.15, + ), + 13 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, b.13, b.14, b.15, + ), + 14 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, b.14, b.15, + ), + 15 => uint8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, a.14, b.15, + ), + 16 => a, + _ => unreachable_unchecked(), + } +} + +unsafe fn vextq_s8_(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + match n { + 0 => b, + 1 => int8x16_t( + a.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 2 => int8x16_t( + a.0, a.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 3 => int8x16_t( + a.0, a.1, a.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 4 => 
int8x16_t( + a.0, a.1, a.2, a.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 5 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 6 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 7 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 8 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 9 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 10 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, b.10, b.11, b.12, b.13, b.14, b.15, + ), + 11 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, b.11, b.12, b.13, b.14, b.15, + ), + 12 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, b.12, b.13, b.14, b.15, + ), + 13 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, b.13, b.14, b.15, + ), + 14 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, b.14, b.15, + ), + 15 => int8x16_t( + a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, a.14, b.15, + ), + 16 => a, + _ => unreachable_unchecked(), + } +} + #[inline] pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) @@ -63,9 +155,25 @@ pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { #[inline] -pub unsafe fn vshrq_n_u8(a: uint8x16_t, b: u8) -> uint8x16_t { - // FIXME? 
- mem::transmute(vshrq_n_u8_(mem::transmute(a), mem::transmute(b))) +pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { + uint8x16_t( + a.0 >> n, + a.1 >> n, + a.2 >> n, + a.3 >> n, + a.4 >> n, + a.5 >> n, + a.6 >> n, + a.7 >> n, + a.8 >> n, + a.9 >> n, + a.10 >> n, + a.11 >> n, + a.12 >> n, + a.13 >> n, + a.14 >> n, + a.15 >> n, + ) } types! { @@ -173,8 +281,6 @@ pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { macro_rules! aarch64_simd_2 { ($name:ident, $type:ty, $simd_fn:ident, $intr:ident) => { #[inline] - #[target_feature(enable = "neon")] - #[cfg_attr(test, assert_instr($intr))] pub unsafe fn $name(a: $type, b: $type) -> $type { simd_llvm::$simd_fn(a, b) } @@ -301,7 +407,7 @@ pub fn zerou8x16() -> uint8x16_t { #[inline] pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - mem::transmute(vpaddq_u8_(mem::transmute(a), mem::transmute(b))) + mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) } #[inline] @@ -355,9 +461,6 @@ arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); macro_rules! 
arm_vget_lane { ($name:ident, $to:ty, $from:ty, $lanes:literal) => { #[inline] - #[target_feature(enable = "neon")] - #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] - #[cfg_attr(test, assert_instr(umov))] pub unsafe fn $name(v: $from, lane: u32) -> $to { if lane > $lanes { unreachable_unchecked() } simd_llvm::simd_extract(v, lane) @@ -404,14 +507,66 @@ pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_sub(mem::transmute(a), mem::transmute(b)) } +#[inline] +fn test_u8(a:u8,b:u8) -> u8 { + if a & b != 0 { + 0xFF + } else { + 0x00 + } +} + #[inline] pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - mem::transmute(vtstq_u8_(mem::transmute(a), mem::transmute(b))) + uint8x16_t( + test_u8(a.0, b.0), + test_u8(a.1, b.1), + test_u8(a.2, b.2), + test_u8(a.3, b.3), + test_u8(a.4, b.4), + test_u8(a.5, b.5), + test_u8(a.6, b.6), + test_u8(a.7, b.7), + test_u8(a.8, b.8), + test_u8(a.9, b.9), + test_u8(a.10, b.10), + test_u8(a.11, b.11), + test_u8(a.12, b.12), + test_u8(a.13, b.13), + test_u8(a.14, b.14), + test_u8(a.15, b.15) + ) +} + +#[inline] +fn test_s8(a:i8,b:i8) -> i8 { + if a & b != 0 { + -1 + } else { + 0x00 + } } #[inline] pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - mem::transmute(vtstq_s8_(mem::transmute(a), mem::transmute(b))) + int8x16_t( + test_s8(a.0, b.0), + test_s8(a.1, b.1), + test_s8(a.2, b.2), + test_s8(a.3, b.3), + test_s8(a.4, b.4), + test_s8(a.5, b.5), + test_s8(a.6, b.6), + test_s8(a.7, b.7), + test_s8(a.8, b.8), + test_s8(a.9, b.9), + test_s8(a.10, b.10), + test_s8(a.11, b.11), + test_s8(a.12, b.12), + test_s8(a.13, b.13), + test_s8(a.14, b.14), + test_s8(a.15, b.15) + ) } #[inline] diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index d91ba422..892ae59c 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -19,7 +19,7 @@ extern "platform-intrinsic" { // // pub fn simd_cast(x: T) -> U; // -// pub fn simd_add(x: T, y: T) -> T; + pub fn 
simd_add(x: T, y: T) -> T; pub fn simd_sub(x: T, y: T) -> T; // pub fn simd_mul(x: T, y: T) -> T; // pub fn simd_div(x: T, y: T) -> T; From 620e69725f23ea1c95f85b7bb276aa5ef71b5bea Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Mon, 12 Aug 2019 23:46:23 -0400 Subject: [PATCH 10/34] feat: get code closer to simdjson (still broken) --- src/neon/intrinsics.rs | 70 ++++++++++++--- src/neon/simd.rs | 2 +- src/neon/simd_llvm.rs | 2 +- src/neon/stage1.rs | 200 ++++++++++++++++++++++++----------------- src/neon/utf8check.rs | 133 ++++++++++++++------------- 5 files changed, 239 insertions(+), 168 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index b14fd4c0..2539ae33 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -28,16 +28,20 @@ pub type poly64_t = i64; #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.aarch64.neon.addp.v16u8"] + fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; #[link_name = "llvm.aarch64.neon.pmull64"] fn vmull_p64_(a: i64, b: i64) -> int8x16_t; #[link_name = "llvm.ctpop.i64"] fn ctpop_s64_(a: i64) -> i64; #[link_name = "llvm.cttz.i64"] fn cttz_u64_(a: i64) -> i64; + #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] + fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] + fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; } -//unsafe fn vpaddq_u8_(_a: poly128_t, _b: poly128_t) -> poly128_t { mem::transmute(vdupq_n_u8(0)) } - unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } @@ -153,6 +157,10 @@ pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { mem::transmute(vmull_p64_(mem::transmute(a), 
mem::transmute(b))) } +#[inline] +pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vpaddq_u8_(a, b) +} #[inline] pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { @@ -294,6 +302,7 @@ macro_rules! aarch64_simd_ceq { } aarch64_simd_ceq!(vceqq_u8, uint8x16_t); + aarch64_simd_ceq!(vceq_s64, int64x1_t); aarch64_simd_ceq!(vceqq_s64, int64x2_t); aarch64_simd_ceq!(vceq_u64, uint64x1_t); @@ -405,11 +414,6 @@ pub fn zerou8x16() -> uint8x16_t { uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) } -#[inline] -pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) -} - #[inline] pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) @@ -441,8 +445,6 @@ macro_rules! arm_reinterpret { ($name:ident, $from:ty, $to:ty) => { // Vector reinterpret cast operation #[inline] - #[target_feature(enable = "neon")] - #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] pub unsafe fn $name(a: $from) -> $to { mem::transmute(a) } @@ -462,7 +464,6 @@ macro_rules! arm_vget_lane { ($name:ident, $to:ty, $from:ty, $lanes:literal) => { #[inline] pub unsafe fn $name(v: $from, lane: u32) -> $to { - if lane > $lanes { unreachable_unchecked() } simd_llvm::simd_extract(v, lane) } }; @@ -487,8 +488,8 @@ pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { } #[inline] -pub fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { - uint32x2_t(a.0 as u32, a.1 as u32) +pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { + vqmovn_u64_(a) } #[inline] @@ -503,8 +504,7 @@ pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { #[inline] pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - // FIXME? 
- simd_llvm::simd_sub(mem::transmute(a), mem::transmute(b)) + vqsubq_u8_(a, b) } #[inline] @@ -583,3 +583,45 @@ pub fn trailingzeroes(a: u64) -> u32 { pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { std::ptr::write(addr as *mut uint32x4_t, val) } + + +#[allow(unused)] +macro_rules! constify_imm5 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) & 0b1_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + _ => $expand!(31), + } + }; +} \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs index 544bc0d8..8a5a21fc 100644 --- a/src/neon/simd.rs +++ b/src/neon/simd.rs @@ -467,4 +467,4 @@ macro_rules! 
constify_imm8 { _ => $expand!(255), } }; -} \ No newline at end of file +} diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index 892ae59c..bb7b25bd 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -20,7 +20,7 @@ extern "platform-intrinsic" { // pub fn simd_cast(x: T) -> U; // pub fn simd_add(x: T, y: T) -> T; - pub fn simd_sub(x: T, y: T) -> T; +// pub fn simd_sub(x: T, y: T) -> T; // pub fn simd_mul(x: T, y: T) -> T; // pub fn simd_div(x: T, y: T) -> T; // pub fn simd_shl(x: T, y: T) -> T; diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 0e3818d0..a1e15a10 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -57,11 +57,13 @@ unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: } unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { - vgetq_lane_u64(vreinterpretq_u64_u8(mem::transmute(vmull_p64(-1, quote_bits as i64))), 0) + vgetq_lane_u64( + vreinterpretq_u64_u8( + mem::transmute(vmull_p64(-1, quote_bits as i64))), 0) } struct Utf8CheckingState { - has_error: uint8x16_t, + has_error: int8x16_t, previous: ProcessedUtfBytes, } @@ -69,8 +71,8 @@ impl Default for Utf8CheckingState { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { Utf8CheckingState { - has_error: vdupq_n_u8(0), - previous:ProcessedUtfBytes::default() + has_error: vdupq_n_s8(0), + previous: ProcessedUtfBytes::default(), } } } @@ -99,20 +101,20 @@ unsafe fn check_utf8( // All bytes are ascii. Therefore the byte that was just before must be // ascii too. We only check the byte that was just before simd_input. Nines // are arbitrary values. 
- let verror: uint8x16_t = - uint8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); + let verror: int8x16_t = + int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); state.has_error = - vorrq_u8(vcgtq_u8(state.previous.carried_continuations, verror), + vorrq_s8(vcgtq_s8(state.previous.carried_continuations, verror), state.has_error); } else { // it is not ascii so we have to do heavy work - state.previous = check_utf8_bytes(input.v0, + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(input.v1, + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(input.v2, + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), &(state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(input.v3, + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), &(state.previous), &mut (state.has_error)); } } @@ -122,7 +124,7 @@ unsafe fn check_utf8( unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { macro_rules! call { ($imm8:expr) => { - vdupq_n_u8($imm8) + vmovq_n_u8($imm8) }; }; let mask: uint8x16_t = constify_imm8!(m, call); @@ -140,7 +142,7 @@ unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { unsafe fn unsigned_lteq_against_input(input: &SimdInput, maxval: u8) -> u64 { macro_rules! 
call { ($imm8:expr) => { - vdupq_n_u8($imm8) + vmovq_n_u8($imm8) }; }; let mask: uint8x16_t = constify_imm8!(maxval, call); @@ -153,7 +155,94 @@ unsafe fn unsigned_lteq_against_input(input: &SimdInput, maxval: u8) -> u64 { neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) } -unsafe fn find_whitespace_and_structurals(input: &SimdInput, whitespace: &mut u64, structurals: &mut u64) { +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. 
+#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { + const EVEN_BITS: u64 = 0x5555_5555_5555_5555; + const ODD_BITS: u64 = !EVEN_BITS; + + let bs_bits: u64 = cmp_mask_against_input(input, b'\\'); + let start_edges: u64 = bs_bits & !(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; + let even_starts: u64 = start_edges & even_start_mask; + let odd_starts: u64 = start_edges & !even_start_mask; + let even_carries: u64 = bs_bits.wrapping_add(even_starts); + + let mut odd_carries: u64 = 0; + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + let iter_ends_odd_backslash: bool = add_overflow(bs_bits, odd_starts, &mut odd_carries); + + odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; + let even_carry_ends: u64 = even_carries & !bs_bits; + let odd_carry_ends: u64 = odd_carries & !bs_bits; + let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; + let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; + let odd_ends: u64 = even_start_odd_end | odd_start_even_end; + odd_ends +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote in an unescaped quote pair and everything in the quote pair) and the +// quote bits, which are the simple unescaped quoted bits. +// +// We also update the prev_iter_inside_quote value to tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of whether we're inside quotes for the next iteration. 
+// +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_quote_mask_and_bits( + input: &SimdInput, + odd_ends: u64, + prev_iter_inside_quote: &mut u64, + quote_bits: &mut u64, + error_mask: &mut u64 +) -> u64 { + *quote_bits = cmp_mask_against_input(input, b'"'); + *quote_bits &= !odd_ends; + + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + + quote_mask ^= *prev_iter_inside_quote; + + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). + // https://tools.ietf.org/html/rfc8259 + let unescaped: u64 = unsigned_lteq_against_input(input, 0x1F); + *error_mask |= quote_mask & unescaped; + + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. 
says this is fine code + *prev_iter_inside_quote = (quote_mask as i64 >> 63) as u64; + quote_mask +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_whitespace_and_structurals( + input: &SimdInput, + whitespace: &mut u64, + structurals: &mut u64, +) { let low_nibble_mask: uint8x16_t = uint8x16_t::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); let high_nibble_mask: uint8x16_t = uint8x16_t::new(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); @@ -197,38 +286,6 @@ unsafe fn find_whitespace_and_structurals(input: &SimdInput, whitespace: &mut u6 *whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); } - -unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { - const EVEN_BITS: u64 = 0x5555_5555_5555_5555; - const ODD_BITS: u64 = !EVEN_BITS; - - let bs_bits: u64 = cmp_mask_against_input(input, b'\\'); - let start_edges: u64 = bs_bits & !(bs_bits << 1); - // flip lowest if we have an odd-length run at the end of the prior - // iteration - let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; - let even_starts: u64 = start_edges & even_start_mask; - let odd_starts: u64 = start_edges & !even_start_mask; - let even_carries: u64 = bs_bits.wrapping_add(even_starts); - - let mut odd_carries: u64 = 0; - // must record the carry-out of our odd-carries out of bit 63; this - // indicates whether the sense of any edge going to the next iteration - // should be flipped - let iter_ends_odd_backslash: bool = add_overflow(bs_bits, odd_starts, &mut odd_carries); - - odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end - // if we had an odd-numbered run at the - // end of the previous iteration - *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; - let even_carry_ends: u64 = even_carries & !bs_bits; - let odd_carry_ends: u64 = odd_carries & !bs_bits; 
- let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; - let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; - let odd_ends: u64 = even_start_odd_end | odd_start_even_end; - odd_ends -} - //template <> //really_inline ErrorValues check_utf8_errors( // utf8_checking_state &state) { @@ -239,31 +296,6 @@ unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_bac // : simdjson::SUCCESS; //} -unsafe fn find_quote_mask_and_bits(input: &SimdInput, odd_ends: u64, - prev_iter_inside_quote: &mut u64, quote_bits: &mut u64, error_mask: &mut u64) -> u64 { - *quote_bits = cmp_mask_against_input(input, b'"'); - *quote_bits &= !odd_ends; - - let mut quote_mask: u64 = compute_quote_mask(*quote_bits); - - quote_mask ^= *prev_iter_inside_quote; - - // All Unicode characters may be placed within the - // quotation marks, except for the characters that MUST be escaped: - // quotation mark, reverse solidus, and the control characters (U+0000 - //through U+001F). - // https://tools.ietf.org/html/rfc8259 - let unescaped: u64 = unsigned_lteq_against_input(input, 0x1F); - *error_mask |= quote_mask & unescaped; - - // right shift of a signed value expected to be well-defined and standard - // compliant as of C++20, - // John Regher from Utah U. 
says this is fine code - *prev_iter_inside_quote = quote_mask >> 63; - - quote_mask -} - // flatten out values in 'bits' assuming that they are are to have values of idx // plus their position in the bitvector, and store these indexes at // base_ptr[base] incrementing base as we go @@ -291,13 +323,13 @@ unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { base.set_len(l + cnt); while bits != 0 { - let v0 : i32 = static_cast_i32!(trailingzeroes(bits)); + let v0: i32 = static_cast_i32!(trailingzeroes(bits)); bits &= bits.wrapping_sub(1); - let v1 : i32 = static_cast_i32!(trailingzeroes(bits)); + let v1: i32 = static_cast_i32!(trailingzeroes(bits)); bits &= bits.wrapping_sub(1); - let v2 : i32 = static_cast_i32!(trailingzeroes(bits)); + let v2: i32 = static_cast_i32!(trailingzeroes(bits)); bits &= bits.wrapping_sub(1); - let v3 : i32 = static_cast_i32!(trailingzeroes(bits)); + let v3: i32 = static_cast_i32!(trailingzeroes(bits)); bits &= bits.wrapping_sub(1); let v: int32x4_t = int32x4_t::new(v3, v2, v1, v0); @@ -308,7 +340,6 @@ unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { } } - // return a updated structural bit vector with quoted contents cleared out and // pseudo-structural characters added to the mask // updates prev_iter_ends_pseudo_pred which tells us whether the previous @@ -352,7 +383,7 @@ fn finalize_structurals( } impl<'de> Deserializer<'de> { - #[inline(never)] + //#[inline(never)] pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { let len = input.len(); // 6 is a heuristic number to estimate it turns out a rate of 1/6 structural caracters lears @@ -360,8 +391,8 @@ impl<'de> Deserializer<'de> { let mut structural_indexes = Vec::with_capacity(len / 6); structural_indexes.push(0); // push extra root element - // let mut has_error: uint8x16_t = vdupq_n_u8(0); - let mut previous = Utf8CheckingState::default(); + let mut utf8_state : Utf8CheckingState = Utf8CheckingState::default(); + // we 
have padded the input out to 64 byte multiple with the remainder being // zeros @@ -397,7 +428,9 @@ impl<'de> Deserializer<'de> { #endif */ let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); - check_utf8(&input, &mut previous); + + check_utf8(&input, &mut utf8_state); + // detect odd sequences of backslashes let odd_ends: u64 = find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); @@ -441,7 +474,7 @@ impl<'de> Deserializer<'de> { .copy_from(input.as_ptr().add(idx), len as usize - idx); let input: SimdInput = fill_input(&tmpbuf); - check_utf8(&input, &mut previous); + check_utf8(&input, &mut utf8_state); // detect odd sequences of backslashes let odd_ends: u64 = @@ -475,7 +508,6 @@ impl<'de> Deserializer<'de> { ); idx += 64; } - // This test isn't in upstream, for some reason the error mask is et for then. if prev_iter_inside_quote != 0 { return Err(ErrorType::Syntax); @@ -497,9 +529,9 @@ impl<'de> Deserializer<'de> { return Err(ErrorType::Syntax); } - let err: i128 = mem::transmute(vtstq_u8(previous.has_error, previous.has_error)); + let has_error : i128 = mem::transmute(utf8_state.has_error); - if err != 0 { + if has_error != 0 { Ok(structural_indexes) } else { Err(ErrorType::InvalidUTF8) diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 10c6b7c9..c62707fb 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -1,6 +1,4 @@ - use crate::neon::intrinsics::*; -//use std::mem; /* * legal utf-8 byte sequence @@ -21,17 +19,17 @@ use crate::neon::intrinsics::*; // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_smaller_than_0xf4(current_bytes: uint8x16_t, has_error: &mut uint8x16_t) { +unsafe fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { // unsigned, saturates to 0 below max *has_error = - vorrq_u8( + vorrq_s8( *has_error, - vqsubq_u8(current_bytes, vdupq_n_u8(0xF4))); + 
vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn continuation_lengths(high_nibbles: uint8x16_t) -> uint8x16_t { - let nibbles : uint8x16_t = uint8x16_t::new( +unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + let nibbles: int8x16_t = int8x16_t::new( 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) 0, 0, 0, 0, // 10xx (continuation) 2, 2, // 110x @@ -39,35 +37,34 @@ unsafe fn continuation_lengths(high_nibbles: uint8x16_t) -> uint8x16_t { 4, // 1111, next should be 0 (not checked here) ); - vqtbl1q_u8(nibbles, high_nibbles) + vreinterpretq_s8_u8(vqtbl1q_u8(vreinterpretq_u8_s8(nibbles), vreinterpretq_u8_s8(high_nibbles))) } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn carry_continuations(initial_lengths: uint8x16_t, previous_carries: uint8x16_t) -> uint8x16_t { - let right1 : uint8x16_t = vqsubq_u8( - vextq_u8(previous_carries, initial_lengths, 16 - 1), - vdupq_n_u8(1)); - let sum : uint8x16_t = vaddq_u8(initial_lengths, right1); +unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + let right1: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), + vdupq_n_u8(1))); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); - let right2 : uint8x16_t = vqsubq_u8( - vextq_u8(previous_carries, sum, 16 - 2), - vdupq_n_u8(2)); + let right2: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( + vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), + vdupq_n_u8(2))); - vaddq_u8(sum, right2) + vaddq_s8(sum, right2) } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_continuations(initial_lengths: uint8x16_t, carries: uint8x16_t, has_error: &mut uint8x16_t) { +unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { // overlap || underlap // carry > length && length > 0 || 
!(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) - // FIXME: was vceqq_u8 ? - let overunder : uint8x16_t = vceqq_u8( - vcgtq_u8(carries, initial_lengths), - vcgtq_u8(initial_lengths, vdupq_n_u8(0))); + let overunder: uint8x16_t = vceqq_u8( + vreinterpretq_u8_s8(vcgtq_s8(carries, initial_lengths)), + vreinterpretq_u8_s8(vcgtq_s8(initial_lengths, vdupq_n_s8(0)))); - *has_error = vorrq_u8(*has_error, overunder); + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); } // when 0xED is found, next byte must be no larger than 0x9F @@ -75,21 +72,21 @@ unsafe fn check_continuations(initial_lengths: uint8x16_t, carries: uint8x16_t, // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn check_first_continuation_max( - current_bytes: uint8x16_t, - off1_current_bytes: uint8x16_t, - has_error: &mut uint8x16_t, + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + has_error: &mut int8x16_t, ) { - let mask_ed : uint8x16_t = vceqq_u8(off1_current_bytes, vdupq_n_u8(0xED)); - let mask_f4 : uint8x16_t = vceqq_u8(off1_current_bytes, vdupq_n_u8(0xF4)); + let mask_ed: uint8x16_t = vreinterpretq_u8_s8(vceqq_s8(off1_current_bytes, vdupq_n_s8(-19 /* 0xED */))); + let mask_f4: uint8x16_t = vreinterpretq_u8_s8(vceqq_s8(off1_current_bytes, vdupq_n_s8(-12 /* 0xF4 */))); // FIXME: was vandq_u8? 
- let badfollow_ed : uint8x16_t = - vandq_u8(vcgtq_u8(current_bytes, vdupq_n_u8(0x9F)), mask_ed); - let badfollow_f4 : uint8x16_t = - vandq_u8(vcgtq_u8(current_bytes, vdupq_n_u8(0x8F)), mask_f4); + let badfollow_ed: uint8x16_t = + vandq_u8(vreinterpretq_u8_s8(vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */))), mask_ed); + let badfollow_f4: uint8x16_t = + vandq_u8(vreinterpretq_u8_s8(vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */))), mask_f4); - *has_error = vorrq_u8( - *has_error, vorrq_u8(badfollow_ed, badfollow_f4)); + *has_error = vorrq_s8( + *has_error, vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4))); } // map off1_hibits => error condition @@ -100,73 +97,73 @@ unsafe fn check_first_continuation_max( // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn check_overlong( - current_bytes: uint8x16_t, - off1_current_bytes: uint8x16_t, - hibits: uint8x16_t, - previous_hibits: uint8x16_t, - has_error: &mut uint8x16_t, + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + hibits: int8x16_t, + previous_hibits: int8x16_t, + has_error: &mut int8x16_t, ) { - let _initial_mins : int8x16_t = int8x16_t::new( - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false + let _initial_mins: int8x16_t = int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false -62 /* 0xC2 */, -128, // 110x -31 /* 0xE1 */, // 1110 -15 /*0xF1 */, ); - let _second_mins : int8x16_t = int8x16_t::new( - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true + let _second_mins: int8x16_t = int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true -96 /* 0xA0 */, // 1110 -112 /* 0x90 */, ); - let off1_hibits : uint8x16_t = vextq_u8(previous_hibits, hibits, 16 - 1); - let initial_mins : int8x16_t = - vqtbl1q_s8(_initial_mins, 
off1_hibits); + let off1_hibits: int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); + let initial_mins: int8x16_t = + vqtbl1q_s8(_initial_mins, vreinterpretq_u8_s8(off1_hibits)); - let initial_under : uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(initial_mins, vreinterpretq_s8_u8(off1_current_bytes))); + let initial_under: uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(initial_mins, off1_current_bytes)); - let second_mins : int8x16_t = - vqtbl1q_s8(_second_mins, off1_hibits); + let second_mins: int8x16_t = + vqtbl1q_s8(_second_mins, vreinterpretq_u8_s8(off1_hibits)); - let second_under : uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(second_mins, vreinterpretq_s8_u8(current_bytes))); + let second_under: uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(second_mins, current_bytes)); - *has_error = vorrq_u8( - *has_error, vandq_u8(initial_under, second_under)); + *has_error = vorrq_s8( + *has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); } pub struct ProcessedUtfBytes { - rawbytes: uint8x16_t, - high_nibbles: uint8x16_t, - pub carried_continuations: uint8x16_t, + rawbytes: int8x16_t, + high_nibbles: int8x16_t, + pub carried_continuations: int8x16_t, } impl Default for ProcessedUtfBytes { #[cfg_attr(not(feature = "no-inline"), inline)] fn default() -> Self { ProcessedUtfBytes { - rawbytes: vdupq_n_u8(0x00), - high_nibbles: vdupq_n_u8(0x00), - carried_continuations: vdupq_n_u8(0x00), + rawbytes: vdupq_n_s8(0x00), + high_nibbles: vdupq_n_s8(0x00), + carried_continuations: vdupq_n_s8(0x00), } } } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn count_nibbles(bytes: uint8x16_t, answer: &mut ProcessedUtfBytes) { +unsafe fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = vshrq_n_u8(bytes, 4); + answer.high_nibbles = vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); } // check whether the current bytes are valid UTF-8 // at the end of the function, previous gets updated 
#[cfg_attr(not(feature = "no-inline"), inline)] pub fn check_utf8_bytes( - current_bytes: uint8x16_t, + current_bytes: int8x16_t, previous: &ProcessedUtfBytes, - has_error: &mut uint8x16_t, + has_error: &mut int8x16_t, ) -> ProcessedUtfBytes { let mut pb = ProcessedUtfBytes::default(); unsafe { @@ -174,15 +171,15 @@ pub fn check_utf8_bytes( check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: uint8x16_t = continuation_lengths(pb.high_nibbles); + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); pb.carried_continuations = carry_continuations(initial_lengths, previous.carried_continuations); check_continuations(initial_lengths, pb.carried_continuations, has_error); - let off1_current_bytes : uint8x16_t = - vextq_u8(previous.rawbytes, pb.rawbytes, 16 - 1); + let off1_current_bytes: int8x16_t = + vextq_s8(previous.rawbytes, pb.rawbytes, 16 - 1); check_first_continuation_max(current_bytes, off1_current_bytes, has_error); From 68691ad120f4943b802823b4c57fa4982bdbd7b8 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 00:29:54 -0400 Subject: [PATCH 11/34] feat: trying to fix parse_str --- src/neon/deser.rs | 63 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/src/neon/deser.rs b/src/neon/deser.rs index b194dde3..0945c847 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -3,20 +3,26 @@ pub use crate::error::{Error, ErrorType}; pub use crate::Deserializer; pub use crate::Result; -pub use crate::neon::stage1::SIMDJSON_PADDING; +pub use crate::neon::stage1::*; pub use crate::neon::intrinsics::*; pub use crate::neon::utf8check::*; pub use crate::stringparse::*; pub use crate::neon::intrinsics::*; -unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dst: &mut [u8]) -> ParseStringHelper { +unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dstx: Option<&mut [u8]>) -> ParseStringHelper { // this can read up to 31 bytes beyond the buffer size, but we 
require // SIMDJSON_PADDING of padding let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); - vst1q_u8(dst.as_mut_ptr(), v0); - vst1q_u8(dst.as_mut_ptr().add(16), v1); + + match dstx { + Some(dst) => { + vst1q_u8(dst.as_mut_ptr(), v0); + vst1q_u8(dst.as_mut_ptr().add(16), v1); + }, + _ => () + } let bs_mask : uint8x16_t = vmovq_n_u8('\\' as u8); let qt_mask : uint8x16_t = vmovq_n_u8('"' as u8); @@ -50,7 +56,7 @@ impl<'de> Deserializer<'de> { pub fn parse_str_(&mut self) -> Result<&'de str> { // Add 1 to skip the initial " let idx = self.iidx + 1; -// let padding = [0u8; 32]; +// let mut padding = [0u8; 32]; //let mut read: usize = 0; // we include the terminal '"' so we know where to end @@ -59,14 +65,55 @@ impl<'de> Deserializer<'de> { let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) }; let mut src_i: usize = 0; - let len = src_i; + let mut len = src_i; + loop { + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&src[src_i..], None) }; + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. 
+ //////////////////////// + + // we advance the point, accounting for the fact that we have a NULL termination + + len += quote_dist as usize; + unsafe { + let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // Move to the 'bad' character + let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); + len += bs_dist as usize; + src_i += bs_dist as usize; + break; + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + len += 32; + } + } + let mut dst_i: usize = 0; let dst: &mut [u8] = &mut self.strings; loop { // store to dest unconditionally - we can overwrite the bits we don't like // later - let ParseStringHelper {bs_bits, quote_bits} = unsafe { find_bs_bits_and_quote_bits(src, dst) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&src[src_i..], Some(&mut dst[dst_i..])) }; if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first.
Move dst to point to quotes and exit @@ -143,4 +190,4 @@ impl<'de> Deserializer<'de> { } } } -} +} \ No newline at end of file From be606903255d814c6ec2053ab1140be99d0d6b78 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 11:19:10 -0400 Subject: [PATCH 12/34] fix: numberparse --- src/numberparse.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/numberparse.rs b/src/numberparse.rs index 425d16ab..2362e6d0 100644 --- a/src/numberparse.rs +++ b/src/numberparse.rs @@ -129,14 +129,14 @@ pub enum Number { } #[cfg_attr(not(feature = "no-inline"), inline)] -#[cfg(target_arch = "aarch64")] +//#[cfg(target_arch = "aarch64")] fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { let val: u64 = unsafe { *(chars.as_ptr() as *const u64) }; // memcpy(&val, chars, sizeof(u64)); - let val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8; - let val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16; + let val = (val & 0x0F0F0F0F0F0F0F0F).wrapping_mul(2561) >> 8; + let val = (val & 0x00FF00FF00FF00FF).wrapping_mul(6553601) >> 16; - return ((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32) as u32; + return ((val & 0x0000FFFF0000FFFF).wrapping_mul(42949672960001) >> 32) as u32; } impl<'de> Deserializer<'de> { From e6609ff1877d2908389fdc78dbc6e8d29af42dff Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 11:22:06 -0400 Subject: [PATCH 13/34] fix: endian in flatten_bits --- src/neon/stage1.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index a1e15a10..2af15e6d 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -332,7 +332,7 @@ unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { let v3: i32 = static_cast_i32!(trailingzeroes(bits)); bits &= bits.wrapping_sub(1); - let v: int32x4_t = int32x4_t::new(v3, v2, v1, v0); + let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); let v: int32x4_t = vaddq_s32(idx_64_v, v); std::ptr::write(base.as_mut_ptr().add(l) as 
*mut int32x4_t, v); From 45cbbd497385fb0f5490122a85bacf7ee9aff3f5 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 12:11:25 -0400 Subject: [PATCH 14/34] fix: utf8 encoding (still broken but closer) --- src/neon/deser.rs | 29 ++++++++++++++++++++++++++--- src/neon/stage1.rs | 8 ++++---- src/neon/utf8check.rs | 2 +- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 0945c847..65f744a1 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -56,7 +56,7 @@ impl<'de> Deserializer<'de> { pub fn parse_str_(&mut self) -> Result<&'de str> { // Add 1 to skip the initial " let idx = self.iidx + 1; -// let mut padding = [0u8; 32]; + let mut padding = [0u8; 32]; //let mut read: usize = 0; // we include the terminal '"' so we know where to end @@ -69,7 +69,19 @@ impl<'de> Deserializer<'de> { loop { // store to dest unconditionally - we can overwrite the bits we don't like // later - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&src[src_i..], None) }; + + let srcx = if src.len() >= src_i + 32 { + &src[src_i..] + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + &padding + } + }; + + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx, None) }; if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit @@ -111,9 +123,20 @@ impl<'de> Deserializer<'de> { let dst: &mut [u8] = &mut self.strings; loop { + let srcx = if src.len() >= src_i + 32 { + &src[src_i..] 
+ } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + &padding + } + }; + // store to dest unconditionally - we can overwrite the bits we don't like // later - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&src[src_i..], Some(&mut dst[dst_i..])) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx, Some(dst)) }; if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 2af15e6d..7748ee37 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -109,13 +109,13 @@ unsafe fn check_utf8( } else { // it is not ascii so we have to do heavy work state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), - &(state.previous), &mut (state.has_error)); + &mut (state.previous), &mut (state.has_error)); state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), - &(state.previous), &mut (state.has_error)); + &mut (state.previous), &mut (state.has_error)); state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), - &(state.previous), &mut (state.has_error)); + &mut (state.previous), &mut (state.has_error)); state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), - &(state.previous), &mut (state.has_error)); + &mut (state.previous), &mut (state.has_error)); } } diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index c62707fb..8e260ae8 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -162,7 +162,7 @@ unsafe fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { #[cfg_attr(not(feature = "no-inline"), inline)] pub fn check_utf8_bytes( current_bytes: int8x16_t, - previous: &ProcessedUtfBytes, + previous: &mut ProcessedUtfBytes, has_error: &mut int8x16_t, ) -> ProcessedUtfBytes { let mut pb = ProcessedUtfBytes::default(); From 
15c892b11f3fb32e25167db0f4186e4ba2c464f6 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 14:37:11 -0400 Subject: [PATCH 15/34] fix: use write instead of intrinsic (for now) --- src/neon/deser.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 65f744a1..30f3cabe 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -9,6 +9,7 @@ pub use crate::neon::utf8check::*; pub use crate::stringparse::*; pub use crate::neon::intrinsics::*; +use std::io::Write; unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dstx: Option<&mut [u8]>) -> ParseStringHelper { // this can read up to 31 bytes beyond the buffer size, but we require @@ -17,9 +18,11 @@ unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dstx: Option<&mut [u8]>) -> Pa let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); match dstx { - Some(dst) => { - vst1q_u8(dst.as_mut_ptr(), v0); - vst1q_u8(dst.as_mut_ptr().add(16), v1); + Some(mut dst) => { +// vst1q_u8(dst.as_mut_ptr(), v0); +// vst1q_u8(dst.as_mut_ptr().add(16), v1); + dst.write(&src[0..16]).unwrap(); + dst.write(&src[16..32]).unwrap(); }, _ => () } From e3872b59e86a0c413137680d1936fa5be81ba734 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Tue, 13 Aug 2019 15:57:44 -0400 Subject: [PATCH 16/34] fix: fix string parsing (mostly, thanks to @licenser) --- src/lib.rs | 5 ++++- src/neon/deser.rs | 21 ++++++--------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c032bd0d..261672ec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -180,7 +180,10 @@ impl<'de> Deserializer<'de> { let counts = Deserializer::validate(input, &structural_indexes)?; - let strings = Vec::with_capacity(len + SIMDJSON_PADDING); + let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING); + unsafe { + strings.set_len(len + SIMDJSON_PADDING); + } Ok(Deserializer { counts, diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 30f3cabe..4f45d2b6 100644 
--- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -9,24 +9,13 @@ pub use crate::neon::utf8check::*; pub use crate::stringparse::*; pub use crate::neon::intrinsics::*; -use std::io::Write; -unsafe fn find_bs_bits_and_quote_bits(src: &[u8], dstx: Option<&mut [u8]>) -> ParseStringHelper { +unsafe fn find_bs_bits_and_quote_bits(src: &[u8]) -> ParseStringHelper { // this can read up to 31 bytes beyond the buffer size, but we require // SIMDJSON_PADDING of padding let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); - match dstx { - Some(mut dst) => { -// vst1q_u8(dst.as_mut_ptr(), v0); -// vst1q_u8(dst.as_mut_ptr().add(16), v1); - dst.write(&src[0..16]).unwrap(); - dst.write(&src[16..32]).unwrap(); - }, - _ => () - } - let bs_mask : uint8x16_t = vmovq_n_u8('\\' as u8); let qt_mask : uint8x16_t = vmovq_n_u8('"' as u8); @@ -84,7 +73,7 @@ impl<'de> Deserializer<'de> { } }; - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx, None) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx) }; if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit @@ -123,7 +112,7 @@ impl<'de> Deserializer<'de> { } let mut dst_i: usize = 0; - let dst: &mut [u8] = &mut self.strings; + let dst: &mut [u8] = self.strings.as_mut_slice(); loop { let srcx = if src.len() >= src_i + 32 { @@ -137,9 +126,11 @@ impl<'de> Deserializer<'de> { } }; + dst[dst_i..dst_i + 32].copy_from_slice(&srcx[..32]); + // store to dest unconditionally - we can overwrite the bits we don't like // later - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx, Some(dst)) }; + let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx) }; if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. 
Move dst to point to quotes and exit From 782b0a6d8ab5af7afab2fbc0b3c1b15c599fffe7 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Wed, 14 Aug 2019 12:29:58 -0400 Subject: [PATCH 17/34] fix: add_overflow should always output carry --- src/neon/intrinsics.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 2539ae33..8c5dac7d 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -263,11 +263,7 @@ impl int32x4_t { #[inline] pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { let (carry, did_carry) = a.overflowing_add(b); - - if did_carry { - *out = carry; - }; - + *out = carry; did_carry } From 98b335640f118f5f1fa0ce4d9e0b741e86357e5b Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 09:27:57 -0400 Subject: [PATCH 18/34] fix: tests PASS, improved comparison operators, thanks @Licenser :) --- src/neon/intrinsics.rs | 436 +++++++++++++++++++---------------------- src/neon/stage1.rs | 23 +-- src/neon/utf8check.rs | 31 +-- 3 files changed, 223 insertions(+), 267 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 8c5dac7d..1c86659c 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -3,7 +3,6 @@ use crate::neon::simd_llvm; use std::mem; -use std::hint::unreachable_unchecked; use core; #[allow(unused)] @@ -23,8 +22,6 @@ macro_rules! 
types { #[allow(non_camel_case_types)] pub type poly64_t = i64; -//#[allow(non_camel_case_types)] -//type poly128_t = [u8; 16]; #[allow(improper_ctypes)] extern "C" { @@ -40,116 +37,55 @@ extern "C" { fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16i8"] + fn vqsubq_s8_(a: int8x16_t, a: int8x16_t) -> int8x16_t; } -unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } -unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } -unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } - -unsafe fn vextq_u8_(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { - match n { - 0 => b, - 1 => uint8x16_t( - a.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 2 => uint8x16_t( - a.0, a.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 3 => uint8x16_t( - a.0, a.1, a.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 4 => uint8x16_t( - a.0, a.1, a.2, a.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 5 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 6 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 7 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 8 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 9 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 10 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, 
a.7, a.8, a.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 11 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, b.11, b.12, b.13, b.14, b.15, - ), - 12 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, b.12, b.13, b.14, b.15, - ), - 13 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, b.13, b.14, b.15, - ), - 14 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, b.14, b.15, - ), - 15 => uint8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, a.14, b.15, - ), - 16 => a, - _ => unreachable_unchecked(), - } +#[inline] +unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) } -unsafe fn vextq_s8_(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { - match n { - 0 => b, - 1 => int8x16_t( - a.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 2 => int8x16_t( - a.0, a.1, b.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 3 => int8x16_t( - a.0, a.1, a.2, b.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 4 => int8x16_t( - a.0, a.1, a.2, a.3, b.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 5 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, b.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 6 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, b.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 7 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, b.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 8 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, b.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 9 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, b.9, b.10, b.11, b.12, b.13, b.14, b.15, - ), - 10 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, b.10, b.11, b.12, b.13, 
b.14, b.15, - ), - 11 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, b.11, b.12, b.13, b.14, b.15, - ), - 12 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, b.12, b.13, b.14, b.15, - ), - 13 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, b.13, b.14, b.15, - ), - 14 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, b.14, b.15, - ), - 15 => int8x16_t( - a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7, a.8, a.9, a.10, a.11, a.12, a.13, a.14, b.15, - ), - 16 => a, - _ => unreachable_unchecked(), - } +#[inline] +unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +pub unsafe fn vnegq_u8(a: uint8x16_t) -> uint8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + +#[inline] +pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + + +#[inline] +fn rotate_(a: u128, b: u128, n: u128) -> u128 { + let az = a >> (n * 8); + let bz = b << (128 - (n * 8)); + az | bz +} + +#[inline] +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) } #[inline] @@ -243,18 +179,21 @@ types! 
{ } impl uint8x16_t { + #[inline] pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } } impl int8x16_t { + #[inline] pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) } } impl int32x4_t { + #[inline] pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { int32x4_t(a, b, c, d) } @@ -283,107 +222,149 @@ pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { } macro_rules! aarch64_simd_2 { - ($name:ident, $type:ty, $simd_fn:ident, $intr:ident) => { + ($name: ident, $type: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + aarch64_simd_2!($name, $type, $type, $simd_fn, $intrarm, $intraarch); + }; + ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { #[inline] - pub unsafe fn $name(a: $type, b: $type) -> $type { + pub unsafe fn $name(a: $type, b: $type) -> $res { simd_llvm::$simd_fn(a, b) } - }; + } } + macro_rules! 
aarch64_simd_ceq { - ($name:ident, $type:ty) => { + ($name: ident, $type: ty, $res: ty) => { /// Compare bitwise Equal (vector) - aarch64_simd_2!($name, $type, simd_eq, cmeq); + aarch64_simd_2!($name, $type, $res, simd_eq, cmeq, cmeq); }; } -aarch64_simd_ceq!(vceqq_u8, uint8x16_t); - -aarch64_simd_ceq!(vceq_s64, int64x1_t); -aarch64_simd_ceq!(vceqq_s64, int64x2_t); -aarch64_simd_ceq!(vceq_u64, uint64x1_t); -aarch64_simd_ceq!(vceqq_u64, uint64x2_t); -aarch64_simd_ceq!(vceq_p64, uint64x1_t); -aarch64_simd_ceq!(vceqq_p64, uint64x2_t); - -aarch64_simd_ceq!(vceqq_s8, int8x16_t); +aarch64_simd_ceq!(vceq_s8, int8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_s8, int8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_s16, int16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_s16, int16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_s32, int32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_s32, int32x4_t, uint32x4_t); +aarch64_simd_ceq!(vceq_u8, uint8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_u8, uint8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_u16, uint16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_u16, uint16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_u32, uint32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_u32, uint32x4_t, uint32x4_t); +aarch64_simd_2!(vceq_f32, float32x2_t, uint32x2_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_2!(vceqq_f32, float32x4_t, uint32x4_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_ceq!(vceq_p8, poly8x8_t, poly8x8_t); +aarch64_simd_ceq!(vceqq_p8, poly8x16_t, poly8x16_t); macro_rules! aarch64_simd_cgt { - ($name:ident, $type:ty) => { + ($name:ident, $type:ty, $res:ty) => { /// Compare signed Greater than (vector) - aarch64_simd_2!($name, $type, simd_gt, cmgt); - }; -} -macro_rules! aarch64_simd_cgtu { - ($name:ident, $type:ty) => { - /// Compare Greater than (vector) - aarch64_simd_2!($name, $type, simd_gt, cmhi); + aarch64_simd_2!($name, $type, $res, simd_gt, cmgt, cmgt); }; } -aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); +//macro_rules! 
aarch64_simd_cgtu { +// ($name: ident, $type: ty) => { +// /// Compare Greater than (vector) +// aarch64_simd_2!($name, $type, simd_gt, cmhi); +// }; +//} -aarch64_simd_cgt!(vcgt_s64, int64x1_t); -aarch64_simd_cgt!(vcgtq_s64, int64x2_t); -aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); -aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); +aarch64_simd_cgt!(vcgt_s8, int8x8_t, uint8x8_t); +aarch64_simd_cgt!(vcgtq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cgt!(vcgt_s16, int16x4_t, uint16x4_t); +aarch64_simd_cgt!(vcgtq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cgt!(vcgt_s32, int32x2_t, uint32x2_t); +aarch64_simd_cgt!(vcgtq_s32, int32x4_t, uint32x4_t); + +//aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); +//aarch64_simd_cgt!(vcgt_s64, int64x1_t); +//aarch64_simd_cgt!(vcgtq_s64, int64x2_t); +//aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); +//aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); macro_rules! aarch64_simd_clt { - ($name:ident, $type:ty) => { + ($name:ident, $type:ty, $res:ty) => { /// Compare signed Lesser than (vector) - aarch64_simd_2!($name, $type, simd_lt, cmgt); - }; -} -macro_rules! aarch64_simd_cltu { - ($name:ident, $type:ty) => { - /// Compare Lesser than (vector) - aarch64_simd_2!($name, $type, simd_lt, cmhi); + aarch64_simd_2!($name, $type, $res, simd_lt, cmgt, cmgt); }; } -aarch64_simd_clt!(vclt_s64, int64x1_t); -aarch64_simd_clt!(vcltq_s64, int64x2_t); -aarch64_simd_cltu!(vclt_u64, uint64x1_t); -aarch64_simd_cltu!(vcltq_u64, uint64x2_t); +//macro_rules! aarch64_simd_cltu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! 
( $ name, $ type, simd_lt, cmhi); +//}; +//} + +aarch64_simd_clt!(vclt_s8, int8x8_t, uint8x8_t); +aarch64_simd_clt!(vcltq_s8, int8x16_t, uint8x16_t); +aarch64_simd_clt!(vclt_s16, int16x4_t, uint16x4_t); +aarch64_simd_clt!(vcltq_s16, int16x8_t, uint16x8_t); +aarch64_simd_clt!(vclt_s32, int32x2_t, uint32x2_t); +aarch64_simd_clt!(vcltq_s32, int32x4_t, uint32x4_t); + +//arm_simd_cltu!(vclt_u8, uint8x8_t); +//arm_simd_cltu!(vcltq_u8, uint8x16_t); +//arm_simd_cltu!(vclt_u16, uint16x4_t); +//arm_simd_cltu!(vcltq_u16, uint16x8_t); +//arm_simd_cltu!(vclt_u32, uint32x2_t); +//arm_simd_cltu!(vcltq_u32, uint32x4_t); macro_rules! aarch64_simd_cge { - ($name:ident, $type:ty) => { - /// Compare signed Greater than (vector) - aarch64_simd_2!($name, $type, simd_ge, cmge); - }; -} -macro_rules! aarch64_simd_cgeu { - ($name:ident, $type:ty) => { - /// Compare Greater than (vector) - aarch64_simd_2!($name, $type, simd_ge, cmhs); + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_ge, cmge, cmge); }; } -aarch64_simd_cge!(vcge_s64, int64x1_t); -aarch64_simd_cge!(vcgeq_s64, int64x2_t); -aarch64_simd_cgeu!(vcge_u64, uint64x1_t); -aarch64_simd_cgeu!(vcgeq_u64, uint64x2_t); +//macro_rules! aarch64_simd_cgeu { +//( $ name: ident, $ type: ty) => { +///// Compare Greater than (vector) +//aarch64_simd_2 ! 
( $ name, $ type, simd_ge, cmhs); +//}; +//} + +aarch64_simd_cge!(vcge_s8, int8x8_t, uint8x8_t); +aarch64_simd_cge!(vcgeq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cge!(vcge_s16, int16x4_t, uint16x4_t); +aarch64_simd_cge!(vcgeq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cge!(vcge_s32, int32x2_t, uint32x2_t); +aarch64_simd_cge!(vcgeq_s32, int32x4_t, uint32x4_t); +//arm_simd_cgeu!(vcge_u8, uint8x8_t); +//arm_simd_cgeu!(vcgeq_u8, uint8x16_t); +//arm_simd_cgeu!(vcge_u16, uint16x4_t); +//arm_simd_cgeu!(vcgeq_u16, uint16x8_t); +//arm_simd_cgeu!(vcge_u32, uint32x2_t); +//arm_simd_cgeu!(vcgeq_u32, uint32x4_t); macro_rules! aarch64_simd_cle { - ($name:ident, $type:ty) => { - /// Compare signed Lesser than (vector) - aarch64_simd_2!($name, $type, simd_le, cmge); - }; -} -macro_rules! aarch64_simd_cleu { - ($name:ident, $type:ty) => { - /// Compare Lesser than (vector) - aarch64_simd_2!($name, $type, simd_le, cmhs); + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_le, cmge, cmge); }; } -aarch64_simd_cleu!(vcleq_u8, uint8x16_t); -aarch64_simd_cle!(vcle_s64, int64x1_t); -aarch64_simd_cle!(vcleq_s64, int64x2_t); -aarch64_simd_cleu!(vcle_u64, uint64x1_t); -aarch64_simd_cleu!(vcleq_u64, uint64x2_t); - -aarch64_simd_cleu!(vcgtq_s8, int8x16_t); +//macro_rules! aarch64_simd_cleu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! 
( $ name, $ type, simd_le, cmhs); +//}; +//} + +aarch64_simd_cle!(vcle_s8, int8x8_t, uint8x8_t); +aarch64_simd_cle!(vcleq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cle!(vcle_s16, int16x4_t, uint16x4_t); +aarch64_simd_cle!(vcleq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cle!(vcle_s32, int32x2_t, uint32x2_t); +aarch64_simd_cle!(vcleq_s32, int32x4_t, uint32x4_t); +//arm_simd_cleu!(vcle_u8, uint8x8_t); +aarch64_simd_cle!(vcleq_u8, uint8x16_t, uint8x16_t); +//arm_simd_cleu!(vcle_u16, uint16x4_t); +//arm_simd_cleu!(vcleq_u16, uint16x8_t); +//arm_simd_cleu!(vcle_u32, uint32x2_t); +//arm_simd_cleu!(vcleq_u32, uint32x4_t); #[inline] pub fn vdupq_n_s8(a: i8) -> int8x16_t { @@ -428,6 +409,30 @@ pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[inline] pub unsafe fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } +#[inline] +fn and16(a: i16, b: i16) -> i16 { + if (a & b) != 0 { + 0 + } else { + -1 /* 0xFFFF */ + } +} + +#[inline] +pub unsafe fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +// simd_llvm::simd_and(a, b) + int16x8_t( + and16(a.0, b.0), + and16(a.1, b.1), + and16(a.2, b.2), + and16(a.3, b.3), + and16(a.4, b.4), + and16(a.5, b.5), + and16(a.6, b.6), + and16(a.7, b.7), + ) +} + #[inline] pub unsafe fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } @@ -438,9 +443,9 @@ pub unsafe fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::sim pub unsafe fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } macro_rules! arm_reinterpret { - ($name:ident, $from:ty, $to:ty) => { + ($name: ident, $from: ty, $to: ty) => { // Vector reinterpret cast operation - #[inline] + # [inline] pub unsafe fn $name(a: $from) -> $to { mem::transmute(a) } @@ -448,18 +453,22 @@ macro_rules! 
arm_reinterpret { } arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); +arm_reinterpret!(vreinterpretq_u64_u32, uint32x4_t, uint64x2_t); arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u64_s8, int8x16_t, uint64x2_t); arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); +arm_reinterpret!(vreinterpretq_s16_s8, int8x16_t, int16x8_t); +arm_reinterpret!(vreinterpretq_s32_s8, int8x16_t, int32x4_t); arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); macro_rules! arm_vget_lane { - ($name:ident, $to:ty, $from:ty, $lanes:literal) => { + ($name: ident, $to: ty, $from: ty, $lanes: literal) => { #[inline] - pub unsafe fn $name(v: $from, lane: u32) -> $to { + pub unsafe fn $ name(v: $from, lane: u32) -> $ to { simd_llvm::simd_extract(v, lane) } }; @@ -475,14 +484,6 @@ arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); -pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { - mem::transmute(vextq_u8_(mem::transmute(a), mem::transmute(b), n)) -} - -pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { - mem::transmute(vextq_s8_(mem::transmute(a), mem::transmute(b), n)) -} - #[inline] pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { vqmovn_u64_(a) @@ -504,7 +505,12 @@ pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { } #[inline] -fn test_u8(a:u8,b:u8) -> u8 { +pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vqsubq_s8_(a, b) +} + +#[inline] +fn test_u8(a: u8, b: u8) -> u8 { if a & b != 0 { 0xFF } else { @@ -530,12 +536,12 @@ pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { test_u8(a.12, b.12), test_u8(a.13, 
b.13), test_u8(a.14, b.14), - test_u8(a.15, b.15) + test_u8(a.15, b.15), ) } #[inline] -fn test_s8(a:i8,b:i8) -> i8 { +fn test_s8(a: i8, b: i8) -> i8 { if a & b != 0 { -1 } else { @@ -561,7 +567,7 @@ pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { test_s8(a.12, b.12), test_s8(a.13, b.13), test_s8(a.14, b.14), - test_s8(a.15, b.15) + test_s8(a.15, b.15), ) } @@ -579,45 +585,3 @@ pub fn trailingzeroes(a: u64) -> u32 { pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { std::ptr::write(addr as *mut uint32x4_t, val) } - - -#[allow(unused)] -macro_rules! constify_imm5 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1_1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - 15 => $expand!(15), - 16 => $expand!(16), - 17 => $expand!(17), - 18 => $expand!(18), - 19 => $expand!(19), - 20 => $expand!(20), - 21 => $expand!(21), - 22 => $expand!(22), - 23 => $expand!(23), - 24 => $expand!(24), - 25 => $expand!(25), - 26 => $expand!(26), - 27 => $expand!(27), - 28 => $expand!(28), - 29 => $expand!(29), - 30 => $expand!(30), - _ => $expand!(31), - } - }; -} \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 7748ee37..4ff84aff 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -104,8 +104,8 @@ unsafe fn check_utf8( let verror: int8x16_t = int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); state.has_error = - vorrq_s8(vcgtq_s8(state.previous.carried_continuations, verror), - state.has_error); + vreinterpretq_s8_u8(vorrq_u8(vcgtq_s8(state.previous.carried_continuations, verror), + vreinterpretq_u8_s8(state.has_error))); } else { // it is not ascii so we have to do heavy work state.previous = 
check_utf8_bytes(vreinterpretq_s8_u8(input.v0), @@ -213,7 +213,7 @@ unsafe fn find_quote_mask_and_bits( odd_ends: u64, prev_iter_inside_quote: &mut u64, quote_bits: &mut u64, - error_mask: &mut u64 + error_mask: &mut u64, ) -> u64 { *quote_bits = cmp_mask_against_input(input, b'"'); *quote_bits &= !odd_ends; @@ -286,16 +286,6 @@ unsafe fn find_whitespace_and_structurals( *whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); } -//template <> -//really_inline ErrorValues check_utf8_errors( -// utf8_checking_state &state) { -// uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error); -// uint32x2_t v32 = vqmovn_u64(v64); -// uint64x1_t result = vreinterpret_u64_u32(v32); -// return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR -// : simdjson::SUCCESS; -//} - // flatten out values in 'bits' assuming that they are are to have values of idx // plus their position in the bitvector, and store these indexes at // base_ptr[base] incrementing base as we go @@ -391,7 +381,7 @@ impl<'de> Deserializer<'de> { let mut structural_indexes = Vec::with_capacity(len / 6); structural_indexes.push(0); // push extra root element - let mut utf8_state : Utf8CheckingState = Utf8CheckingState::default(); + let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default(); // we have padded the input out to 64 byte multiple with the remainder being // zeros @@ -529,9 +519,10 @@ impl<'de> Deserializer<'de> { return Err(ErrorType::Syntax); } - let has_error : i128 = mem::transmute(utf8_state.has_error); + let utf8_error_bits: u128 = mem::transmute(vandq_s16(mem::transmute(utf8_state.has_error), mem::transmute(utf8_state.has_error))); + let utf8_error: u16 = utf8_error_bits as u16; - if has_error != 0 { + if utf8_error != 0 { Ok(structural_indexes) } else { Err(ErrorType::InvalidUTF8) diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 8e260ae8..5e2fa56f 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -24,7 +24,7 @@ unsafe fn 
check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8 *has_error = vorrq_s8( *has_error, - vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(current_bytes), vdupq_n_u8(0xF4)))); + vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */))); } #[cfg_attr(not(feature = "no-inline"), inline)] @@ -42,14 +42,15 @@ unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { - let right1: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(vextq_s8(previous_carries, initial_lengths, 16 - 1)), - vdupq_n_u8(1))); + let right1: int8x16_t = vqsubq_s8( + vextq_s8(previous_carries, initial_lengths, 16 - 1), + vdupq_n_s8(1)); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); - let right2: int8x16_t = vreinterpretq_s8_u8(vqsubq_u8( - vreinterpretq_u8_s8(vextq_s8(previous_carries, sum, 16 - 2)), - vdupq_n_u8(2))); + let right2: int8x16_t = vqsubq_s8( + vextq_s8(previous_carries, sum, 16 - 2), + vdupq_n_s8(2)); vaddq_s8(sum, right2) } @@ -61,8 +62,8 @@ unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, ha // (carries > length) == (lengths > 0) let overunder: uint8x16_t = vceqq_u8( - vreinterpretq_u8_s8(vcgtq_s8(carries, initial_lengths)), - vreinterpretq_u8_s8(vcgtq_s8(initial_lengths, vdupq_n_s8(0)))); + vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0))); *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); } @@ -76,14 +77,14 @@ unsafe fn check_first_continuation_max( off1_current_bytes: int8x16_t, has_error: &mut int8x16_t, ) { - let mask_ed: uint8x16_t = vreinterpretq_u8_s8(vceqq_s8(off1_current_bytes, vdupq_n_s8(-19 /* 0xED */))); - let mask_f4: uint8x16_t = vreinterpretq_u8_s8(vceqq_s8(off1_current_bytes, vdupq_n_s8(-12 /* 0xF4 */))); + let mask_ed: uint8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(-19 /* 0xED */)); 
+ let mask_f4: uint8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(-12 /* 0xF4 */)); // FIXME: was vandq_u8? let badfollow_ed: uint8x16_t = - vandq_u8(vreinterpretq_u8_s8(vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */))), mask_ed); + vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), mask_ed); let badfollow_f4: uint8x16_t = - vandq_u8(vreinterpretq_u8_s8(vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */))), mask_f4); + vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), mask_f4); *has_error = vorrq_s8( *has_error, vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4))); @@ -123,12 +124,12 @@ unsafe fn check_overlong( let initial_mins: int8x16_t = vqtbl1q_s8(_initial_mins, vreinterpretq_u8_s8(off1_hibits)); - let initial_under: uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(initial_mins, off1_current_bytes)); + let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); let second_mins: int8x16_t = vqtbl1q_s8(_second_mins, vreinterpretq_u8_s8(off1_hibits)); - let second_under: uint8x16_t = vreinterpretq_u8_s8(vcgtq_s8(second_mins, current_bytes)); + let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); *has_error = vorrq_s8( *has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); From 19ad13e5533fdefc29af185cce7713736f43492b Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 09:51:54 -0400 Subject: [PATCH 19/34] fix: update arm64 to use nightly --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 9b8e8bb2..8ceef40b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -58,5 +58,5 @@ steps: - name: test image: rust:1 commands: - - cargo build --verbose --all - - cargo test --verbose --all + - cargo +nightly build --verbose --all + - cargo +nightly test --verbose --all From a97487b40c61673236e61dd430ce99ce51ba68c9 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 10:54:58 -0400 Subject: [PATCH 20/34] fix: use 
rust nightly image for drone CI --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 8ceef40b..370db257 100644 --- a/.drone.yml +++ b/.drone.yml @@ -56,7 +56,7 @@ platform: steps: - name: test - image: rust:1 + image: rustlang/rust:nightly commands: - cargo +nightly build --verbose --all - cargo +nightly test --verbose --all From faea015d0bea8dd9602e658c6e355c6851266512 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 11:55:35 -0400 Subject: [PATCH 21/34] fix: drone CI rustup nightly --- .drone.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 370db257..8f00d391 100644 --- a/.drone.yml +++ b/.drone.yml @@ -56,7 +56,9 @@ platform: steps: - name: test - image: rustlang/rust:nightly + image: rust:1 commands: + - rustup default nightly + - rustup update - cargo +nightly build --verbose --all - cargo +nightly test --verbose --all From d66d68a473d3c540d2e3c42ce3e79c9219393c0b Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 14:23:26 -0400 Subject: [PATCH 22/34] feat: fix guards, use rust stdlib for bit count operations --- src/lib.rs | 17 +++++++++-------- src/neon/deser.rs | 8 ++++---- src/neon/intrinsics.rs | 14 -------------- src/neon/stage1.rs | 10 +++++----- src/stage2.rs | 2 +- src/value/generator.rs | 4 ---- 6 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 261672ec..9a7931d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,26 +99,27 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(target_feature = "sse42")] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] mod sse42; -#[cfg(target_feature = "sse42")] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] pub use crate::sse42::deser::*; -#[cfg(target_feature = "sse42")] 
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; -//#[cfg(target_feature = "neon")] +#[cfg(target_feature = "neon")] mod neon; -//#[cfg(target_feature = "neon")] +#[cfg(target_feature = "neon")] pub use crate::neon::deser::*; -//#[cfg(target_feature = "neon")] +#[cfg(target_feature = "neon")] use crate::neon::stage1::SIMDJSON_PADDING; mod stage2; pub mod value; use crate::numberparse::Number; -//use std::mem; -//use std::str; +#[cfg(not(target_feature = "neon"))] +use std::mem; +use std::str; pub use crate::error::{Error, ErrorType}; pub use crate::value::*; diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 4f45d2b6..bc093865 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -78,7 +78,7 @@ impl<'de> Deserializer<'de> { if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... - let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; + let quote_dist: u32 = quote_bits.trailing_zeros(); /////////////////////// // Above, check for overflow in case someone has a crazy string (>=4GB?) @@ -99,7 +99,7 @@ impl<'de> Deserializer<'de> { } if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { // Move to the 'bad' character - let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); + let bs_dist: u32 = bs_bits.trailing_zeros(); len += bs_dist as usize; src_i += bs_dist as usize; break; @@ -135,7 +135,7 @@ impl<'de> Deserializer<'de> { if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit // find out where the quote is... - let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32; + let quote_dist: u32 = quote_bits.trailing_zeros(); /////////////////////// // Above, check for overflow in case someone has a crazy string (>=4GB?) 
@@ -161,7 +161,7 @@ impl<'de> Deserializer<'de> { } if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { // find out where the backspace is - let bs_dist: u32 = trailingzeroes(u64::from(bs_bits)); + let bs_dist: u32 = bs_bits.trailing_zeros(); let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; // we encountered backslash first. Handle backslash if escape_char == b'u' { diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 1c86659c..c8edefe8 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -29,10 +29,6 @@ extern "C" { fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; #[link_name = "llvm.aarch64.neon.pmull64"] fn vmull_p64_(a: i64, b: i64) -> int8x16_t; - #[link_name = "llvm.ctpop.i64"] - fn ctpop_s64_(a: i64) -> i64; - #[link_name = "llvm.cttz.i64"] - fn cttz_u64_(a: i64) -> i64; #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] @@ -571,16 +567,6 @@ pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { ) } -#[inline] -pub unsafe fn hamming(a: u64) -> u32 { - ctpop_s64_(a as i64) as u32 -} - -#[inline] -pub fn trailingzeroes(a: u64) -> u32 { - unsafe { cttz_u64_(a as i64) as u32 } -} - #[inline] pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { std::ptr::write(addr as *mut uint32x4_t, val) diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 4ff84aff..6fadcc50 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -294,7 +294,7 @@ unsafe fn find_whitespace_and_structurals( //TODO: usize was u32 here does this matter? 
#[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { - let cnt: usize = hamming(bits) as usize; + let cnt: usize = bits.count_ones() as usize; let mut l = base.len(); let idx_minus_64 = idx.wrapping_sub(64); let idx_64_v = int32x4_t::new( @@ -313,13 +313,13 @@ unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { base.set_len(l + cnt); while bits != 0 { - let v0: i32 = static_cast_i32!(trailingzeroes(bits)); + let v0: i32 = bits.trailing_zeros() as i32; bits &= bits.wrapping_sub(1); - let v1: i32 = static_cast_i32!(trailingzeroes(bits)); + let v1: i32 = bits.trailing_zeros() as i32; bits &= bits.wrapping_sub(1); - let v2: i32 = static_cast_i32!(trailingzeroes(bits)); + let v2: i32 = bits.trailing_zeros() as i32; bits &= bits.wrapping_sub(1); - let v3: i32 = static_cast_i32!(trailingzeroes(bits)); + let v3: i32 = bits.trailing_zeros() as i32; bits &= bits.wrapping_sub(1); let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); diff --git a/src/stage2.rs b/src/stage2.rs index f33a3877..3bbd71c1 100644 --- a/src/stage2.rs +++ b/src/stage2.rs @@ -2,7 +2,7 @@ use crate::charutils::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(target_feature = "sse4.2")] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; #[cfg(target_feature = "neon")] use crate::neon::stage1::SIMDJSON_PADDING; diff --git a/src/value/generator.rs b/src/value/generator.rs index 1e0e4baf..e17b2d13 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -5,10 +5,6 @@ // https://github.com/maciejhirsz/json-rust/blob/master/src/codegen.rs use crate::value::ValueTrait; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; use std::io; use std::io::Write; use std::marker::PhantomData; From 4c51dc4311196e08ae179eb63ac33b574e42cd34 Mon Sep 17 
00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 14:53:14 -0400 Subject: [PATCH 23/34] feat: address review comments, platform handling and misc --- src/lib.rs | 1 + src/neon/deser.rs | 41 +++++------------------------------------ src/neon/stage1.rs | 42 ++++++++++++++++++++++++++++++++++++++---- src/neon/utf8check.rs | 8 ++++---- src/numberparse.rs | 31 ++++++++++++++++++++++++++++++- 5 files changed, 78 insertions(+), 45 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9a7931d1..690a5b0c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -181,6 +181,7 @@ impl<'de> Deserializer<'de> { let counts = Deserializer::validate(input, &structural_indexes)?; + // Set length to allow slice access in ARM code let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING); unsafe { strings.set_len(len + SIMDJSON_PADDING); diff --git a/src/neon/deser.rs b/src/neon/deser.rs index bc093865..26de719b 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -10,39 +10,6 @@ pub use crate::stringparse::*; pub use crate::neon::intrinsics::*; -unsafe fn find_bs_bits_and_quote_bits(src: &[u8]) -> ParseStringHelper { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); - let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); - - let bs_mask : uint8x16_t = vmovq_n_u8('\\' as u8); - let qt_mask : uint8x16_t = vmovq_n_u8('"' as u8); - - let bit_mask = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); - - let cmp_bs_0 : uint8x16_t = vceqq_u8(v0, bs_mask); - let cmp_bs_1 : uint8x16_t = vceqq_u8(v1, bs_mask); - let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, qt_mask); - let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, qt_mask); - - let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); - let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); - let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); - let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); - - let sum0 : uint8x16_t = 
vpaddq_u8(cmp_bs_0, cmp_bs_1); - let sum1 : uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1); - let sum0 = vpaddq_u8(sum0, sum1); - let sum0 = vpaddq_u8(sum0, sum0); - - ParseStringHelper { - bs_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits - quote_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits - } -} - impl<'de> Deserializer<'de> { #[cfg_attr(not(feature = "no-inline"), inline(always))] pub fn parse_str_(&mut self) -> Result<&'de str> { @@ -63,7 +30,7 @@ impl<'de> Deserializer<'de> { // later let srcx = if src.len() >= src_i + 32 { - &src[src_i..] + unsafe { src.get_unchecked(src_i..) } } else { unsafe { padding @@ -116,7 +83,7 @@ impl<'de> Deserializer<'de> { loop { let srcx = if src.len() >= src_i + 32 { - &src[src_i..] + unsafe { src.get_unchecked(src_i..) } } else { unsafe { padding @@ -126,7 +93,9 @@ impl<'de> Deserializer<'de> { } }; - dst[dst_i..dst_i + 32].copy_from_slice(&srcx[..32]); + unsafe { + dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(srcx.get_unchecked(..32)); + } // store to dest unconditionally - we can overwrite the bits we don't like // later diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 6fadcc50..3f005ad5 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -27,10 +27,12 @@ fn fill_input(ptr: &[u8]) -> SimdInput { } } +const BIT_MASK: [u8; 16] = [0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]; + #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn neon_movemask(input: uint8x16_t) -> u16 { - let bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); let minput: uint8x16_t = vandq_u8(input, bit_mask); let tmp: uint8x16_t = vpaddq_u8(minput, minput); @@ -42,8 +44,8 @@ unsafe fn neon_movemask(input: uint8x16_t) -> u16 { #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn 
neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { - let bit_mask: uint8x16_t = uint8x16_t::new(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); + let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); + let t0 = vandq_u8(p0, bit_mask); let t1 = vandq_u8(p1, bit_mask); let t2 = vandq_u8(p2, bit_mask); @@ -372,6 +374,38 @@ fn finalize_structurals( structurals } +pub unsafe fn find_bs_bits_and_quote_bits(src: &[u8]) -> ParseStringHelper { + // this can read up to 31 bytes beyond the buffer size, but we require + // SIMDJSON_PADDING of padding + let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); + let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); + + let bs_mask : uint8x16_t = vmovq_n_u8(b'\\'); + let qt_mask : uint8x16_t = vmovq_n_u8(b'"'); + + let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); + + let cmp_bs_0 : uint8x16_t = vceqq_u8(v0, bs_mask); + let cmp_bs_1 : uint8x16_t = vceqq_u8(v1, bs_mask); + let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, qt_mask); + let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, qt_mask); + + let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); + let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); + let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask); + let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask); + + let sum0 : uint8x16_t = vpaddq_u8(cmp_bs_0, cmp_bs_1); + let sum1 : uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + ParseStringHelper { + bs_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits + quote_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + } +} + impl<'de> Deserializer<'de> { //#[inline(never)] pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result, ErrorType> { diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 5e2fa56f..87505f94 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -104,7 +104,7 @@ unsafe fn check_overlong( previous_hibits: int8x16_t, 
has_error: &mut int8x16_t, ) { - let _initial_mins: int8x16_t = int8x16_t::new( + let initial_mins_tbl: int8x16_t = int8x16_t::new( -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false -62 /* 0xC2 */, -128, // 110x @@ -112,7 +112,7 @@ unsafe fn check_overlong( -15 /*0xF1 */, ); - let _second_mins: int8x16_t = int8x16_t::new( + let second_mins_tbl: int8x16_t = int8x16_t::new( -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, // 10xx => false 127, 127, // 110x => true @@ -122,12 +122,12 @@ unsafe fn check_overlong( let off1_hibits: int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); let initial_mins: int8x16_t = - vqtbl1q_s8(_initial_mins, vreinterpretq_u8_s8(off1_hibits)); + vqtbl1q_s8(initial_mins_tbl, vreinterpretq_u8_s8(off1_hibits)); let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); let second_mins: int8x16_t = - vqtbl1q_s8(_second_mins, vreinterpretq_u8_s8(off1_hibits)); + vqtbl1q_s8(second_mins_tbl, vreinterpretq_u8_s8(off1_hibits)); let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); diff --git a/src/numberparse.rs b/src/numberparse.rs index 2362e6d0..7674a977 100644 --- a/src/numberparse.rs +++ b/src/numberparse.rs @@ -2,6 +2,11 @@ use crate::charutils::*; use crate::unlikely; use crate::*; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + const POWER_OF_TEN: [f64; 617] = [ 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300, 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291, 1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, @@ -129,7 +134,31 @@ pub enum Number { } #[cfg_attr(not(feature = "no-inline"), inline)] -//#[cfg(target_arch = "aarch64")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { + unsafe { + // this actually computes *16* values so we are being wasteful. 
+ let ascii0: __m128i = _mm_set1_epi8(b'0' as i8); + let mul_1_10: __m128i = + _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1); + let mul_1_100: __m128i = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1); + let mul_1_10000: __m128i = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1); + // We know what we're doing right? :P + #[allow(clippy::cast_ptr_alignment)] + let input: __m128i = _mm_sub_epi8( + _mm_loadu_si128(chars.get_unchecked(0..16).as_ptr() as *const __m128i), + ascii0, + ); + let t1: __m128i = _mm_maddubs_epi16(input, mul_1_10); + let t2: __m128i = _mm_madd_epi16(t1, mul_1_100); + let t3: __m128i = _mm_packus_epi32(t2, t2); + let t4: __m128i = _mm_madd_epi16(t3, mul_1_10000); + _mm_cvtsi128_si32(t4) as u32 // only captures the sum of the first 8 digits, drop the rest + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +#[cfg(target_feature = "neon")] fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 { let val: u64 = unsafe { *(chars.as_ptr() as *const u64) }; // memcpy(&val, chars, sizeof(u64)); From bdc82d68d3625af5fe09f385c392e8be04ec2a51 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 14:58:21 -0400 Subject: [PATCH 24/34] feat: refactor immediate tables into loads --- src/neon/utf8check.rs | 48 +++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index 87505f94..b78fe7d1 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -27,15 +27,17 @@ unsafe fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8 vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */))); } +const NIBBLES_TBL: [i8; 16] = [ + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) +]; + #[cfg_attr(not(feature = "no-inline"), inline)] unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { - 
let nibbles: int8x16_t = int8x16_t::new( - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) - ); + let nibbles: int8x16_t = vld1q_s8(NIBBLES_TBL.as_ptr()); vreinterpretq_s8_u8(vqtbl1q_u8(vreinterpretq_u8_s8(nibbles), vreinterpretq_u8_s8(high_nibbles))) } @@ -90,6 +92,21 @@ unsafe fn check_first_continuation_max( *has_error, vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4))); } +const INITIAL_MINS_TBL: [i8; 16] = [ + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */]; + +const SECOND_MINS_TBL: [i8; 16] = [ + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, +]; + // map off1_hibits => error condition // hibits off1 cur // C => < C2 && true @@ -104,21 +121,8 @@ unsafe fn check_overlong( previous_hibits: int8x16_t, has_error: &mut int8x16_t, ) { - let initial_mins_tbl: int8x16_t = int8x16_t::new( - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - -62 /* 0xC2 */, -128, // 110x - -31 /* 0xE1 */, // 1110 - -15 /*0xF1 */, - ); - - let second_mins_tbl: int8x16_t = int8x16_t::new( - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - -96 /* 0xA0 */, // 1110 - -112 /* 0x90 */, - ); + let initial_mins_tbl: int8x16_t = vld1q_s8(INITIAL_MINS_TBL.as_ptr()); + let second_mins_tbl: int8x16_t = vld1q_s8(SECOND_MINS_TBL.as_ptr()); let off1_hibits: int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); let initial_mins: int8x16_t = From 4fb672a91375244d1299a70d549a804a298cbd2a Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 20:23:05 -0400 Subject: [PATCH 25/34] feat: improving code style and similarity across 
architectures --- src/neon/deser.rs | 54 ++++--- src/neon/intrinsics.rs | 68 ++++----- src/neon/stage1.rs | 316 +++++++++++++++++++++++------------------ src/neon/utf8check.rs | 238 +++++++++++++++++++------------ 4 files changed, 386 insertions(+), 290 deletions(-) diff --git a/src/neon/deser.rs b/src/neon/deser.rs index 26de719b..5d70b7af 100644 --- a/src/neon/deser.rs +++ b/src/neon/deser.rs @@ -1,14 +1,11 @@ -//use std::mem; pub use crate::error::{Error, ErrorType}; pub use crate::Deserializer; pub use crate::Result; pub use crate::neon::stage1::*; -pub use crate::neon::intrinsics::*; pub use crate::neon::utf8check::*; -pub use crate::stringparse::*; - pub use crate::neon::intrinsics::*; +pub use crate::stringparse::*; impl<'de> Deserializer<'de> { #[cfg_attr(not(feature = "no-inline"), inline(always))] @@ -29,18 +26,29 @@ impl<'de> Deserializer<'de> { // store to dest unconditionally - we can overwrite the bits we don't like // later - let srcx = if src.len() >= src_i + 32 { - unsafe { src.get_unchecked(src_i..) } + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } } else { unsafe { padding .get_unchecked_mut(..src.len() - src_i) .clone_from_slice(src.get_unchecked(src_i..)); - &padding + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) } }; - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx) }; + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. 
Move dst to point to quotes and exit @@ -82,24 +90,35 @@ impl<'de> Deserializer<'de> { let dst: &mut [u8] = self.strings.as_mut_slice(); loop { - let srcx = if src.len() >= src_i + 32 { - unsafe { src.get_unchecked(src_i..) } + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } } else { unsafe { padding .get_unchecked_mut(..src.len() - src_i) .clone_from_slice(src.get_unchecked(src_i..)); - &padding + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) } }; unsafe { - dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(srcx.get_unchecked(..32)); + dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(src.get_unchecked(src_i..src_i + 32)); } // store to dest unconditionally - we can overwrite the bits we don't like // later - let ParseStringHelper { bs_bits, quote_bits } = unsafe { find_bs_bits_and_quote_bits(&srcx) }; + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { // we encountered quotes first. Move dst to point to quotes and exit @@ -138,10 +157,11 @@ impl<'de> Deserializer<'de> { // within the unicode codepoint handling code. src_i += bs_dist as usize; dst_i += bs_dist as usize; - let (o, s) = if let Ok(r) = - handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe { - dst.get_unchecked_mut(dst_i..) - }) { + let (o, s) = if let Ok(r) = handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..) 
} + ) + { r } else { return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index c8edefe8..2ecfda86 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -90,8 +90,8 @@ pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { } #[inline] -pub unsafe fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - vpaddq_u8_(a, b) +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { vpaddq_u8_(a, b) } } #[inline] @@ -195,12 +195,12 @@ impl int32x4_t { } } -#[inline] -pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { - let (carry, did_carry) = a.overflowing_add(b); - *out = carry; - did_carry -} +//#[inline] +//pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { +// let (carry, did_carry) = a.overflowing_add(b); +// *out = carry; +// did_carry +//} #[inline] pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { @@ -223,8 +223,8 @@ macro_rules! aarch64_simd_2 { }; ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { #[inline] - pub unsafe fn $name(a: $type, b: $type) -> $res { - simd_llvm::$simd_fn(a, b) + pub fn $name(a: $type, b: $type) -> $res { + unsafe { simd_llvm::$simd_fn(a, b) } } } } @@ -382,6 +382,11 @@ pub fn vmovq_n_u8(a: u8) -> uint8x16_t { uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } +#[inline] +pub fn vmovq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + #[inline] pub fn zerou8x16() -> uint8x16_t { uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) @@ -403,47 +408,22 @@ pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { } #[inline] -pub unsafe fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_and(a, b) } - +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } #[inline] -fn and16(a: i16, 
b: i16) -> i16 { - if (a & b) != 0 { - 0 - } else { - -1 /* 0xFFFF */ - } -} - +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } #[inline] -pub unsafe fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { -// simd_llvm::simd_and(a, b) - int16x8_t( - and16(a.0, b.0), - and16(a.1, b.1), - and16(a.2, b.2), - and16(a.3, b.3), - and16(a.4, b.4), - and16(a.5, b.5), - and16(a.6, b.6), - and16(a.7, b.7), - ) -} - +pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } #[inline] -pub unsafe fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { simd_llvm::simd_or(a, b) } - -#[inline] -pub unsafe fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_and(a, b) } - +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } #[inline] -pub unsafe fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { simd_llvm::simd_or(a, b) } +pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } macro_rules! arm_reinterpret { ($name: ident, $from: ty, $to: ty) => { // Vector reinterpret cast operation - # [inline] - pub unsafe fn $name(a: $from) -> $to { - mem::transmute(a) + #[inline] + pub fn $name(a: $from) -> $to { + unsafe { mem::transmute(a) } } }; } @@ -464,7 +444,7 @@ arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); macro_rules! 
arm_vget_lane { ($name: ident, $to: ty, $from: ty, $lanes: literal) => { #[inline] - pub unsafe fn $ name(v: $from, lane: u32) -> $ to { + pub unsafe fn $name(v: $from, lane: u32) -> $ to { simd_llvm::simd_extract(v, lane) } }; diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 3f005ad5..b23f059f 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -3,38 +3,23 @@ use crate::neon::intrinsics::*; use crate::neon::utf8check::*; use crate::*; -use std::mem; -pub const SIMDJSON_PADDING: usize = mem::size_of::() * 4; +use std::mem; -#[derive(Debug)] -struct SimdInput { - v0: uint8x16_t, - v1: uint8x16_t, - v2: uint8x16_t, - v3: uint8x16_t, -} +// NEON-SPECIFIC -fn fill_input(ptr: &[u8]) -> SimdInput { - unsafe { - #[allow(clippy::cast_ptr_alignment)] - SimdInput { - v0: vld1q_u8(ptr.as_ptr() as *const u8), - v1: vld1q_u8(ptr.as_ptr().add(16) as *const u8), - v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), - v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), - } - } +macro_rules! 
bit_mask { + () => { + uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ) + }; } -const BIT_MASK: [u8; 16] = [0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]; - #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn neon_movemask(input: uint8x16_t) -> u16 { - let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); - - let minput: uint8x16_t = vandq_u8(input, bit_mask); + let minput: uint8x16_t = vandq_u8(input, bit_mask!()); let tmp: uint8x16_t = vpaddq_u8(minput, minput); let tmp = vpaddq_u8(tmp, tmp); let tmp = vpaddq_u8(tmp, tmp); @@ -44,7 +29,7 @@ unsafe fn neon_movemask(input: uint8x16_t) -> u16 { #[cfg_attr(not(feature = "no-inline"), inline(always))] unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { - let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); + let bit_mask = bit_mask!(); let t0 = vandq_u8(p0, bit_mask); let t1 = vandq_u8(p1, bit_mask); @@ -58,10 +43,56 @@ unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) } +// /NEON-SPECIFIC + +pub const SIMDJSON_PADDING: usize = mem::size_of::() * 4; + unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { vgetq_lane_u64( vreinterpretq_u64_u8( - mem::transmute(vmull_p64(-1, quote_bits as i64))), 0) + mem::transmute( + vmull_p64( + -1, + quote_bits as i64) + ) + ), + 0 + ) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(si: &SimdInput) -> bool { + let highbit: uint8x16_t = vdupq_n_u8(0x80); + let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); + let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); + let t3: uint8x16_t = vorrq_u8(t0, t1); + let t4: uint8x16_t = vandq_u8(t3, highbit); + + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); + + vget_lane_u64(result, 0) == 0 +} + 
+#[derive(Debug)] +struct SimdInput { + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, +} + +fn fill_input(ptr: &[u8]) -> SimdInput { + unsafe { + #[allow(clippy::cast_ptr_alignment)] + SimdInput { + v0: vld1q_u8(ptr.as_ptr() as *const u8), + v1: vld1q_u8(ptr.as_ptr().add(16) as *const u8), + v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), + v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), + } + } } struct Utf8CheckingState { @@ -79,19 +110,18 @@ impl Default for Utf8CheckingState { } } -#[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn check_ascii_neon(si: &SimdInput) -> bool { - let high_bit: uint8x16_t = vdupq_n_u8(0x80); - let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); - let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); - let t3: uint8x16_t = vorrq_u8(t0, t1); - let t4: uint8x16_t = vandq_u8(t3, high_bit); - - let v64: uint64x2_t = vreinterpretq_u64_u8(t4); - let v32: uint32x2_t = vqmovn_u64(v64); - let result: uint64x1_t = vreinterpret_u64_u32(v32); - - vget_lane_u64(result, 0) == 0 +#[inline] +fn is_utf8_status_ok(has_error: int8x16_t) -> bool { + unsafe { + let utf8_error_bits: u128 = mem::transmute( + vandq_s16( + mem::transmute(has_error), + mem::transmute(has_error) + ) + ); + + utf8_error_bits as u16 == 0 + } } #[cfg_attr(not(feature = "no-inline"), inline(always))] @@ -99,62 +129,53 @@ unsafe fn check_utf8( input: &SimdInput, state: &mut Utf8CheckingState, ) { - if check_ascii_neon(input) { + if check_ascii(input) { // All bytes are ascii. Therefore the byte that was just before must be // ascii too. We only check the byte that was just before simd_input. Nines // are arbitrary values. 
- let verror: int8x16_t = - int8x16_t::new(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1); - state.has_error = - vreinterpretq_s8_u8(vorrq_u8(vcgtq_s8(state.previous.carried_continuations, verror), - vreinterpretq_u8_s8(state.has_error))); + let verror: int8x16_t = int8x16_t::new( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + ); + state.has_error = vreinterpretq_s8_u8(vorrq_u8( + vcgtq_s8( + state.previous.carried_continuations, + verror, + ), + vreinterpretq_u8_s8(state.has_error)), + ); } else { // it is not ascii so we have to do heavy work - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), - &mut (state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), - &mut (state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), - &mut (state.previous), &mut (state.has_error)); - state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), - &mut (state.previous), &mut (state.has_error)); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), &mut state.previous, &mut state.has_error); } } // a straightforward comparison of a mask against input #[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { - macro_rules! 
call { - ($imm8:expr) => { - vmovq_n_u8($imm8) - }; - }; - let mask: uint8x16_t = constify_imm8!(m, call); - - let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); - let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); - let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); - let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); +fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { + unsafe { + let mask: uint8x16_t = vmovq_n_u8(m); + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); - neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } } // find all values less than or equal than the content of maxval (using unsigned arithmetic) #[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn unsigned_lteq_against_input(input: &SimdInput, maxval: u8) -> u64 { - macro_rules! 
call { - ($imm8:expr) => { - vmovq_n_u8($imm8) - }; - }; - let mask: uint8x16_t = constify_imm8!(maxval, call); - - let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, mask); - let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, mask); - let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, mask); - let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, mask); - - neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) +fn unsigned_lteq_against_input(input: &SimdInput, maxval: uint8x16_t) -> u64 { + unsafe { + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, maxval); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, maxval); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, maxval); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, maxval); + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } } // return a bitvector indicating where we have characters that end an odd-length @@ -171,7 +192,7 @@ unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_bac const EVEN_BITS: u64 = 0x5555_5555_5555_5555; const ODD_BITS: u64 = !EVEN_BITS; - let bs_bits: u64 = cmp_mask_against_input(input, b'\\'); + let bs_bits: u64 = cmp_mask_against_input(&input, b'\\'); let start_edges: u64 = bs_bits & !(bs_bits << 1); // flip lowest if we have an odd-length run at the end of the prior // iteration @@ -180,13 +201,13 @@ unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_bac let odd_starts: u64 = start_edges & !even_start_mask; let even_carries: u64 = bs_bits.wrapping_add(even_starts); - let mut odd_carries: u64 = 0; // must record the carry-out of our odd-carries out of bit 63; this // indicates whether the sense of any edge going to the next iteration // should be flipped - let iter_ends_odd_backslash: bool = add_overflow(bs_bits, odd_starts, &mut odd_carries); + let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); - odd_carries |= *prev_iter_ends_odd_backslash; // push in bit zero as a potential end + odd_carries 
|= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end // if we had an odd-numbered run at the // end of the previous iteration *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; @@ -217,25 +238,23 @@ unsafe fn find_quote_mask_and_bits( quote_bits: &mut u64, error_mask: &mut u64, ) -> u64 { - *quote_bits = cmp_mask_against_input(input, b'"'); + *quote_bits = cmp_mask_against_input(&input, b'"'); *quote_bits &= !odd_ends; - + // remove from the valid quoted region the unescapted characters. let mut quote_mask: u64 = compute_quote_mask(*quote_bits); quote_mask ^= *prev_iter_inside_quote; - // All Unicode characters may be placed within the // quotation marks, except for the characters that MUST be escaped: // quotation mark, reverse solidus, and the control characters (U+0000 //through U+001F). // https://tools.ietf.org/html/rfc8259 - let unescaped: u64 = unsigned_lteq_against_input(input, 0x1F); + let unescaped: u64 = unsigned_lteq_against_input(input, vmovq_n_u8(0x1F)); *error_mask |= quote_mask & unescaped; - // right shift of a signed value expected to be well-defined and standard // compliant as of C++20, // John Regher from Utah U. 
says this is fine code - *prev_iter_inside_quote = (quote_mask as i64 >> 63) as u64; + *prev_iter_inside_quote = static_cast_u64!(static_cast_i64!(quote_mask) >> 63); quote_mask } @@ -245,8 +264,32 @@ unsafe fn find_whitespace_and_structurals( whitespace: &mut u64, structurals: &mut u64, ) { - let low_nibble_mask: uint8x16_t = uint8x16_t::new(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0); - let high_nibble_mask: uint8x16_t = uint8x16_t::new(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0); + // do a 'shufti' to detect structural JSON characters + // they are + // * `{` 0x7b + // * `}` 0x7d + // * `:` 0x3a + // * `[` 0x5b + // * `]` 0x5d + // * `,` 0x2c + // these go into the first 3 buckets of the comparison (1/2/4) + + // we are also interested in the four whitespace characters: + // * space 0x20 + // * linefeed 0x0a + // * horizontal tab 0x09 + // * carriage return 0x0d + // these go into the next 2 buckets of the comparison (8/16) + + // TODO: const? + let low_nibble_mask: uint8x16_t = uint8x16_t::new( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + ); + // TODO: const? 
+ let high_nibble_mask: uint8x16_t = uint8x16_t::new( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + ); + let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); let whitespace_shufti_mask: uint8x16_t = vmovq_n_u8(0x18); let low_nib_and_mask: uint8x16_t = vmovq_n_u8(0xf); @@ -281,11 +324,11 @@ unsafe fn find_whitespace_and_structurals( let tmp_3: uint8x16_t = vtstq_u8(v_3, structural_shufti_mask); *structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); - let tmp_ws_0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); - let tmp_ws_1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); - let tmp_ws_2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); - let tmp_ws_3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); - *whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3); + let tmp_ws_v0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); + let tmp_ws_v1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); + let tmp_ws_v2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); + let tmp_ws_v3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); + *whitespace = neon_movemask_bulk(tmp_ws_v0, tmp_ws_v1, tmp_ws_v2, tmp_ws_v3); } // flatten out values in 'bits' assuming that they are are to have values of idx @@ -295,16 +338,18 @@ unsafe fn find_whitespace_and_structurals( // needs to be large enough to handle this //TODO: usize was u32 here does this matter? 
#[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { +fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { let cnt: usize = bits.count_ones() as usize; let mut l = base.len(); let idx_minus_64 = idx.wrapping_sub(64); - let idx_64_v = int32x4_t::new( - static_cast_i32!(idx_minus_64), - static_cast_i32!(idx_minus_64), - static_cast_i32!(idx_minus_64), - static_cast_i32!(idx_minus_64), - ); + let idx_64_v = unsafe { + int32x4_t::new( + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + static_cast_i32!(idx_minus_64), + ) + }; // We're doing some trickery here. // We reserve 64 extra entries, because we've at most 64 bit to set @@ -312,22 +357,26 @@ unsafe fn flatten_bits(base: &mut Vec, idx: u32, mut bits: u64) { // We later indiscriminatory writre over the len we set but that's OK // since we ensure we reserve the needed space base.reserve(64); - base.set_len(l + cnt); + unsafe { + base.set_len(l + cnt); + } while bits != 0 { - let v0: i32 = bits.trailing_zeros() as i32; - bits &= bits.wrapping_sub(1); - let v1: i32 = bits.trailing_zeros() as i32; - bits &= bits.wrapping_sub(1); - let v2: i32 = bits.trailing_zeros() as i32; - bits &= bits.wrapping_sub(1); - let v3: i32 = bits.trailing_zeros() as i32; - bits &= bits.wrapping_sub(1); - - let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); - let v: int32x4_t = vaddq_s32(idx_64_v, v); - - std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v); + unsafe { + let v0 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v1 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v2 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + let v3 = bits.trailing_zeros() as i32; + bits &= bits.wrapping_sub(1); + + let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3); + let v: int32x4_t = vaddq_s32(idx_64_v, v); + #[allow(clippy::cast_ptr_alignment)] + 
std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v); + } l += 4; } } @@ -374,21 +423,15 @@ fn finalize_structurals( structurals } -pub unsafe fn find_bs_bits_and_quote_bits(src: &[u8]) -> ParseStringHelper { - // this can read up to 31 bytes beyond the buffer size, but we require - // SIMDJSON_PADDING of padding - let v0 : uint8x16_t = vld1q_u8(src.as_ptr()); - let v1 : uint8x16_t = vld1q_u8(src.as_ptr().add(16)); - - let bs_mask : uint8x16_t = vmovq_n_u8(b'\\'); - let qt_mask : uint8x16_t = vmovq_n_u8(b'"'); - - let bit_mask = vld1q_u8(BIT_MASK.as_ptr()); +pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStringHelper { + let quote_mask = vmovq_n_u8(b'"'); + let bs_mask = vmovq_n_u8(b'\\'); + let bit_mask = bit_mask!(); let cmp_bs_0 : uint8x16_t = vceqq_u8(v0, bs_mask); let cmp_bs_1 : uint8x16_t = vceqq_u8(v1, bs_mask); - let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, qt_mask); - let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, qt_mask); + let cmp_qt_0 : uint8x16_t = vceqq_u8(v0, quote_mask); + let cmp_qt_1 : uint8x16_t = vceqq_u8(v1, quote_mask); let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask); let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask); @@ -401,8 +444,8 @@ pub unsafe fn find_bs_bits_and_quote_bits(src: &[u8]) -> ParseStringHelper { let sum0 = vpaddq_u8(sum0, sum0); ParseStringHelper { - bs_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0), // bs_bits - quote_bits: vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) // quote_bits + bs_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) }, + quote_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) }, } } @@ -452,9 +495,7 @@ impl<'de> Deserializer<'de> { #endif */ let input: SimdInput = fill_input(input.get_unchecked(idx as usize..)); - check_utf8(&input, &mut utf8_state); - // detect odd sequences of backslashes let odd_ends: u64 = find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash); @@ -553,10 +594,7 @@ impl<'de> Deserializer<'de> { return 
Err(ErrorType::Syntax); } - let utf8_error_bits: u128 = mem::transmute(vandq_s16(mem::transmute(utf8_state.has_error), mem::transmute(utf8_state.has_error))); - let utf8_error: u16 = utf8_error_bits as u16; - - if utf8_error != 0 { + if is_utf8_status_ok(utf8_state.has_error) { Ok(structural_indexes) } else { Err(ErrorType::InvalidUTF8) diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs index b78fe7d1..082183b1 100644 --- a/src/neon/utf8check.rs +++ b/src/neon/utf8check.rs @@ -17,95 +17,144 @@ use crate::neon::intrinsics::*; * */ +/*****************************/ +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_byte_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 1) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_2bytes_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 2) + } +} + // all byte values must be no larger than 0xF4 #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { +fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { // unsigned, saturates to 0 below max - *has_error = + *has_error = unsafe { vorrq_s8( *has_error, - vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */))); + vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */)) + ) + }; } -const NIBBLES_TBL: [i8; 16] = [ - 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) - 0, 0, 0, 0, // 10xx (continuation) - 2, 2, // 110x - 3, // 1110 - 4, // 1111, next should be 0 (not checked here) -]; +macro_rules! 
nibbles_tbl { + () => { + int8x16_t::new( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ) + }; +} #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { - let nibbles: int8x16_t = vld1q_s8(NIBBLES_TBL.as_ptr()); - - vreinterpretq_s8_u8(vqtbl1q_u8(vreinterpretq_u8_s8(nibbles), vreinterpretq_u8_s8(high_nibbles))) +fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + unsafe { + vqtbl1q_s8( + nibbles_tbl!(), + vreinterpretq_u8_s8(high_nibbles), + ) + } } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { - let right1: int8x16_t = vqsubq_s8( - vextq_s8(previous_carries, initial_lengths, 16 - 1), - vdupq_n_s8(1)); - - let sum: int8x16_t = vaddq_s8(initial_lengths, right1); - - let right2: int8x16_t = vqsubq_s8( - vextq_s8(previous_carries, sum, 16 - 2), - vdupq_n_s8(2)); - - vaddq_s8(sum, right2) +fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + unsafe { + let right1: int8x16_t = vqsubq_s8( + push_last_byte_of_a_to_b(previous_carries, initial_lengths), + vdupq_n_s8(1), + ); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); + let right2: int8x16_t = vqsubq_s8( + push_last_2bytes_of_a_to_b(previous_carries, sum), + vdupq_n_s8(2), + ); + vaddq_s8(sum, right2) + } } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { +fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { // overlap || underlap // carry > length && length > 0 || !(carry > length) && !(length > 0) // (carries > length) == (lengths > 0) + { + let overunder: uint8x16_t = vceqq_u8( + vcgtq_s8(carries, initial_lengths), + 
vcgtq_s8(initial_lengths, vdupq_n_s8(0)), + ); - let overunder: uint8x16_t = vceqq_u8( - vcgtq_s8(carries, initial_lengths), - vcgtq_s8(initial_lengths, vdupq_n_s8(0))); - - *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); + } } // when 0xED is found, next byte must be no larger than 0x9F // when 0xF4 is found, next byte must be no larger than 0x8F // next byte must be continuation, ie sign bit is set, so signed < is ok #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_first_continuation_max( +fn check_first_continuation_max( current_bytes: int8x16_t, off1_current_bytes: int8x16_t, has_error: &mut int8x16_t, ) { - let mask_ed: uint8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(-19 /* 0xED */)); - let mask_f4: uint8x16_t = vceqq_s8(off1_current_bytes, vdupq_n_s8(-12 /* 0xF4 */)); - - // FIXME: was vandq_u8? - let badfollow_ed: uint8x16_t = - vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), mask_ed); - let badfollow_f4: uint8x16_t = - vandq_u8(vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), mask_f4); + { + let mask_ed: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-19 /* 0xED */), + ); + let mask_f4: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-12 /* 0xF4 */), + ); + + let badfollow_ed: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), + mask_ed, + ); + let badfollow_f4: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), + mask_f4, + ); + + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4)), + ); + } +} - *has_error = vorrq_s8( - *has_error, vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4))); +macro_rules! 
initial_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; } -const INITIAL_MINS_TBL: [i8; 16] = [ - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - -62 /* 0xC2 */, -128, // 110x - -31 /* 0xE1 */, // 1110 - -15 /*0xF1 */]; - -const SECOND_MINS_TBL: [i8; 16] = [ - -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, // 10xx => false - 127, 127, // 110x => true - -96 /* 0xA0 */, // 1110 - -112 /* 0x90 */, -]; +macro_rules! second_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} // map off1_hibits => error condition // hibits off1 cur @@ -114,29 +163,32 @@ const SECOND_MINS_TBL: [i8; 16] = [ // F => < F1 && < 90 // else false && false #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn check_overlong( +fn check_overlong( current_bytes: int8x16_t, off1_current_bytes: int8x16_t, hibits: int8x16_t, previous_hibits: int8x16_t, has_error: &mut int8x16_t, ) { - let initial_mins_tbl: int8x16_t = vld1q_s8(INITIAL_MINS_TBL.as_ptr()); - let second_mins_tbl: int8x16_t = vld1q_s8(SECOND_MINS_TBL.as_ptr()); - - let off1_hibits: int8x16_t = vextq_s8(previous_hibits, hibits, 16 - 1); - let initial_mins: int8x16_t = - vqtbl1q_s8(initial_mins_tbl, vreinterpretq_u8_s8(off1_hibits)); - - let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); - - let second_mins: int8x16_t = - vqtbl1q_s8(second_mins_tbl, vreinterpretq_u8_s8(off1_hibits)); - - let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); - - *has_error = vorrq_s8( - *has_error, vreinterpretq_s8_u8(vandq_u8(initial_under, second_under))); + unsafe { + let off1_hibits: int8x16_t = 
push_last_byte_of_a_to_b(previous_hibits, hibits); + let initial_mins: int8x16_t = vqtbl1q_s8( + initial_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + + let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + + let second_mins: int8x16_t = vqtbl1q_s8( + second_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) + ); + } } pub struct ProcessedUtfBytes { @@ -157,9 +209,14 @@ impl Default for ProcessedUtfBytes { } #[cfg_attr(not(feature = "no-inline"), inline)] -unsafe fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { +fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { answer.rawbytes = bytes; - answer.high_nibbles = vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)); + answer.high_nibbles = unsafe { + vandq_s8( + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)), + vmovq_n_s8(0x0F) + ) + }; } // check whether the current bytes are valid UTF-8 @@ -171,25 +228,26 @@ pub fn check_utf8_bytes( has_error: &mut int8x16_t, ) -> ProcessedUtfBytes { let mut pb = ProcessedUtfBytes::default(); - unsafe { - count_nibbles(current_bytes, &mut pb); + count_nibbles(current_bytes, &mut pb); - check_smaller_than_0xf4(current_bytes, has_error); + check_smaller_than_0xf4(current_bytes, has_error); - let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); - pb.carried_continuations = - carry_continuations(initial_lengths, previous.carried_continuations); + pb.carried_continuations = + carry_continuations(initial_lengths, previous.carried_continuations); - check_continuations(initial_lengths, pb.carried_continuations, has_error); + check_continuations(initial_lengths, pb.carried_continuations, has_error); - let off1_current_bytes: int8x16_t 
= - vextq_s8(previous.rawbytes, pb.rawbytes, 16 - 1); + let off1_current_bytes: int8x16_t = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - check_first_continuation_max(current_bytes, off1_current_bytes, has_error); - - check_overlong(current_bytes, off1_current_bytes, pb.high_nibbles, - previous.high_nibbles, has_error); - } + check_overlong( + current_bytes, + off1_current_bytes, + pb.high_nibbles, + previous.high_nibbles, + has_error, + ); pb } From ee2bd9b655ddacb1c0f0d7a5b2375a428d7e78e7 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 22:02:39 -0400 Subject: [PATCH 26/34] feat: update generator with neon support, conditional compilation --- src/neon/intrinsics.rs | 8 +- src/neon/simd_llvm.rs | 2 +- src/value/generator.rs | 278 ++++++++++++++++++++++++++++------------- 3 files changed, 196 insertions(+), 92 deletions(-) diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs index 2ecfda86..c5c98cb6 100644 --- a/src/neon/intrinsics.rs +++ b/src/neon/intrinsics.rs @@ -410,13 +410,17 @@ pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[inline] pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } #[inline] +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } #[inline] pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } #[inline] -pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } -#[inline] pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vxorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } +#[inline] +pub fn vxorrq_s8(a: int8x16_t, 
b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } macro_rules! arm_reinterpret { ($name: ident, $from: ty, $to: ty) => { diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs index bb7b25bd..6e60b63c 100644 --- a/src/neon/simd_llvm.rs +++ b/src/neon/simd_llvm.rs @@ -27,7 +27,7 @@ extern "platform-intrinsic" { // pub fn simd_shr(x: T, y: T) -> T; pub fn simd_and(x: T, y: T) -> T; pub fn simd_or(x: T, y: T) -> T; -// pub fn simd_xor(x: T, y: T) -> T; + pub fn simd_xor(x: T, y: T) -> T; // // pub fn simd_reduce_add_unordered(x: T) -> U; // pub fn simd_reduce_mul_unordered(x: T) -> U; diff --git a/src/value/generator.rs b/src/value/generator.rs index e17b2d13..8b6ef266 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -10,10 +10,25 @@ use std::io::Write; use std::marker::PhantomData; use std::ptr; -//#[cfg(target_feature = "avx2")] -//const AVX2_PRESENT : bool = true; -//#[cfg(not(target_feature = "avx2"))] -//const AVX2_PRESENT : bool = false; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +#[cfg(target_feature = "neon")] +use crate::*; + +#[cfg(target_feature = "avx2")] +const AVX2_PRESENT : bool = true; +#[cfg(not(target_feature = "avx2"))] +const AVX2_PRESENT : bool = false; +#[cfg(target_feature = "sse4.2")] +const SSE42_PRESENT : bool = true; +#[cfg(not(target_feature = "sse4.2"))] +const SSE42_PRESENT : bool = false; +#[cfg(target_feature = "neon")] +const NEON_PRESENT : bool = true; +#[cfg(not(target_feature = "neon"))] +const NEON_PRESENT : bool = false; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -93,96 +108,23 @@ pub trait BaseGenerator { #[inline(always)] fn write_string(&mut self, string: &str) -> io::Result<()> { stry!(self.write_char(b'"')); - let string = string.as_bytes(); -// let mut string = string.as_bytes(); -// let mut len = string.len(); -// let mut idx = 0; +// let string = string.as_bytes(); + let mut string = string.as_bytes(); + let mut 
len = string.len(); + let mut idx = 0; -// unsafe { + unsafe { // Looking at the table above the lower 5 bits are entirely // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. -// if AVX2_PRESENT { -// let zero = _mm256_set1_epi8(0); -// let lower_quote_range = _mm256_set1_epi8(0x1F as i8); -// let quote = _mm256_set1_epi8(b'"' as i8); -// let backslash = _mm256_set1_epi8(b'\\' as i8); -// while len - idx >= 32 { -// // Load 32 bytes of data; -// #[allow(clippy::cast_ptr_alignment)] -// let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); -// // Test the data against being backslash and quote. -// let bs_or_quote = _mm256_or_si256( -// _mm256_cmpeq_epi8(data, backslash), -// _mm256_cmpeq_epi8(data, quote), -// ); -// // Now mask the data with the quote range (0x1F). -// let in_quote_range = _mm256_and_si256(data, lower_quote_range); -// // then test of the data is unchanged. aka: xor it with the -// // Any field that was inside the quote range it will be zero -// // now. 
-// let is_unchanged = _mm256_xor_si256(data, in_quote_range); -// let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); -// let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); -// if quote_bits != 0 { -// let quote_dist = trailingzeroes(quote_bits as u64) as usize; -// stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); -// let ch = string[idx + quote_dist]; -// match ESCAPED[ch as usize] { -// b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), -// -// escape => stry!(self.write(&[b'\\', escape])), -// }; -// string = &string[idx + quote_dist + 1..]; -// idx = 0; -// len = string.len(); -// } else { -// idx += 32; -// } -// } -// } - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes -// let zero = _mm_set1_epi8(0); -// let lower_quote_range = _mm_set1_epi8(0x1F as i8); -// let quote = _mm_set1_epi8(b'"' as i8); -// let backslash = _mm_set1_epi8(b'\\' as i8); -// while len - idx > 16 { -// // Load 16 bytes of data; -// #[allow(clippy::cast_ptr_alignment)] -// let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); -// // Test the data against being backslash and quote. -// let bs_or_quote = -// _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); -// // Now mask the data with the quote range (0x1F). -// let in_quote_range = _mm_and_si128(data, lower_quote_range); -// // then test of the data is unchanged. aka: xor it with the -// // Any field that was inside the quote range it will be zero -// // now. 
-// let is_unchanged = _mm_xor_si128(data, in_quote_range); -// let in_range = _mm_cmpeq_epi8(is_unchanged, zero); -// let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); -// if quote_bits != 0 { -// let quote_dist = trailingzeroes(quote_bits as u64) as usize; -// stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); -// let ch = string[idx + quote_dist]; -// match ESCAPED[ch as usize] { -// b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), -// -// escape => stry!(self.write(&[b'\\', escape])), -// }; -// string = &string[idx + quote_dist + 1..]; -// idx = 0; -// len = string.len(); -// } else { -// idx += 16; -// } -// } -// stry!(self.get_writer().write_all(&string[0..idx])); -// string = &string[idx..]; -// } + if AVX2_PRESENT { + self.process_32_bytes(&mut string, &mut len, &mut idx).unwrap(); + } + if SSE42_PRESENT || NEON_PRESENT { + self.process_16_bytes(&mut string, &mut len, &mut idx).unwrap(); + } + } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() { if ESCAPED[*ch as usize] > 0 { @@ -194,6 +136,164 @@ pub trait BaseGenerator { self.write_char(b'"') } + #[cfg(target_feature = "neon")] + unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + macro_rules! 
bit_mask { + () => { + uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ) + }; + } + + #[cfg_attr(not(feature = "no-inline"), inline(always))] + unsafe fn neon_movemask(input: uint8x16_t) -> u16 { + let minput: uint8x16_t = vandq_u8(input, bit_mask!()); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) + } + + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(self.get_writer().write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) + } + + #[cfg(not(target_feature = "avx2"))] + #[inline(always)] + fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { + Ok(()) + } + + #[cfg(target_feature = "sse4.2")] + #[inline(always)] + unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = _mm_set1_epi8(0); + let lower_quote_range = _mm_set1_epi8(0x1F as i8); + let quote = _mm_set1_epi8(b'"' as i8); + let backslash = _mm_set1_epi8(b'\\' as i8); + while *len - *idx > 16 { + // Load 16 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); + // Test the data against being backslash and quote. + let bs_or_quote = + _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm_and_si128(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = _mm_xor_si128(data, in_quote_range); + let in_range = _mm_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(self.get_writer().write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) + } + + #[cfg(target_feature = "avx2")] + #[inline(always)] + unsafe fn process_32_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; + } + } + Ok(()) + } + #[inline(always)] fn write_float(&mut self, num: f64) -> io::Result<()> { let mut buffer = ryu::Buffer::new(); From 75511d388aa2c8bbe4a5512033c03cbf0fb7d527 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 22:10:47 -0400 Subject: [PATCH 27/34] fix: remove double semicolon --- src/numberparse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/numberparse.rs b/src/numberparse.rs index 7674a977..01af460e 100644 --- a/src/numberparse.rs +++ b/src/numberparse.rs @@ -228,7 +228,7 @@ impl<'de> Deserializer<'de> { digit = unsafe { *p.get_unchecked(digitcount) } - b'0'; digitcount += 1; fraction_weight *= 10.0; - fraction += f64::from(digit) / fraction_weight;; + fraction += f64::from(digit) / fraction_weight; } i += fraction; } From 23191c61d164d854f1842abac8e7e7b274293bdb Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Thu, 15 Aug 2019 23:44:54 -0400 Subject: [PATCH 28/34] feat: conditional feature enablement for neon only --- src/lib.rs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 690a5b0c..b68deadb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,17 @@ #![deny(warnings)] -#![feature(asm)] -#![feature(stdsimd)] -#![feature(repr_simd)] -#![feature(custom_inner_attributes)] 
-#![feature(aarch64_target_feature)] -#![feature(platform_intrinsics)] -#![feature(stmt_expr_attributes)] -#![feature(simd_ffi)] -#![feature(link_llvm_intrinsics)] + +#![cfg_attr(target_feature = "neon", feature( + asm, + stdsimd, + repr_simd, + custom_inner_attributes, + aarch64_target_feature, + platform_intrinsics, + stmt_expr_attributes, + simd_ffi, + link_llvm_intrinsics + ) +)] #![cfg_attr(feature = "hints", feature(core_intrinsics))] //! simdjson-rs is a rust port of the simejson c++ library. It follows From d514f2d30a11a5b1a861ad3babe31c819645cdf6 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 00:09:17 -0400 Subject: [PATCH 29/34] fix: add conditional compilation for target_feature=-avx2,-sse4.2 on x86/x86_64 platforms --- src/value/generator.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/value/generator.rs b/src/value/generator.rs index 8b6ef266..920b6615 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -12,7 +12,7 @@ use std::ptr; #[cfg(target_arch = "x86")] use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] +#[cfg(any(target_feature = "sse4.2", target_feature = "avx2"))] use std::arch::x86_64::*; #[cfg(target_feature = "neon")] use crate::*; @@ -108,7 +108,6 @@ pub trait BaseGenerator { #[inline(always)] fn write_string(&mut self, string: &str) -> io::Result<()> { stry!(self.write_char(b'"')); -// let string = string.as_bytes(); let mut string = string.as_bytes(); let mut len = string.len(); let mut idx = 0; @@ -199,12 +198,6 @@ pub trait BaseGenerator { Ok(()) } - #[cfg(not(target_feature = "avx2"))] - #[inline(always)] - fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { - Ok(()) - } - #[cfg(target_feature = "sse4.2")] #[inline(always)] unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { @@ -251,6 +244,18 @@ pub trait BaseGenerator { Ok(()) } + 
#[cfg(all(not(target_feature = "sse4.2"), not(target_feature = "neon")))] + #[inline(always)] + unsafe fn process_16_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { + Ok(()) + } + + #[cfg(not(target_feature = "avx2"))] + #[inline(always)] + fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { + Ok(()) + } + #[cfg(target_feature = "avx2")] #[inline(always)] unsafe fn process_32_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { From 3bdc16922d747170e5d42314a7168e87b46ee8f8 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 13:23:42 -0400 Subject: [PATCH 30/34] feat: factor arch-specific methods into separate modules (with macros, need some advice there) --- src/avx2/generator.rs | 97 +++++++++++++++++++++ src/avx2/mod.rs | 3 +- src/neon/generator.rs | 73 ++++++++++++++++ src/neon/mod.rs | 1 + src/sse42/generator.rs | 72 ++++++++++++++++ src/sse42/mod.rs | 3 +- src/value/generator.rs | 192 +++-------------------------------------- 7 files changed, 260 insertions(+), 181 deletions(-) create mode 100644 src/avx2/generator.rs create mode 100644 src/neon/generator.rs create mode 100644 src/sse42/generator.rs diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs new file mode 100644 index 00000000..c77a4ebc --- /dev/null +++ b/src/avx2/generator.rs @@ -0,0 +1,97 @@ + +#[macro_export] +macro_rules! 
process_32_bytes { + () => { + #[inline(always)] + unsafe fn process_32_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. + let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; + } + } + Ok(()) + } + } +} + +#[macro_export] +macro_rules! 
process_16_bytes { + () => { + #[inline(always)] + unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = _mm_set1_epi8(0); + let lower_quote_range = _mm_set1_epi8(0x1F as i8); + let quote = _mm_set1_epi8(b'"' as i8); + let backslash = _mm_set1_epi8(b'\\' as i8); + while *len - *idx > 16 { + // Load 16 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); + // Test the data against being backslash and quote. + let bs_or_quote = + _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm_and_si128(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = _mm_xor_si128(data, in_quote_range); + let in_range = _mm_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(self.get_writer().write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) + } + } +} \ No newline at end of file diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/neon/generator.rs b/src/neon/generator.rs new file mode 100644 index 00000000..7e0c8b60 --- /dev/null +++ b/src/neon/generator.rs @@ -0,0 +1,73 @@ + +#[macro_export] +macro_rules! process_32_bytes { + () => { + #[inline(always)] + unsafe fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result < () > { + Ok(()) + } + }; +} + +#[macro_export] +macro_rules! 
process_16_bytes { + () => { + #[inline(always)] + unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + unsafe fn __neon_movemask(input: uint8x16_t) -> u16 { + let bit_mask = uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ); + let minput: uint8x16_t = vandq_u8(input, bit_mask); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) + } + + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = __neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(self.get_writer().write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) + } + }; +} \ No newline at end of file diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 1b979e5a..f7868249 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -1,6 +1,7 @@ pub mod deser; pub mod stage1; pub mod utf8check; +pub mod generator; mod simd; mod simd_llvm; mod intrinsics; \ No newline at end of file diff --git a/src/sse42/generator.rs b/src/sse42/generator.rs new file mode 100644 index 00000000..4b3a90f3 --- /dev/null +++ b/src/sse42/generator.rs @@ -0,0 +1,72 @@ + +#[macro_export] +macro_rules! process_32_bytes { + () => { + #[inline(always)] + unsafe fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { + Ok(()) + } + } +} + +#[macro_export] +#[cfg(target_feature = "sse4.2")] +macro_rules! 
process_16_bytes { + () => { + #[inline(always)] + unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = _mm_set1_epi8(0); + let lower_quote_range = _mm_set1_epi8(0x1F as i8); + let quote = _mm_set1_epi8(b'"' as i8); + let backslash = _mm_set1_epi8(b'\\' as i8); + while *len - *idx > 16 { + // Load 16 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); + // Test the data against being backslash and quote. + let bs_or_quote = + _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm_and_si128(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. + let is_unchanged = _mm_xor_si128(data, in_quote_range); + let in_range = _mm_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), + + escape => stry!(self.write(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(self.get_writer().write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) + } + } +} + +#[macro_export] +#[cfg(not(target_feature = "sse4.2"))] +macro_rules! 
process_16_bytes { + () => { + #[inline(always)] + unsafe fn process_16_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { + Ok(()) + } + } +} \ No newline at end of file diff --git a/src/sse42/mod.rs b/src/sse42/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/sse42/mod.rs +++ b/src/sse42/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/value/generator.rs b/src/value/generator.rs index 920b6615..b053344f 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -14,21 +14,17 @@ use std::ptr; use std::arch::x86::*; #[cfg(any(target_feature = "sse4.2", target_feature = "avx2"))] use std::arch::x86_64::*; -#[cfg(target_feature = "neon")] + use crate::*; #[cfg(target_feature = "avx2")] -const AVX2_PRESENT : bool = true; -#[cfg(not(target_feature = "avx2"))] -const AVX2_PRESENT : bool = false; -#[cfg(target_feature = "sse4.2")] -const SSE42_PRESENT : bool = true; -#[cfg(not(target_feature = "sse4.2"))] -const SSE42_PRESENT : bool = false; +pub use crate::avx2::generator::*; + +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] +pub use crate::sse42::generator::*; + #[cfg(target_feature = "neon")] -const NEON_PRESENT : bool = true; -#[cfg(not(target_feature = "neon"))] -const NEON_PRESENT : bool = false; +pub use crate::neon::generator::*; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -117,12 +113,8 @@ pub trait BaseGenerator { // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. 
- if AVX2_PRESENT { - self.process_32_bytes(&mut string, &mut len, &mut idx).unwrap(); - } - if SSE42_PRESENT || NEON_PRESENT { - self.process_16_bytes(&mut string, &mut len, &mut idx).unwrap(); - } + stry!(self.process_32_bytes(&mut string, &mut len, &mut idx)); + stry!(self.process_32_bytes(&mut string, &mut len, &mut idx)); } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() { @@ -135,169 +127,11 @@ pub trait BaseGenerator { self.write_char(b'"') } - #[cfg(target_feature = "neon")] - unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - macro_rules! bit_mask { - () => { - uint8x16_t::new( - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - ) - }; - } - - #[cfg_attr(not(feature = "no-inline"), inline(always))] - unsafe fn neon_movemask(input: uint8x16_t) -> u16 { - let minput: uint8x16_t = vandq_u8(input, bit_mask!()); - let tmp: uint8x16_t = vpaddq_u8(minput, minput); - let tmp = vpaddq_u8(tmp, tmp); - let tmp = vpaddq_u8(tmp, tmp); - - vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) - } - - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = vdupq_n_u8(0); - let lower_quote_range = vdupq_n_u8(0x1F); - let quote = vdupq_n_u8(b'"'); - let backslash = vdupq_n_u8(b'\\'); - while *len - *idx > 16 { - // Load 16 bytes of data; - let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); - // Test the data against being backslash and quote. - let bs_or_quote = - vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = vandq_u8(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = vxorrq_u8(data, in_quote_range); - let in_range = vceqq_u8(is_unchanged, zero); - let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..*idx])); - *string = &string[*idx..]; - Ok(()) - } - - #[cfg(target_feature = "sse4.2")] - #[inline(always)] - unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while *len - *idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..*idx])); - *string = &string[*idx..]; - Ok(()) - } - - #[cfg(all(not(target_feature = "sse4.2"), not(target_feature = "neon")))] - #[inline(always)] - unsafe fn process_16_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { - Ok(()) - } + // 32-byte generation implementation + process_32_bytes!(); - #[cfg(not(target_feature = "avx2"))] - #[inline(always)] - fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { - Ok(()) - } - - #[cfg(target_feature = "avx2")] - #[inline(always)] - unsafe fn process_32_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while *len - *idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). 
- let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 32; - } - } - Ok(()) - } + // 16-byte generation implementation + process_16_bytes!(); #[inline(always)] fn write_float(&mut self, num: f64) -> io::Result<()> { From 40bb843919a4f88261cb77b4b12e1eb404419924 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 14:25:09 -0400 Subject: [PATCH 31/34] fix: does drone need clean? 
--- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 8f00d391..dbc33acb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -60,5 +60,5 @@ steps: commands: - rustup default nightly - rustup update - - cargo +nightly build --verbose --all + - cargo clean && cargo +nightly build --verbose --all - cargo +nightly test --verbose --all From 78b0aa8c6ef0b6002c3de7a0e53d86ec2a983381 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 14:44:31 -0400 Subject: [PATCH 32/34] fix: utf8 error checking (maybe) --- src/neon/stage1.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index b23f059f..8f3b9b5d 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -113,14 +113,9 @@ impl Default for Utf8CheckingState { #[inline] fn is_utf8_status_ok(has_error: int8x16_t) -> bool { unsafe { - let utf8_error_bits: u128 = mem::transmute( - vandq_s16( - mem::transmute(has_error), - mem::transmute(has_error) - ) - ); + let has_error_128 : i128 = mem::transmute(has_error); - utf8_error_bits as u16 == 0 + has_error_128 as u32 == 0 } } From 5a23b4058608d7e226c9e6c20a7d9c8354d1d122 Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 15:10:18 -0400 Subject: [PATCH 33/34] fix: utf8 error checking (maybe maybe) --- src/neon/stage1.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 8f3b9b5d..0f008574 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -115,7 +115,7 @@ fn is_utf8_status_ok(has_error: int8x16_t) -> bool { unsafe { let has_error_128 : i128 = mem::transmute(has_error); - has_error_128 as u32 == 0 + has_error_128 == 0 } } From af177f3cb3da9d6200ca4dd428103345d929357c Mon Sep 17 00:00:00 2001 From: Sunny Gleason Date: Fri, 16 Aug 2019 16:11:19 -0400 Subject: [PATCH 34/34] feat: fancy generic generator functions, thanks @Licenser --- src/avx2/generator.rs | 136 
++++++++++++++--------------------------- src/neon/generator.rs | 115 ++++++++++++++-------------------- src/neon/stage1.rs | 4 +- src/sse42/generator.rs | 111 ++++++++++++++------------------- src/value.rs | 2 +- src/value/generator.rs | 22 ++----- 6 files changed, 143 insertions(+), 247 deletions(-) diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs index c77a4ebc..13e72061 100644 --- a/src/avx2/generator.rs +++ b/src/avx2/generator.rs @@ -1,97 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; -#[macro_export] -macro_rules! process_32_bytes { - () => { - #[inline(always)] - unsafe fn process_32_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while *len - *idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), +use crate::value::generator::ESCAPED; +use std::io; - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 32; - } - } - Ok(()) - } - } -} - -#[macro_export] -macro_rules! process_16_bytes { - () => { - #[inline(always)] - unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while *len - *idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..*idx])); - *string = &string[*idx..]; - Ok(()) + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; } } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) } \ No newline at end of file diff --git a/src/neon/generator.rs b/src/neon/generator.rs index 7e0c8b60..6c8cf358 100644 --- a/src/neon/generator.rs +++ b/src/neon/generator.rs @@ -1,73 +1,48 @@ +use crate::value::generator::ESCAPED; +use std::io; +use crate::neon::intrinsics::*; +use crate::neon::stage1::neon_movemask; -#[macro_export] -macro_rules! process_32_bytes { - () => { - #[inline(always)] - unsafe fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result < () > { - Ok(()) - } - }; -} - -#[macro_export] -macro_rules! 
process_16_bytes { - () => { - #[inline(always)] - unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - #[cfg_attr(not(feature = "no-inline"), inline(always))] - unsafe fn __neon_movemask(input: uint8x16_t) -> u16 { - let bit_mask = uint8x16_t::new( - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 - ); - let minput: uint8x16_t = vandq_u8(input, bit_mask); - let tmp: uint8x16_t = vpaddq_u8(minput, minput); - let tmp = vpaddq_u8(tmp, tmp); - let tmp = vpaddq_u8(tmp, tmp); +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + // The case where we have a 16+ byte block + // we repeate the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), - vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) - } - - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = vdupq_n_u8(0); - let lower_quote_range = vdupq_n_u8(0x1F); - let quote = vdupq_n_u8(b'"'); - let backslash = vdupq_n_u8(b'\\'); - while *len - *idx > 16 { - // Load 16 bytes of data; - let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); - // Test the data against being backslash and quote. - let bs_or_quote = - vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = vandq_u8(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = vxorrq_u8(data, in_quote_range); - let in_range = vceqq_u8(is_unchanged, zero); - let quote_bits = __neon_movemask(vorrq_u8(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..*idx])); - *string = &string[*idx..]; - Ok(()) + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; } - }; -} \ No newline at end of file + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs index 0f008574..45322cee 100644 --- a/src/neon/stage1.rs +++ b/src/neon/stage1.rs @@ -18,7 +18,7 @@ macro_rules! 
bit_mask { } #[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn neon_movemask(input: uint8x16_t) -> u16 { +pub(crate) unsafe fn neon_movemask(input: uint8x16_t) -> u16 { let minput: uint8x16_t = vandq_u8(input, bit_mask!()); let tmp: uint8x16_t = vpaddq_u8(minput, minput); let tmp = vpaddq_u8(tmp, tmp); @@ -28,7 +28,7 @@ unsafe fn neon_movemask(input: uint8x16_t) -> u16 { } #[cfg_attr(not(feature = "no-inline"), inline(always))] -unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { +pub unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { let bit_mask = bit_mask!(); let t0 = vandq_u8(p0, bit_mask); diff --git a/src/sse42/generator.rs b/src/sse42/generator.rs index 4b3a90f3..e6636585 100644 --- a/src/sse42/generator.rs +++ b/src/sse42/generator.rs @@ -1,72 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; -#[macro_export] -macro_rules! process_32_bytes { - () => { - #[inline(always)] - unsafe fn process_32_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { - Ok(()) - } - } -} +use crate::value::generator::ESCAPED; +use std::io; -#[macro_export] -#[cfg(target_feature = "sse4.2")] -macro_rules! 
process_16_bytes { - () => { - #[inline(always)] - unsafe fn process_16_bytes(&mut self, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> { - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while *len - *idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..*idx + quote_dist])); - let ch = string[*idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), +#[inline(always)] +pub unsafe fn write_str_simd(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm_set1_epi8(0); + let lower_quote_range = _mm_set1_epi8(0x1F as i8); + let quote = _mm_set1_epi8(b'"' as i8); + let backslash = _mm_set1_epi8(b'\\' as i8); + while *len - *idx > 16 { + // Load 16 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm_or_si128( + _mm_cmpeq_epi8(data, backslash), + _mm_cmpeq_epi8(data, quote) + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm_and_si128(data, lower_quote_range); + // then test of the data is unchanged. aka: xor it with the + // Any field that was inside the quote range it will be zero + // now. 
+ let is_unchanged = _mm_xor_si128(data, in_quote_range); + let in_range = _mm_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), - escape => stry!(self.write(&[b'\\', escape])), - }; - *string = &string[*idx + quote_dist + 1..]; - *idx = 0; - *len = string.len(); - } else { - *idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..*idx])); - *string = &string[*idx..]; - Ok(()) + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; } } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) } - -#[macro_export] -#[cfg(not(target_feature = "sse4.2"))] -macro_rules! process_16_bytes { - () => { - #[inline(always)] - unsafe fn process_16_bytes(&mut self, _string: &mut &[u8], _len: &mut usize, _idx: &mut usize) -> io::Result<()> { - Ok(()) - } - } -} \ No newline at end of file diff --git a/src/value.rs b/src/value.rs index 06245878..c5d5aa87 100644 --- a/src/value.rs +++ b/src/value.rs @@ -11,7 +11,7 @@ /// we do not require prior knowledge sbout string comtent to to take advantage /// of it. 
pub mod borrowed; -mod generator; +pub(crate) mod generator; pub mod owned; pub use self::borrowed::{to_value as to_borrowed_value, Value as BorrowedValue}; diff --git a/src/value/generator.rs b/src/value/generator.rs index b053344f..ebdb0a88 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -10,21 +10,16 @@ use std::io::Write; use std::marker::PhantomData; use std::ptr; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(any(target_feature = "sse4.2", target_feature = "avx2"))] -use std::arch::x86_64::*; - use crate::*; #[cfg(target_feature = "avx2")] -pub use crate::avx2::generator::*; +use crate::avx2::generator::*; #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] -pub use crate::sse42::generator::*; +use crate::sse42::generator::*; #[cfg(target_feature = "neon")] -pub use crate::neon::generator::*; +use crate::neon::generator::*; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -37,7 +32,7 @@ const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string -static ESCAPED: [u8; 256] = [ +pub(crate) static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 @@ -113,8 +108,7 @@ pub trait BaseGenerator { // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. 
- stry!(self.process_32_bytes(&mut string, &mut len, &mut idx)); - stry!(self.process_32_bytes(&mut string, &mut len, &mut idx)); + stry!(write_str_simd(self.get_writer(), &mut string, &mut len, &mut idx)); } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() { @@ -127,12 +121,6 @@ pub trait BaseGenerator { self.write_char(b'"') } - // 32-byte generation implementation - process_32_bytes!(); - - // 16-byte generation implementation - process_16_bytes!(); - #[inline(always)] fn write_float(&mut self, num: f64) -> io::Result<()> { let mut buffer = ryu::Buffer::new();