simd-lite · sunnygleason · Aug 16, 2019 · Aug 8, 2019 · Aug 9, 2019 · Aug 9, 2019
diff --git a/.drone.yml b/.drone.yml
@@ -58,5 +58,7 @@ steps:
 - name: test
   image: rust:1
   commands:
-  - cargo build --verbose --all
-  - cargo test --verbose --all
+  - rustup default nightly
+  - rustup update
+  - cargo clean && cargo +nightly build --verbose --all
+  - cargo +nightly test --verbose --all
diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs
@@ -0,0 +1,51 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+use crate::value::generator::ESCAPED;
+use std::io;
+
+/// Streams `string` into `writer` 32 bytes at a time, escaping each `"`, `\` or byte
+/// <= 0x1F via the `ESCAPED` table. Scans from `*idx` while at least 32 of the `*len`
+/// bytes remain, then writes the scanned prefix and cuts it off `string`.
+///
+/// # Safety
+/// Requires `*idx <= *len <= string.len()` and AVX2 support; the loads are unchecked.
+#[inline(always)]
+pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write {
+    let zeroes = _mm256_set1_epi8(0);
+    let control_mask = _mm256_set1_epi8(0x1F as i8);
+    let quotes = _mm256_set1_epi8(b'"' as i8);
+    let backslashes = _mm256_set1_epi8(b'\\' as i8);
+    while *len - *idx >= 32 {
+        #[allow(clippy::cast_ptr_alignment)] // `loadu` is an unaligned load
+        let chunk: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i);
+        let is_quote = _mm256_cmpeq_epi8(chunk, quotes);
+        let is_backslash = _mm256_cmpeq_epi8(chunk, backslashes);
+        // `chunk ^ (chunk & 0x1F)` is zero only for bytes with no bits set above 0x1F.
+        let high_bits = _mm256_xor_si256(chunk, _mm256_and_si256(chunk, control_mask));
+        let is_control = _mm256_cmpeq_epi8(high_bits, zeroes);
+        let special = _mm256_or_si256(_mm256_or_si256(is_backslash, is_quote), is_control);
+        let special_bits = _mm256_movemask_epi8(special);
+        if special_bits == 0 {
+            *idx += 32;
+            continue;
+        }
+        // Flush everything before the first flagged byte, then its escape sequence.
+        let hit = *idx + special_bits.trailing_zeros() as usize;
+        stry!(writer.write_all(&string[0..hit]));
+        let byte = string[hit];
+        match ESCAPED[byte as usize] {
+            b'u' => stry!(write!(writer, "\\u{:04x}", byte)),
+            escape => stry!(writer.write_all(&[b'\\', escape])),
+        };
+        // Restart the scan right behind the escaped byte.
+        *string = &string[hit + 1..];
+        *idx = 0;
+        *len = string.len();
+    }
+    stry!(writer.write_all(&string[0..*idx]));
+    *string = &string[*idx..];
+    Ok(())
+}
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
@@ -1,3 +1,4 @@
 pub mod deser;
 pub mod stage1;
-pub mod utf8check;
+pub mod utf8check;
+pub mod generator;
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,4 +1,18 @@
 #![deny(warnings)]
+
+#![cfg_attr(target_feature = "neon", feature(
+    asm,
+    stdsimd,
+    repr_simd,
+    custom_inner_attributes,
+    aarch64_target_feature,
+    platform_intrinsics,
+    stmt_expr_attributes,
+    simd_ffi,
+    link_llvm_intrinsics
+    )
+)]
+
 #![cfg_attr(feature = "hints", feature(core_intrinsics))]
 //! simdjson-rs is a rust port of the simejson c++ library. It follows
 //! most of the design closely with a few exceptions to make it better
@@ -89,17 +103,25 @@ pub use crate::avx2::deser::*;
 #[cfg(target_feature = "avx2")]
 use crate::avx2::stage1::SIMDJSON_PADDING;
 
-#[cfg(not(target_feature = "avx2"))]
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))]
 mod sse42;
-#[cfg(not(target_feature = "avx2"))]
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))]
 pub use crate::sse42::deser::*;
-#[cfg(not(target_feature = "avx2"))]
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))]
 use crate::sse42::stage1::SIMDJSON_PADDING;
 
+#[cfg(target_feature = "neon")]
+mod neon;
+#[cfg(target_feature = "neon")]
+pub use crate::neon::deser::*;
+#[cfg(target_feature = "neon")]
+use crate::neon::stage1::SIMDJSON_PADDING;
+
 mod stage2;
 pub mod value;
 
 use crate::numberparse::Number;
+#[cfg(not(target_feature = "neon"))]
 use std::mem;
 use std::str;
 
@@ -163,7 +185,11 @@ impl<'de> Deserializer<'de> {
 
         let counts = Deserializer::validate(input, &structural_indexes)?;
 
-        let strings = Vec::with_capacity(len + SIMDJSON_PADDING);
+        // Set length to allow slice access in ARM code; the contents stay uninitialized
+        let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING);
+        unsafe {
+            strings.set_len(len + SIMDJSON_PADDING);
+        }
 
         Ok(Deserializer {
             counts,

diff --git a/src/neon/deser.rs b/src/neon/deser.rs
@@ -0,0 +1,199 @@
+
+pub use crate::error::{Error, ErrorType};
+pub use crate::Deserializer;
+pub use crate::Result;
+pub use crate::neon::stage1::*;
+pub use crate::neon::utf8check::*;
+pub use crate::neon::intrinsics::*;
+pub use crate::stringparse::*;
+
+impl<'de> Deserializer<'de> {
+    /// Parses the JSON string whose opening `"` is at `self.iidx` and returns its contents.
+    ///
+    /// The input is scanned 32 bytes at a time (two 16-byte NEON vectors). A string without
+    /// backslashes is returned as a slice of `self.input`. Otherwise the part from the first
+    /// backslash on is decoded into `self.strings` and copied back over the input, right
+    /// behind the clean prefix, so the result is again a slice of `self.input`.
+    ///
+    /// The bytes are reinterpreted as `str` as they are; no UTF-8 check happens here.
+    ///
+    /// # Errors
+    /// `ErrorType::InvlaidUnicodeCodepoint` (sic) when `handle_unicode_codepoint` fails or
+    /// reports zero output bytes, `ErrorType::InvalidEscape` when `ESCAPE_MAP` yields 0.
+    #[cfg_attr(not(feature = "no-inline"), inline(always))]
+    pub fn parse_str_(&mut self) -> Result<&'de str> {
+        // Add 1 to skip the initial "
+        let idx = self.iidx + 1;
+        // Scratch block used whenever fewer than 32 input bytes remain.
+        let mut padding = [0u8; 32];
+        // SAFETY: NOTE(review) assumes `idx <= self.input.len()`; nothing here checks it -
+        // presumably the earlier structural pass guarantees it. Confirm.
+        let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) };
+        // `src_i` is the read position in `src`; `len` counts the bytes before the first
+        // backslash, i.e. those that can stay in the input untouched.
+        let mut src_i: usize = 0;
+        let mut len = src_i;
+        // Phase 1: look for the closing quote or the first backslash without copying anything.
+        loop {
+            let (v0, v1) = if src.len() >= src_i + 32 {
+                // SAFETY: the branch condition guarantees 32 readable bytes at `src_i`.
+                #[allow(clippy::cast_ptr_alignment)]
+                unsafe {
+                    (
+                        vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()),
+                        vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()),
+                    )
+                }
+            } else {
+                // SAFETY: fewer than 32 bytes remain (assuming `src_i <= src.len()`), so they
+                // fit into `padding`, and the loads read that 32-byte block instead of `src`.
+                unsafe {
+                    padding
+                        .get_unchecked_mut(..src.len() - src_i)
+                        .clone_from_slice(src.get_unchecked(src_i..));
+                    (
+                        vld1q_u8(padding.get_unchecked(0..16).as_ptr()),
+                        vld1q_u8(padding.get_unchecked(16..32).as_ptr()),
+                    )
+                }
+            };
+
+            let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1);
+
+            // `bs_bits.wrapping_sub(1)` has every bit below the lowest backslash bit set (all
+            // bits if there is no backslash). A byte cannot be both, so the test is non-zero
+            // exactly when a quote comes before the first backslash.
+            if (bs_bits.wrapping_sub(1) & quote_bits) != 0 {
+                // No unescaping needed: return the string straight from the input.
+                let quote_dist: u32 = quote_bits.trailing_zeros();
+                len += quote_dist as usize;
+                // SAFETY: `idx..idx + len` covers the bytes scanned before the quote.
+                unsafe {
+                    let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str;
+                    return Ok(&*v);
+                }
+            }
+            if (quote_bits.wrapping_sub(1) & bs_bits) != 0 {
+                // A backslash comes first: stop right on it and switch to phase 2.
+                let bs_dist: u32 = bs_bits.trailing_zeros();
+                len += bs_dist as usize;
+                src_i += bs_dist as usize;
+                break;
+            } else {
+                // they are the same. Since they can't co-occur, it means we encountered
+                // neither.
+                src_i += 32;
+                len += 32;
+            }
+        }
+
+        // Phase 2: unescape. Blocks are copied into `self.strings` and patched there.
+        let mut dst_i: usize = 0;
+        let dst: &mut [u8] = self.strings.as_mut_slice();
+
+        loop {
+            // store to dest unconditionally - we can overwrite the bits we don't like
+            // later
+            let (v0, v1) = if src.len() >= src_i + 32 {
+                // SAFETY: the branch condition guarantees 32 readable bytes at `src_i`.
+                // NOTE(review): assumes `dst` has 32 writable bytes at `dst_i` - confirm
+                // against the length given to `self.strings`.
+                #[allow(clippy::cast_ptr_alignment)]
+                unsafe {
+                    dst.get_unchecked_mut(dst_i..dst_i + 32)
+                        .copy_from_slice(src.get_unchecked(src_i..src_i + 32));
+                    (
+                        vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()),
+                        vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()),
+                    )
+                }
+            } else {
+                // SAFETY: fewer than 32 bytes remain (assuming `src_i <= src.len()`), so they
+                // are staged in `padding`, and both the copy and the loads use that block.
+                // Copying 32 bytes straight from `src` here would read past its end.
+                unsafe {
+                    padding
+                        .get_unchecked_mut(..src.len() - src_i)
+                        .clone_from_slice(src.get_unchecked(src_i..));
+                    dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(&padding);
+                    (
+                        vld1q_u8(padding.get_unchecked(0..16).as_ptr()),
+                        vld1q_u8(padding.get_unchecked(16..32).as_ptr()),
+                    )
+                }
+            };
+
+            let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1);
+
+            if (bs_bits.wrapping_sub(1) & quote_bits) != 0 {
+                // The closing quote comes first: finish up.
+                let quote_dist: u32 = quote_bits.trailing_zeros();
+                dst_i += quote_dist as usize;
+                // Move the decoded tail back into the input, right behind the `len` bytes
+                // that needed no unescaping, so that the returned slice is contiguous.
+                // SAFETY: NOTE(review) assumes `idx + len + dst_i` stays inside the input,
+                // i.e. that decoding never makes the text longer - confirm.
+                unsafe {
+                    self.input
+                        .get_unchecked_mut(idx + len..idx + len + dst_i)
+                        .clone_from_slice(&self.strings.get_unchecked(..dst_i));
+                    let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8]
+                        as *const str;
+                    self.str_offset += dst_i as usize;
+                    return Ok(&*v);
+                }
+            }
+            if (quote_bits.wrapping_sub(1) & bs_bits) != 0 {
+                // find out where the backslash is
+                let bs_dist: u32 = bs_bits.trailing_zeros();
+                // SAFETY: NOTE(review) reads the byte behind the backslash from `src` itself;
+                // assumes a backslash is never the last byte of the input - confirm.
+                let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) };
+                if escape_char == b'u' {
+                    // move src/dst up to the start; they will be further adjusted
+                    // within the unicode codepoint handling code.
+                    src_i += bs_dist as usize;
+                    dst_i += bs_dist as usize;
+                    let (o, s) = if let Ok(r) = handle_unicode_codepoint(
+                            unsafe { src.get_unchecked(src_i..) },
+                            unsafe { dst.get_unchecked_mut(dst_i..) }
+                    )
+                    {
+                        r
+                    } else {
+                        return Err(self.error(ErrorType::InvlaidUnicodeCodepoint));
+                    };
+                    if o == 0 {
+                        return Err(self.error(ErrorType::InvlaidUnicodeCodepoint));
+                    };
+                    // Advance by what the helper reported: `s` bytes on the source side and
+                    // `o` bytes on the destination side.
+                    src_i += s;
+                    dst_i += o;
+                } else {
+                    // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
+                    // write bs_dist+1 characters to output
+                    // note this may reach beyond the part of the buffer we've actually
+                    // seen. I think this is ok
+                    // SAFETY: NOTE(review) presumably `ESCAPE_MAP` has an entry for every `u8`.
+                    let escape_result: u8 =
+                        unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) };
+                    // A zero entry marks a character that is not a valid escape.
+                    if escape_result == 0 {
+                        return Err(self.error(ErrorType::InvalidEscape));
+                    }
+                    unsafe {
+                        *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result;
+                    }
+                    src_i += bs_dist as usize + 2;
+                    dst_i += bs_dist as usize + 1;
+                }
+            } else {
+                // they are the same. Since they can't co-occur, it means we encountered
+                // neither.
+                src_i += 32;
+                dst_i += 32;
+            }
+        }
+    }
+}
diff --git a/src/neon/generator.rs b/src/neon/generator.rs
@@ -0,0 +1,48 @@
+use crate::value::generator::ESCAPED;
+use std::io;
+use crate::neon::intrinsics::*;
+use crate::neon::stage1::neon_movemask;
+
+/// Streams `string` into `writer` 16 bytes at a time, escaping each `"`, `\` or byte
+/// <= 0x1F via the `ESCAPED` table. Scans from `*idx` while at least 16 of the `*len`
+/// bytes remain, then writes the scanned prefix and cuts it off `string`.
+///
+/// # Safety
+/// Requires `*idx <= *len <= string.len()`; the 16-byte NEON loads are not bounds checked.
+#[inline(always)]
+pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write {
+    let zero = vdupq_n_u8(0);
+    let lower_quote_range = vdupq_n_u8(0x1F);
+    let quote = vdupq_n_u8(b'"');
+    let backslash = vdupq_n_u8(b'\\');
+    // `>=` rather than `>`: a trailing block of exactly 16 bytes is still a full vector.
+    while *len - *idx >= 16 {
+        let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx));
+        // Test the data against being backslash and quote.
+        let bs_or_quote = vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote));
+        // Mask the data with 0x1F, then xor the result with the data: a lane ends up
+        // zero only if the byte had no bits set above 0x1F.
+        let in_quote_range = vandq_u8(data, lower_quote_range);
+        let is_unchanged = vxorrq_u8(data, in_quote_range);
+        let in_range = vceqq_u8(is_unchanged, zero);
+        let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range));
+        if quote_bits != 0 {
+            // Flush what precedes the first flagged byte, escape it, rescan behind it.
+            let quote_dist = quote_bits.trailing_zeros() as usize;
+            stry!(writer.write_all(&string[0..*idx + quote_dist]));
+            let ch = string[*idx + quote_dist];
+            match ESCAPED[ch as usize] {
+                b'u' => stry!(write!(writer, "\\u{:04x}", ch)),
+                escape => stry!(writer.write_all(&[b'\\', escape])),
+            };
+            *string = &string[*idx + quote_dist + 1..];
+            *idx = 0;
+            *len = string.len();
+        } else {
+            *idx += 16;
+        }
+    }
+    stry!(writer.write_all(&string[0..*idx]));
+    *string = &string[*idx..];
+    Ok(())
+}