diff --git a/.drone.yml b/.drone.yml index 9b8e8bb2..dbc33acb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -58,5 +58,7 @@ steps: - name: test image: rust:1 commands: - - cargo build --verbose --all - - cargo test --verbose --all + - rustup default nightly + - rustup update + - cargo clean && cargo +nightly build --verbose --all + - cargo +nightly test --verbose --all diff --git a/src/avx2/generator.rs b/src/avx2/generator.rs new file mode 100644 index 00000000..13e72061 --- /dev/null +++ b/src/avx2/generator.rs @@ -0,0 +1,51 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use crate::value::generator::ESCAPED; +use std::io; + +#[inline(always)] +pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + let zero = _mm256_set1_epi8(0); + let lower_quote_range = _mm256_set1_epi8(0x1F as i8); + let quote = _mm256_set1_epi8(b'"' as i8); + let backslash = _mm256_set1_epi8(b'\\' as i8); + while *len - *idx >= 32 { + // Load 32 bytes of data; + #[allow(clippy::cast_ptr_alignment)] + let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(*idx) as *const __m256i); + // Test the data against being backslash and quote. + let bs_or_quote = _mm256_or_si256( + _mm256_cmpeq_epi8(data, backslash), + _mm256_cmpeq_epi8(data, quote), + ); + // Now mask the data with the quote range (0x1F). + let in_quote_range = _mm256_and_si256(data, lower_quote_range); + // then test if the data is unchanged, i.e. xor it with the masked data. + // Any byte that was inside the quote range will be zero + // now. + let is_unchanged = _mm256_xor_si256(data, in_quote_range); + let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); + let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 32; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} \ No newline at end of file diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 30c55c86..ac608ae2 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -1,3 +1,4 @@ pub mod deser; pub mod stage1; -pub mod utf8check; \ No newline at end of file +pub mod utf8check; +pub mod generator; \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 02038553..b68deadb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,18 @@ #![deny(warnings)] + +#![cfg_attr(target_feature = "neon", feature( + asm, + stdsimd, + repr_simd, + custom_inner_attributes, + aarch64_target_feature, + platform_intrinsics, + stmt_expr_attributes, + simd_ffi, + link_llvm_intrinsics + ) +)] + +#![cfg_attr(feature = "hints", feature(core_intrinsics))] //! simdjson-rs is a rust port of the simdjson c++ library. It follows //!
most of the design closely with a few exceptions to make it better @@ -89,17 +103,25 @@ pub use crate::avx2::deser::*; #[cfg(target_feature = "avx2")] use crate::avx2::stage1::SIMDJSON_PADDING; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] mod sse42; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] pub use crate::sse42::deser::*; -#[cfg(not(target_feature = "avx2"))] +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] use crate::sse42::stage1::SIMDJSON_PADDING; +#[cfg(target_feature = "neon")] +mod neon; +#[cfg(target_feature = "neon")] +pub use crate::neon::deser::*; +#[cfg(target_feature = "neon")] +use crate::neon::stage1::SIMDJSON_PADDING; + mod stage2; pub mod value; use crate::numberparse::Number; +#[cfg(not(target_feature = "neon"))] use std::mem; use std::str; @@ -163,7 +185,11 @@ impl<'de> Deserializer<'de> { let counts = Deserializer::validate(input, &structural_indexes)?; - let strings = Vec::with_capacity(len + SIMDJSON_PADDING); + // Set length to allow slice access in ARM code + let mut strings = Vec::with_capacity(len + SIMDJSON_PADDING); + unsafe { + strings.set_len(len + SIMDJSON_PADDING); + } Ok(Deserializer { counts, diff --git a/src/neon/deser.rs b/src/neon/deser.rs new file mode 100644 index 00000000..5d70b7af --- /dev/null +++ b/src/neon/deser.rs @@ -0,0 +1,199 @@ + +pub use crate::error::{Error, ErrorType}; +pub use crate::Deserializer; +pub use crate::Result; +pub use crate::neon::stage1::*; +pub use crate::neon::utf8check::*; +pub use crate::neon::intrinsics::*; +pub use crate::stringparse::*; + +impl<'de> Deserializer<'de> { + #[cfg_attr(not(feature = "no-inline"), inline(always))] + pub fn parse_str_(&mut self) -> Result<&'de str> { + // Add 1 to skip the initial " + let idx = self.iidx + 1; + let mut padding = [0u8; 32]; + //let mut read: usize = 0; + + // we include the terminal '"' so we know where to end + // This is safe since we check sub's length in the range access above and only + // create sub slices from sub to `sub.len()`. + + let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) }; + let mut src_i: usize = 0; + let mut len = src_i; + loop { + // store to dest unconditionally - we can overwrite the bits we don't like + // later + + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?)
+ // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the pointer, accounting for the fact that we have a NULL termination + + len += quote_dist as usize; + unsafe { + let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // Move to the 'bad' character + let bs_dist: u32 = bs_bits.trailing_zeros(); + len += bs_dist as usize; + src_i += bs_dist as usize; + break; + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + len += 32; + } + } + + let mut dst_i: usize = 0; + let dst: &mut [u8] = self.strings.as_mut_slice(); + + loop { + let (v0, v1) = if src.len() >= src_i + 32 { + // This is safe since we ensure src is at least 16 wide + #[allow(clippy::cast_ptr_alignment)] + unsafe { + ( + vld1q_u8(src.get_unchecked(src_i..src_i + 16).as_ptr()), + vld1q_u8(src.get_unchecked(src_i + 16..src_i + 32).as_ptr()), + ) + } + } else { + unsafe { + padding + .get_unchecked_mut(..src.len() - src_i) + .clone_from_slice(src.get_unchecked(src_i..)); + // This is safe since we ensure src is at least 32 wide + ( + vld1q_u8(padding.get_unchecked(0..16).as_ptr()), + vld1q_u8(padding.get_unchecked(16..32).as_ptr()), + ) + } + }; + + unsafe { + dst.get_unchecked_mut(dst_i..dst_i + 32).copy_from_slice(src.get_unchecked(src_i..src_i + 32)); + } + + // store to dest unconditionally - we can overwrite the bits we don't like + // later + let ParseStringHelper { bs_bits, quote_bits } = find_bs_bits_and_quote_bits(v0, v1); + + if (bs_bits.wrapping_sub(1) & quote_bits) != 0 { + // we encountered quotes first. Move dst to point to quotes and exit + // find out where the quote is... + let quote_dist: u32 = quote_bits.trailing_zeros(); + + /////////////////////// + // Above, check for overflow in case someone has a crazy string (>=4GB?) + // But only add the overflow check when the document itself exceeds 4GB + // Currently unneeded because we refuse to parse docs larger or equal to 4GB. + //////////////////////// + + // we advance the pointer, accounting for the fact that we have a NULL termination + + dst_i += quote_dist as usize; + unsafe { + self.input + .get_unchecked_mut(idx + len..idx + len + dst_i) + .clone_from_slice(&self.strings.get_unchecked(..dst_i)); + let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8] + as *const str; + self.str_offset += dst_i as usize; + return Ok(&*v); + } + + // we compare the pointers since we care if they are 'at the same spot' + // not if they are the same value + } + if (quote_bits.wrapping_sub(1) & bs_bits) != 0 { + // find out where the backslash is + let bs_dist: u32 = bs_bits.trailing_zeros(); + let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) }; + // we encountered backslash first. Handle backslash + if escape_char == b'u' { + // move src/dst up to the start; they will be further adjusted + // within the unicode codepoint handling code. + src_i += bs_dist as usize; + dst_i += bs_dist as usize; + let (o, s) = if let Ok(r) = handle_unicode_codepoint( + unsafe { src.get_unchecked(src_i..) }, + unsafe { dst.get_unchecked_mut(dst_i..)
} + ) + { + r + } else { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + if o == 0 { + return Err(self.error(ErrorType::InvlaidUnicodeCodepoint)); + }; + // We moved o steps forward at the destination and 6 on the source + src_i += s; + dst_i += o; + } else { + // simple 1:1 conversion. Will eat bs_dist+2 characters in input and + // write bs_dist+1 characters to output + // note this may reach beyond the part of the buffer we've actually + // seen. I think this is ok + let escape_result: u8 = + unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) }; + if escape_result == 0 { + return Err(self.error(ErrorType::InvalidEscape)); + } + unsafe { + *dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result; + } + src_i += bs_dist as usize + 2; + dst_i += bs_dist as usize + 1; + } + } else { + // they are the same. Since they can't co-occur, it means we encountered + // neither. + src_i += 32; + dst_i += 32; + } + } + } +} \ No newline at end of file diff --git a/src/neon/generator.rs b/src/neon/generator.rs new file mode 100644 index 00000000..6c8cf358 --- /dev/null +++ b/src/neon/generator.rs @@ -0,0 +1,48 @@ +use crate::value::generator::ESCAPED; +use std::io; +use crate::neon::intrinsics::*; +use crate::neon::stage1::neon_movemask; + +#[inline(always)] +pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write { + // The case where we have a 16+ byte block + // we repeat the same logic as above but with + // only 16 bytes + let zero = vdupq_n_u8(0); + let lower_quote_range = vdupq_n_u8(0x1F); + let quote = vdupq_n_u8(b'"'); + let backslash = vdupq_n_u8(b'\\'); + while *len - *idx > 16 { + // Load 16 bytes of data; + let data: uint8x16_t = vld1q_u8(string.as_ptr().add(*idx)); + // Test the data against being backslash and quote. + let bs_or_quote = + vorrq_u8(vceqq_u8(data, backslash), vceqq_u8(data, quote)); + // Now mask the data with the quote range (0x1F). + let in_quote_range = vandq_u8(data, lower_quote_range); + // then test if the data is unchanged, i.e. xor it with the masked data. + // Any byte that was inside the quote range will be zero + // now. + let is_unchanged = vxorrq_u8(data, in_quote_range); + let in_range = vceqq_u8(is_unchanged, zero); + let quote_bits = neon_movemask(vorrq_u8(bs_or_quote, in_range)); + if quote_bits != 0 { + let quote_dist = quote_bits.trailing_zeros() as usize; + stry!(writer.write_all(&string[0..*idx + quote_dist])); + let ch = string[*idx + quote_dist]; + match ESCAPED[ch as usize] { + b'u' => stry!(write!(writer, "\\u{:04x}", ch)), + + escape => stry!(writer.write_all(&[b'\\', escape])), + }; + *string = &string[*idx + quote_dist + 1..]; + *idx = 0; + *len = string.len(); + } else { + *idx += 16; + } + } + stry!(writer.write_all(&string[0..*idx])); + *string = &string[*idx..]; + Ok(()) +} diff --git a/src/neon/intrinsics.rs b/src/neon/intrinsics.rs new file mode 100644 index 00000000..c5c98cb6 --- /dev/null +++ b/src/neon/intrinsics.rs @@ -0,0 +1,557 @@ +//use std::arch:: + +use crate::neon::simd_llvm; + +use std::mem; +use core; + +#[allow(unused)] +macro_rules!
types { + ($( + $(#[$doc:meta])* + pub struct $name:ident($($fields:tt)*); + )*) => ($( + $(#[$doc])* + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + #[repr(simd)] + #[allow(clippy::missing_inline_in_public_items)] + pub struct $name($($fields)*); + )*) +} + +#[allow(non_camel_case_types)] +pub type poly64_t = i64; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.aarch64.neon.addp.v16u8"] + fn vpaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.pmull64"] + fn vmull_p64_(a: i64, b: i64) -> int8x16_t; + #[link_name = "llvm.aarch64.neon.uqxtn.v2u32"] + fn vqmovn_u64_(a: uint64x2_t) -> uint32x2_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16u8"] + fn vqsubq_u8_(a: uint8x16_t, a: uint8x16_t) -> uint8x16_t; + #[link_name = "llvm.aarch64.neon.uqsub.v16i8"] + fn vqsubq_s8_(a: int8x16_t, a: int8x16_t) -> int8x16_t; +} + +#[inline] +unsafe fn vaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +unsafe fn vaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_llvm::simd_add(mem::transmute(a), mem::transmute(b)) +} + +#[inline] +pub unsafe fn vnegq_u8(a: uint8x16_t) -> uint8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + +#[inline] +pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t { + let x: u128 = mem::transmute(a); + let nx = !x; + mem::transmute(nx) +} + + +#[inline] +fn rotate_(a: u128, b: u128, n: u128) -> u128 { + let az = a >> (n * 8); + let bz = b << (128 - (n * 8)); + az | bz +} + +#[inline] +pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: u8) -> uint8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vextq_s8(a: int8x16_t, b: int8x16_t, n: u8) -> int8x16_t { + mem::transmute(rotate_(mem::transmute(a), mem::transmute(b), n as u128)) +} + +#[inline] +pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { + mem::transmute(vmull_p64_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + unsafe { vpaddq_u8_(a, b) } +} + +#[inline] +pub unsafe fn vshrq_n_u8(a: uint8x16_t, n: u8) -> uint8x16_t { + uint8x16_t( + a.0 >> n, + a.1 >> n, + a.2 >> n, + a.3 >> n, + a.4 >> n, + a.5 >> n, + a.6 >> n, + a.7 >> n, + a.8 >> n, + a.9 >> n, + a.10 >> n, + a.11 >> n, + a.12 >> n, + a.13 >> n, + a.14 >> n, + a.15 >> n, + ) +} + +types! { + /// ARM-specific 64-bit wide vector of eight packed `i8`. + pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); + /// ARM-specific 64-bit wide vector of eight packed `u8`. + pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. + pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide vector of four packed `i16`. + pub struct int16x4_t(i16, i16, i16, i16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct uint16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of four packed `u16`. + pub struct poly16x4_t(u16, u16, u16, u16); + /// ARM-specific 64-bit wide vector of two packed `i32`. + pub struct int32x2_t(i32, i32); + /// ARM-specific 64-bit wide vector of two packed `u32`. + pub struct uint32x2_t(u32, u32); + /// ARM-specific 64-bit wide vector of two packed `f32`. 
+ pub struct float32x2_t(f32, f32); + /// ARM-specific 64-bit wide vector of one packed `i64`. + pub struct int64x1_t(i64); + /// ARM-specific 64-bit wide vector of one packed `u64`. + pub struct uint64x1_t(u64); + /// ARM-specific 128-bit wide vector of sixteen packed `i8`. + pub struct int8x16_t( + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + i8, i8 ,i8, i8, i8, i8 ,i8, i8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct uint8x16_t( + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + u8, u8 ,u8, u8, u8, u8 ,u8, u8, + ); + /// ARM-specific 128-bit wide vector of sixteen packed `u8`. + pub struct poly8x16_t( + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + ); + /// ARM-specific 128-bit wide vector of eight packed `i16`. + pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of eight packed `u16`. + pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16); + /// ARM-specific 128-bit wide vector of four packed `i32`. + pub struct int32x4_t(i32, i32, i32, i32); + /// ARM-specific 128-bit wide vector of four packed `u32`. + pub struct uint32x4_t(u32, u32, u32, u32); + /// ARM-specific 128-bit wide vector of four packed `f32`. + pub struct float32x4_t(f32, f32, f32, f32); + /// ARM-specific 128-bit wide vector of two packed `i64`. + pub struct int64x2_t(i64, i64); + /// ARM-specific 128-bit wide vector of two packed `u64`. + pub struct uint64x2_t(u64, u64); + /// ARM-specific 128-bit wide vector of one packed `i128`. + pub struct poly128_t(i128); // FIXME: check this! +} + +impl uint8x16_t { + #[inline] + pub fn new(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8, i: u8, j: u8, k: u8, l: u8, m: u8, n: u8, o: u8, p: u8) -> uint8x16_t { + uint8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int8x16_t { + #[inline] + pub fn new(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> int8x16_t { + int8x16_t(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } +} + +impl int32x4_t { + #[inline] + pub fn new(a: i32, b: i32, c: i32, d: i32) -> int32x4_t { + int32x4_t(a, b, c, d) + } +} + +//#[inline] +//pub fn add_overflow(a: u64, b: u64, out: &mut u64) -> bool { +// let (carry, did_carry) = a.overflowing_add(b); +// *out = carry; +// did_carry +//} + +#[inline] +pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { + *(addr as *const int8x16_t) +} + +#[inline] +pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { + *(addr as *const uint8x16_t) +} + +#[inline] +pub unsafe fn vst1q_u8(addr: *mut u8, val: uint8x16_t) { + std::ptr::write(addr as *mut uint8x16_t, val); +} + +macro_rules! aarch64_simd_2 { + ($name: ident, $type: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + aarch64_simd_2!($name, $type, $type, $simd_fn, $intrarm, $intraarch); + }; + ($name: ident, $type: ty, $res: ty, $simd_fn: ident, $intrarm: ident, $intraarch: ident) => { + #[inline] + pub fn $name(a: $type, b: $type) -> $res { + unsafe { simd_llvm::$simd_fn(a, b) } + } + } +} + +macro_rules! 
aarch64_simd_ceq { + ($name: ident, $type: ty, $res: ty) => { + /// Compare bitwise Equal (vector) + aarch64_simd_2!($name, $type, $res, simd_eq, cmeq, cmeq); + }; +} + +aarch64_simd_ceq!(vceq_s8, int8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_s8, int8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_s16, int16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_s16, int16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_s32, int32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_s32, int32x4_t, uint32x4_t); +aarch64_simd_ceq!(vceq_u8, uint8x8_t, uint8x8_t); +aarch64_simd_ceq!(vceqq_u8, uint8x16_t, uint8x16_t); +aarch64_simd_ceq!(vceq_u16, uint16x4_t, uint16x4_t); +aarch64_simd_ceq!(vceqq_u16, uint16x8_t, uint16x8_t); +aarch64_simd_ceq!(vceq_u32, uint32x2_t, uint32x2_t); +aarch64_simd_ceq!(vceqq_u32, uint32x4_t, uint32x4_t); +aarch64_simd_2!(vceq_f32, float32x2_t, uint32x2_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_2!(vceqq_f32, float32x4_t, uint32x4_t, simd_eq, fcmeq, fcmeq); +aarch64_simd_ceq!(vceq_p8, poly8x8_t, poly8x8_t); +aarch64_simd_ceq!(vceqq_p8, poly8x16_t, poly8x16_t); + +macro_rules! aarch64_simd_cgt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than (vector) + aarch64_simd_2!($name, $type, $res, simd_gt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cgtu { +// ($name: ident, $type: ty) => { +// /// Compare Greater than (vector) +// aarch64_simd_2!($name, $type, simd_gt, cmhi); +// }; +//} + +aarch64_simd_cgt!(vcgt_s8, int8x8_t, uint8x8_t); +aarch64_simd_cgt!(vcgtq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cgt!(vcgt_s16, int16x4_t, uint16x4_t); +aarch64_simd_cgt!(vcgtq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cgt!(vcgt_s32, int32x2_t, uint32x2_t); +aarch64_simd_cgt!(vcgtq_s32, int32x4_t, uint32x4_t); + +//aarch64_simd_cgtu!(vcgtq_u8, uint8x16_t); +//aarch64_simd_cgt!(vcgt_s64, int64x1_t); +//aarch64_simd_cgt!(vcgtq_s64, int64x2_t); +//aarch64_simd_cgtu!(vcgt_u64, uint64x1_t); +//aarch64_simd_cgtu!(vcgtq_u64, uint64x2_t); + +macro_rules! aarch64_simd_clt { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than (vector) + aarch64_simd_2!($name, $type, $res, simd_lt, cmgt, cmgt); + }; +} + +//macro_rules! aarch64_simd_cltu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_lt, cmhi); +//}; +//} + +aarch64_simd_clt!(vclt_s8, int8x8_t, uint8x8_t); +aarch64_simd_clt!(vcltq_s8, int8x16_t, uint8x16_t); +aarch64_simd_clt!(vclt_s16, int16x4_t, uint16x4_t); +aarch64_simd_clt!(vcltq_s16, int16x8_t, uint16x8_t); +aarch64_simd_clt!(vclt_s32, int32x2_t, uint32x2_t); +aarch64_simd_clt!(vcltq_s32, int32x4_t, uint32x4_t); + +//arm_simd_cltu!(vclt_u8, uint8x8_t); +//arm_simd_cltu!(vcltq_u8, uint8x16_t); +//arm_simd_cltu!(vclt_u16, uint16x4_t); +//arm_simd_cltu!(vcltq_u16, uint16x8_t); +//arm_simd_cltu!(vclt_u32, uint32x2_t); +//arm_simd_cltu!(vcltq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cge { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Greater than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_ge, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cgeu { +//( $ name: ident, $ type: ty) => { +///// Compare Greater than (vector) +//aarch64_simd_2 ! 
( $ name, $ type, simd_ge, cmhs); +//}; +//} + +aarch64_simd_cge!(vcge_s8, int8x8_t, uint8x8_t); +aarch64_simd_cge!(vcgeq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cge!(vcge_s16, int16x4_t, uint16x4_t); +aarch64_simd_cge!(vcgeq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cge!(vcge_s32, int32x2_t, uint32x2_t); +aarch64_simd_cge!(vcgeq_s32, int32x4_t, uint32x4_t); +//arm_simd_cgeu!(vcge_u8, uint8x8_t); +//arm_simd_cgeu!(vcgeq_u8, uint8x16_t); +//arm_simd_cgeu!(vcge_u16, uint16x4_t); +//arm_simd_cgeu!(vcgeq_u16, uint16x8_t); +//arm_simd_cgeu!(vcge_u32, uint32x2_t); +//arm_simd_cgeu!(vcgeq_u32, uint32x4_t); + +macro_rules! aarch64_simd_cle { + ($name:ident, $type:ty, $res:ty) => { + /// Compare signed Lesser than equals (vector) + aarch64_simd_2!($name, $type, $res, simd_le, cmge, cmge); + }; +} + +//macro_rules! aarch64_simd_cleu { +//( $ name: ident, $ type: ty) => { +///// Compare Lesser than (vector) +//aarch64_simd_2 ! ( $ name, $ type, simd_le, cmhs); +//}; +//} + +aarch64_simd_cle!(vcle_s8, int8x8_t, uint8x8_t); +aarch64_simd_cle!(vcleq_s8, int8x16_t, uint8x16_t); +aarch64_simd_cle!(vcle_s16, int16x4_t, uint16x4_t); +aarch64_simd_cle!(vcleq_s16, int16x8_t, uint16x8_t); +aarch64_simd_cle!(vcle_s32, int32x2_t, uint32x2_t); +aarch64_simd_cle!(vcleq_s32, int32x4_t, uint32x4_t); +//arm_simd_cleu!(vcle_u8, uint8x8_t); +aarch64_simd_cle!(vcleq_u8, uint8x16_t, uint8x16_t); +//arm_simd_cleu!(vcle_u16, uint16x4_t); +//arm_simd_cleu!(vcleq_u16, uint16x8_t); +//arm_simd_cleu!(vcle_u32, uint32x2_t); +//arm_simd_cleu!(vcleq_u32, uint32x4_t); + +#[inline] +pub fn vdupq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zeroi8x16() -> int8x16_t { + int8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub fn vdupq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_u8(a: u8) -> uint8x16_t { + uint8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn vmovq_n_s8(a: i8) -> int8x16_t { + int8x16_t(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +#[inline] +pub fn zerou8x16() -> uint8x16_t { + uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00) +} + +#[inline] +pub unsafe fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + mem::transmute(vaddq_u8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + mem::transmute(vaddq_s8_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub unsafe fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + mem::transmute(vaddq_s32_(mem::transmute(a), mem::transmute(b))) +} + +#[inline] +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { unsafe { simd_llvm::simd_and(a, b) } } +#[inline] +pub fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { simd_llvm::simd_or(a, b) } } +#[inline] +pub fn vxorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { unsafe { simd_llvm::simd_xor(a, b) } } +#[inline] +pub fn vxorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { unsafe { 
simd_llvm::simd_xor(a, b) } } + +macro_rules! arm_reinterpret { + ($name: ident, $from: ty, $to: ty) => { + // Vector reinterpret cast operation + #[inline] + pub fn $name(a: $from) -> $to { + unsafe { mem::transmute(a) } + } + }; +} + +arm_reinterpret!(vreinterpret_u64_u32, uint32x2_t, uint64x1_t); +arm_reinterpret!(vreinterpretq_u64_u32, uint32x4_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_s8_u8, uint8x16_t, int8x16_t); +arm_reinterpret!(vreinterpretq_u16_u8, uint8x16_t, uint16x8_t); +arm_reinterpret!(vreinterpretq_u32_u8, uint8x16_t, uint32x4_t); +arm_reinterpret!(vreinterpretq_u64_u8, uint8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u64_s8, int8x16_t, uint64x2_t); +arm_reinterpret!(vreinterpretq_u8_s8, int8x16_t, uint8x16_t); + +arm_reinterpret!(vreinterpretq_s16_s8, int8x16_t, int16x8_t); +arm_reinterpret!(vreinterpretq_s32_s8, int8x16_t, int32x4_t); +arm_reinterpret!(vreinterpretq_s64_s8, int8x16_t, int64x2_t); + +macro_rules! arm_vget_lane { + ($name: ident, $to: ty, $from: ty, $lanes: literal) => { + #[inline] + pub unsafe fn $name(v: $from, lane: u32) -> $ to { + simd_llvm::simd_extract(v, lane) + } + }; +} + +arm_vget_lane!(vgetq_lane_u16, u16, uint16x8_t, 7); +arm_vget_lane!(vgetq_lane_u32, u32, uint32x4_t, 3); +arm_vget_lane!(vgetq_lane_u64, u64, uint64x2_t, 1); +arm_vget_lane!(vget_lane_u64, u64, uint64x1_t, 0); + +arm_vget_lane!(vgetq_lane_s16, i16, int16x8_t, 7); +arm_vget_lane!(vgetq_lane_s32, i32, int32x4_t, 3); +arm_vget_lane!(vgetq_lane_s64, i64, int64x2_t, 1); +arm_vget_lane!(vget_lane_s64, i64, int64x1_t, 0); + +#[inline] +pub unsafe fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t { + vqmovn_u64_(a) +} + +#[inline] +pub unsafe fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t { + mem::transmute(core::arch::aarch64::vqtbl1q_s8(mem::transmute(t), mem::transmute(idx))) +} + +#[inline] +pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + vqsubq_u8_(a, b) +} + +#[inline] +pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + vqsubq_s8_(a, b) +} + +#[inline] +fn test_u8(a: u8, b: u8) -> u8 { + if a & b != 0 { + 0xFF + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + uint8x16_t( + test_u8(a.0, b.0), + test_u8(a.1, b.1), + test_u8(a.2, b.2), + test_u8(a.3, b.3), + test_u8(a.4, b.4), + test_u8(a.5, b.5), + test_u8(a.6, b.6), + test_u8(a.7, b.7), + test_u8(a.8, b.8), + test_u8(a.9, b.9), + test_u8(a.10, b.10), + test_u8(a.11, b.11), + test_u8(a.12, b.12), + test_u8(a.13, b.13), + test_u8(a.14, b.14), + test_u8(a.15, b.15), + ) +} + +#[inline] +fn test_s8(a: i8, b: i8) -> i8 { + if a & b != 0 { + -1 + } else { + 0x00 + } +} + +#[inline] +pub unsafe fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + int8x16_t( + test_s8(a.0, b.0), + test_s8(a.1, b.1), + test_s8(a.2, b.2), + test_s8(a.3, b.3), + test_s8(a.4, b.4), + test_s8(a.5, b.5), + test_s8(a.6, b.6), + test_s8(a.7, b.7), + test_s8(a.8, b.8), + test_s8(a.9, b.9), + test_s8(a.10, b.10), + test_s8(a.11, b.11), + test_s8(a.12, b.12), + test_s8(a.13, b.13), + test_s8(a.14, b.14), + test_s8(a.15, b.15), + ) +} + +#[inline] +pub unsafe fn vst1q_u32(addr: *mut u8, val: uint32x4_t) { + std::ptr::write(addr as *mut uint32x4_t, val) +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 00000000..f7868249 --- /dev/null +++ 
b/src/neon/mod.rs @@ -0,0 +1,7 @@ +pub mod deser; +pub mod stage1; +pub mod utf8check; +pub mod generator; +mod simd; +mod simd_llvm; +mod intrinsics; \ No newline at end of file diff --git a/src/neon/simd.rs b/src/neon/simd.rs new file mode 100644 index 00000000..8a5a21fc --- /dev/null +++ b/src/neon/simd.rs @@ -0,0 +1,470 @@ +#![allow(non_camel_case_types)] +#![allow(unused)] + +use crate::neon::simd_llvm; + +macro_rules! simd_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + pub(crate) const fn new($($elem_name: $elem_ty),*) -> Self { + $id($($elem_name),*) + } + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: $ety) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + value + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> $ety { + unsafe { + simd_llvm::simd_extract(self, index as u32) + } + } + } + } +} + +macro_rules! simd_m_ty { + ($id:ident [$ety:ident]: $($elem_ty:ident),* | $($elem_name:ident),*) => { + #[repr(simd)] + #[derive(Copy, Clone, Debug, PartialEq)] + pub(crate) struct $id($(pub $elem_ty),*); + + #[allow(clippy::use_self)] + impl $id { + #[inline] + const fn bool_to_internal(x: bool) -> $ety { + [0 as $ety, !(0 as $ety)][x as usize] + } + + #[inline] + pub(crate) const fn new($($elem_name: bool),*) -> Self { + $id($(Self::bool_to_internal($elem_name)),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) const fn splat(value: bool) -> Self { + $id($({ + #[allow(non_camel_case_types, dead_code)] + struct $elem_name; + Self::bool_to_internal(value) + }),*) + } + + // FIXME: Workaround rust@60637 + #[inline(always)] + pub(crate) fn extract(self, index: usize) -> bool { + let r: $ety = unsafe { + simd_llvm::simd_extract(self, index as u32) + }; + r != 0 + } + } + } +} + +// 16-bit wide types: + +simd_ty!(u8x2[u8]: u8, u8 | x0, x1); +simd_ty!(i8x2[i8]: i8, i8 | x0, x1); + +// 32-bit wide types: + +simd_ty!(u8x4[u8]: u8, u8, u8, u8 | x0, x1, x2, x3); +simd_ty!(u16x2[u16]: u16, u16 | x0, x1); + +simd_ty!(i8x4[i8]: i8, i8, i8, i8 | x0, x1, x2, x3); +simd_ty!(i16x2[i16]: i16, i16 | x0, x1); + +// 64-bit wide types: + +simd_ty!(u8x8[u8]: + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u16x4[u16]: u16, u16, u16, u16 | x0, x1, x2, x3); +simd_ty!(u32x2[u32]: u32, u32 | x0, x1); +simd_ty!(u64x1[u64]: u64 | x1); + +simd_ty!(i8x8[i8]: + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i16x4[i16]: i16, i16, i16, i16 | x0, x1, x2, x3); +simd_ty!(i32x2[i32]: i32, i32 | x0, x1); +simd_ty!(i64x1[i64]: i64 | x1); + +simd_ty!(f32x2[f32]: f32, f32 | x0, x1); + +// 128-bit wide types: + +simd_ty!(u8x16[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u16x8[u16]: + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u32x4[u32]: u32, u32, u32, u32 | x0, x1, x2, x3); +simd_ty!(u64x2[u64]: u64, u64 | x0, x1); + +simd_ty!(i8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, 
x3, x4, x5, x6, x7); +simd_ty!(i32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_ty!(i64x2[i64]: i64, i64 | x0, x1); + +simd_ty!(f32x4[f32]: f32, f32, f32, f32 | x0, x1, x2, x3); +simd_ty!(f64x2[f64]: f64, f64 | x0, x1); + +simd_m_ty!(m8x16[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_m_ty!(m16x8[i16]: + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_m_ty!(m32x4[i32]: i32, i32, i32, i32 | x0, x1, x2, x3); +simd_m_ty!(m64x2[i64]: i64, i64 | x0, x1); + +// 256-bit wide types: + +simd_ty!(u8x32[u8]: + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8, + u8, u8, u8, u8, u8, u8, u8, u8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(u16x16[u16]: + u16, u16, u16, u16, u16, u16, u16, u16, + u16, u16, u16, u16, u16, u16, u16, u16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(u32x8[u32]: + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(u64x4[u64]: u64, u64, u64, u64 | x0, x1, x2, x3); + +simd_ty!(i8x32[i8]: + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8, + i8, i8, i8, i8, i8, i8, i8, i8 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15, + x16, x17, x18, x19, x20, x21, x22, x23, + x24, x25, x26, x27, x28, x29, x30, x31 +); +simd_ty!(i16x16[i16]: + i16, i16, i16, i16, i16, i16, i16, i16, + i16, i16, i16, i16, i16, i16, i16, i16 + | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 +); +simd_ty!(i32x8[i32]: + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7); +simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); + +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + +// 512-bit wide types: + +simd_ty!(i32x16[i32]: + i32, i32, i32, i32, i32, i32, i32, i32, + i32, i32, i32, i32, i32, i32, i32, i32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + +simd_ty!(i64x8[i64]: + i64, i64, i64, i64, i64, i64, i64, i64 + | x0, x1, x2, x3, x4, x5, x6, x7); + +#[allow(unused)] +#[macro_export] +macro_rules! 
constify_imm8 { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) & 0b1111_1111 { + 0 => $expand!(0), + 1 => $expand!(1), + 2 => $expand!(2), + 3 => $expand!(3), + 4 => $expand!(4), + 5 => $expand!(5), + 6 => $expand!(6), + 7 => $expand!(7), + 8 => $expand!(8), + 9 => $expand!(9), + 10 => $expand!(10), + 11 => $expand!(11), + 12 => $expand!(12), + 13 => $expand!(13), + 14 => $expand!(14), + 15 => $expand!(15), + 16 => $expand!(16), + 17 => $expand!(17), + 18 => $expand!(18), + 19 => $expand!(19), + 20 => $expand!(20), + 21 => $expand!(21), + 22 => $expand!(22), + 23 => $expand!(23), + 24 => $expand!(24), + 25 => $expand!(25), + 26 => $expand!(26), + 27 => $expand!(27), + 28 => $expand!(28), + 29 => $expand!(29), + 30 => $expand!(30), + 31 => $expand!(31), + 32 => $expand!(32), + 33 => $expand!(33), + 34 => $expand!(34), + 35 => $expand!(35), + 36 => $expand!(36), + 37 => $expand!(37), + 38 => $expand!(38), + 39 => $expand!(39), + 40 => $expand!(40), + 41 => $expand!(41), + 42 => $expand!(42), + 43 => $expand!(43), + 44 => $expand!(44), + 45 => $expand!(45), + 46 => $expand!(46), + 47 => $expand!(47), + 48 => $expand!(48), + 49 => $expand!(49), + 50 => $expand!(50), + 51 => $expand!(51), + 52 => $expand!(52), + 53 => $expand!(53), + 54 => $expand!(54), + 55 => $expand!(55), + 56 => $expand!(56), + 57 => $expand!(57), + 58 => $expand!(58), + 59 => $expand!(59), + 60 => $expand!(60), + 61 => $expand!(61), + 62 => $expand!(62), + 63 => $expand!(63), + 64 => $expand!(64), + 65 => $expand!(65), + 66 => $expand!(66), + 67 => $expand!(67), + 68 => $expand!(68), + 69 => $expand!(69), + 70 => $expand!(70), + 71 => $expand!(71), + 72 => $expand!(72), + 73 => $expand!(73), + 74 => $expand!(74), + 75 => $expand!(75), + 76 => $expand!(76), + 77 => $expand!(77), + 78 => $expand!(78), + 79 => $expand!(79), + 80 => $expand!(80), + 81 => $expand!(81), + 82 => $expand!(82), + 83 => $expand!(83), + 84 => $expand!(84), + 85 => $expand!(85), + 86 => $expand!(86), + 87 => $expand!(87), + 88 => $expand!(88), + 89 => $expand!(89), + 90 => $expand!(90), + 91 => $expand!(91), + 92 => $expand!(92), + 93 => $expand!(93), + 94 => $expand!(94), + 95 => $expand!(95), + 96 => $expand!(96), + 97 => $expand!(97), + 98 => $expand!(98), + 99 => $expand!(99), + 100 => $expand!(100), + 101 => $expand!(101), + 102 => $expand!(102), + 103 => $expand!(103), + 104 => $expand!(104), + 105 => $expand!(105), + 106 => $expand!(106), + 107 => $expand!(107), + 108 => $expand!(108), + 109 => $expand!(109), + 110 => $expand!(110), + 111 => $expand!(111), + 112 => $expand!(112), + 113 => $expand!(113), + 114 => $expand!(114), + 115 => $expand!(115), + 116 => $expand!(116), + 117 => $expand!(117), + 118 => $expand!(118), + 119 => $expand!(119), + 120 => $expand!(120), + 121 => $expand!(121), + 122 => $expand!(122), + 123 => $expand!(123), + 124 => $expand!(124), + 125 => $expand!(125), + 126 => $expand!(126), + 127 => $expand!(127), + 128 => $expand!(128), + 129 => $expand!(129), + 130 => $expand!(130), + 131 => $expand!(131), + 132 => $expand!(132), + 133 => $expand!(133), + 134 => $expand!(134), + 135 => $expand!(135), + 136 => $expand!(136), + 137 => $expand!(137), + 138 => $expand!(138), + 139 => $expand!(139), + 140 => $expand!(140), + 141 => $expand!(141), + 142 => $expand!(142), + 143 => $expand!(143), + 144 => $expand!(144), + 145 => $expand!(145), + 146 => $expand!(146), + 147 => $expand!(147), + 148 => $expand!(148), + 149 => $expand!(149), + 150 => $expand!(150), + 151 => $expand!(151), + 152 => 
$expand!(152), + 153 => $expand!(153), + 154 => $expand!(154), + 155 => $expand!(155), + 156 => $expand!(156), + 157 => $expand!(157), + 158 => $expand!(158), + 159 => $expand!(159), + 160 => $expand!(160), + 161 => $expand!(161), + 162 => $expand!(162), + 163 => $expand!(163), + 164 => $expand!(164), + 165 => $expand!(165), + 166 => $expand!(166), + 167 => $expand!(167), + 168 => $expand!(168), + 169 => $expand!(169), + 170 => $expand!(170), + 171 => $expand!(171), + 172 => $expand!(172), + 173 => $expand!(173), + 174 => $expand!(174), + 175 => $expand!(175), + 176 => $expand!(176), + 177 => $expand!(177), + 178 => $expand!(178), + 179 => $expand!(179), + 180 => $expand!(180), + 181 => $expand!(181), + 182 => $expand!(182), + 183 => $expand!(183), + 184 => $expand!(184), + 185 => $expand!(185), + 186 => $expand!(186), + 187 => $expand!(187), + 188 => $expand!(188), + 189 => $expand!(189), + 190 => $expand!(190), + 191 => $expand!(191), + 192 => $expand!(192), + 193 => $expand!(193), + 194 => $expand!(194), + 195 => $expand!(195), + 196 => $expand!(196), + 197 => $expand!(197), + 198 => $expand!(198), + 199 => $expand!(199), + 200 => $expand!(200), + 201 => $expand!(201), + 202 => $expand!(202), + 203 => $expand!(203), + 204 => $expand!(204), + 205 => $expand!(205), + 206 => $expand!(206), + 207 => $expand!(207), + 208 => $expand!(208), + 209 => $expand!(209), + 210 => $expand!(210), + 211 => $expand!(211), + 212 => $expand!(212), + 213 => $expand!(213), + 214 => $expand!(214), + 215 => $expand!(215), + 216 => $expand!(216), + 217 => $expand!(217), + 218 => $expand!(218), + 219 => $expand!(219), + 220 => $expand!(220), + 221 => $expand!(221), + 222 => $expand!(222), + 223 => $expand!(223), + 224 => $expand!(224), + 225 => $expand!(225), + 226 => $expand!(226), + 227 => $expand!(227), + 228 => $expand!(228), + 229 => $expand!(229), + 230 => $expand!(230), + 231 => $expand!(231), + 232 => $expand!(232), + 233 => $expand!(233), + 234 => $expand!(234), + 235 => $expand!(235), + 236 => $expand!(236), + 237 => $expand!(237), + 238 => $expand!(238), + 239 => $expand!(239), + 240 => $expand!(240), + 241 => $expand!(241), + 242 => $expand!(242), + 243 => $expand!(243), + 244 => $expand!(244), + 245 => $expand!(245), + 246 => $expand!(246), + 247 => $expand!(247), + 248 => $expand!(248), + 249 => $expand!(249), + 250 => $expand!(250), + 251 => $expand!(251), + 252 => $expand!(252), + 253 => $expand!(253), + 254 => $expand!(254), + _ => $expand!(255), + } + }; +} diff --git a/src/neon/simd_llvm.rs b/src/neon/simd_llvm.rs new file mode 100644 index 00000000..6e60b63c --- /dev/null +++ b/src/neon/simd_llvm.rs @@ -0,0 +1,54 @@ +extern "platform-intrinsic" { + pub fn simd_eq<T, U>(x: T, y: T) -> U; +// pub fn simd_ne<T, U>(x: T, y: T) -> U; + pub fn simd_lt<T, U>(x: T, y: T) -> U; + pub fn simd_le<T, U>(x: T, y: T) -> U; + pub fn simd_gt<T, U>(x: T, y: T) -> U; + pub fn simd_ge<T, U>(x: T, y: T) -> U; +// +// pub fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U; +// pub fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U; +// pub fn simd_shuffle8<T, U>(x: T, y: T, idx: [u32; 8]) -> U; +// pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U; +// pub fn simd_shuffle32<T, U>(x: T, y: T, idx: [u32; 32]) -> U; +// pub fn simd_shuffle64<T, U>(x: T, y: T, idx: [u32; 64]) -> U; +// pub fn simd_shuffle128<T, U>(x: T, y: T, idx: [u32; 128]) -> U; +// +// pub fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T; + pub fn simd_extract<T, U>(x: T, idx: u32) -> U; +// +// pub fn simd_cast<T, U>(x: T) -> U; +// + pub fn simd_add<T>(x: T, y: T) -> T; +// pub fn simd_sub<T>(x: T, y: T) -> T; +// pub fn
simd_mul<T>(x: T, y: T) -> T; +// pub fn simd_div<T>(x: T, y: T) -> T; +// pub fn simd_shl<T>(x: T, y: T) -> T; +// pub fn simd_shr<T>(x: T, y: T) -> T; + pub fn simd_and<T>(x: T, y: T) -> T; + pub fn simd_or<T>(x: T, y: T) -> T; + pub fn simd_xor<T>(x: T, y: T) -> T; +// +// pub fn simd_reduce_add_unordered<T, U>(x: T) -> U; +// pub fn simd_reduce_mul_unordered<T, U>(x: T) -> U; +// pub fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U; +// pub fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U; +// pub fn simd_reduce_min<T, U>(x: T) -> U; +// pub fn simd_reduce_max<T, U>(x: T) -> U; +// pub fn simd_reduce_min_nanless<T, U>(x: T) -> U; +// pub fn simd_reduce_max_nanless<T, U>(x: T) -> U; +// pub fn simd_reduce_and<T, U>(x: T) -> U; +// pub fn simd_reduce_or<T, U>(x: T) -> U; +// pub fn simd_reduce_xor<T, U>(x: T) -> U; +// pub fn simd_reduce_all<T>(x: T) -> bool; +// pub fn simd_reduce_any<T>(x: T) -> bool; +// +// pub fn simd_select<M, T>(m: M, a: T, b: T) -> T; +// pub fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T; +// +// pub fn simd_fmin<T>(a: T, b: T) -> T; +// pub fn simd_fmax<T>(a: T, b: T) -> T; +// +// pub fn simd_fsqrt<T>(a: T) -> T; +// pub fn simd_fma<T>(a: T, b: T, c: T) -> T; +} \ No newline at end of file diff --git a/src/neon/stage1.rs b/src/neon/stage1.rs new file mode 100644 index 00000000..45322cee --- /dev/null +++ b/src/neon/stage1.rs @@ -0,0 +1,598 @@ +#![allow(dead_code)] + +use crate::neon::intrinsics::*; +use crate::neon::utf8check::*; +use crate::*; + +use std::mem; + +// NEON-SPECIFIC + +macro_rules! bit_mask { + () => { + uint8x16_t::new( + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub(crate) unsafe fn neon_movemask(input: uint8x16_t) -> u16 { + let minput: uint8x16_t = vandq_u8(input, bit_mask!()); + let tmp: uint8x16_t = vpaddq_u8(minput, minput); + let tmp = vpaddq_u8(tmp, tmp); + let tmp = vpaddq_u8(tmp, tmp); + + vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +pub unsafe fn neon_movemask_bulk(p0: uint8x16_t, p1: uint8x16_t, p2: uint8x16_t, p3: uint8x16_t) -> u64 { + let bit_mask = bit_mask!(); + + let t0 = vandq_u8(p0, bit_mask); + let t1 = vandq_u8(p1, bit_mask); + let t2 = vandq_u8(p2, bit_mask); + let t3 = vandq_u8(p3, bit_mask); + let sum0 = vpaddq_u8(t0, t1); + let sum1 = vpaddq_u8(t2, t3); + let sum0 = vpaddq_u8(sum0, sum1); + let sum0 = vpaddq_u8(sum0, sum0); + + vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0) +} + +// /NEON-SPECIFIC + +pub const SIMDJSON_PADDING: usize = mem::size_of::<uint8x16_t>() * 4; + +unsafe fn compute_quote_mask(quote_bits: u64) -> u64 { + vgetq_lane_u64( + vreinterpretq_u64_u8( + mem::transmute( + vmull_p64( + -1, + quote_bits as i64) + ) + ), + 0 + ) +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_ascii(si: &SimdInput) -> bool { + let highbit: uint8x16_t = vdupq_n_u8(0x80); + let t0: uint8x16_t = vorrq_u8(si.v0, si.v1); + let t1: uint8x16_t = vorrq_u8(si.v2, si.v3); + let t3: uint8x16_t = vorrq_u8(t0, t1); + let t4: uint8x16_t = vandq_u8(t3, highbit); + + let v64: uint64x2_t = vreinterpretq_u64_u8(t4); + let v32: uint32x2_t = vqmovn_u64(v64); + let result: uint64x1_t = vreinterpret_u64_u32(v32); + + vget_lane_u64(result, 0) == 0 +} + +#[derive(Debug)] +struct SimdInput { + v0: uint8x16_t, + v1: uint8x16_t, + v2: uint8x16_t, + v3: uint8x16_t, +} + +fn fill_input(ptr: &[u8]) -> SimdInput { + unsafe { + #[allow(clippy::cast_ptr_alignment)] + SimdInput { + v0: vld1q_u8(ptr.as_ptr() as *const u8), + v1: vld1q_u8(ptr.as_ptr().add(16) as
*const u8), + v2: vld1q_u8(ptr.as_ptr().add(32) as *const u8), + v3: vld1q_u8(ptr.as_ptr().add(48) as *const u8), + } + } +} + +struct Utf8CheckingState { + has_error: int8x16_t, + previous: ProcessedUtfBytes, +} + +impl Default for Utf8CheckingState { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + Utf8CheckingState { + has_error: vdupq_n_s8(0), + previous: ProcessedUtfBytes::default(), + } + } +} + +#[inline] +fn is_utf8_status_ok(has_error: int8x16_t) -> bool { + unsafe { + let has_error_128 : i128 = mem::transmute(has_error); + + has_error_128 == 0 + } +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn check_utf8( + input: &SimdInput, + state: &mut Utf8CheckingState, +) { + if check_ascii(input) { + // All bytes are ascii. Therefore the byte that was just before must be + // ascii too. We only check the byte that was just before simd_input. Nines + // are arbitrary values. + let verror: int8x16_t = int8x16_t::new( + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, + ); + state.has_error = vreinterpretq_s8_u8(vorrq_u8( + vcgtq_s8( + state.previous.carried_continuations, + verror, + ), + vreinterpretq_u8_s8(state.has_error)), + ); + } else { + // it is not ascii so we have to do heavy work + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v0), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v1), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v2), &mut state.previous, &mut state.has_error); + state.previous = check_utf8_bytes(vreinterpretq_s8_u8(input.v3), &mut state.previous, &mut state.has_error); + } +} + +// a straightforward comparison of a mask against input +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn cmp_mask_against_input(input: &SimdInput, m: u8) -> u64 { + unsafe { + let mask: uint8x16_t = vmovq_n_u8(m); + let cmp_res_0: uint8x16_t = vceqq_u8(input.v0, mask); + let cmp_res_1: uint8x16_t = vceqq_u8(input.v1, mask); + let cmp_res_2: uint8x16_t = vceqq_u8(input.v2, mask); + let cmp_res_3: uint8x16_t = vceqq_u8(input.v3, mask); + + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// find all values less than or equal than the content of maxval (using unsigned arithmetic) +#[cfg_attr(not(feature = "no-inline"), inline(always))] +fn unsigned_lteq_against_input(input: &SimdInput, maxval: uint8x16_t) -> u64 { + unsafe { + let cmp_res_0: uint8x16_t = vcleq_u8(input.v0, maxval); + let cmp_res_1: uint8x16_t = vcleq_u8(input.v1, maxval); + let cmp_res_2: uint8x16_t = vcleq_u8(input.v2, maxval); + let cmp_res_3: uint8x16_t = vcleq_u8(input.v3, maxval); + neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3) + } +} + +// return a bitvector indicating where we have characters that end an odd-length +// sequence of backslashes (and thus change the behavior of the next character +// to follow). A even-length sequence of backslashes, and, for that matter, the +// largest even-length prefix of our odd-length sequence of backslashes, simply +// modify the behavior of the backslashes themselves. +// We also update the prev_iter_ends_odd_backslash reference parameter to +// indicate whether we end an iteration on an odd-length sequence of +// backslashes, which modifies our subsequent search for odd-length +// sequences of backslashes in an obvious way. 
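+// A small worked example: for the input `_\"` (a backslash at bit 1, a quote
+// at bit 2) bs_bits is 0b010; the backslash run starting at bit 1 has odd
+// length, so odd_ends comes back as 0b100 and the quote at bit 2 is treated
+// as escaped. For `_\\"` (bs_bits = 0b0110) the run has even length,
+// odd_ends is 0, and the quote keeps its structural meaning.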
+#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_odd_backslash_sequences(input: &SimdInput, prev_iter_ends_odd_backslash: &mut u64) -> u64 { + const EVEN_BITS: u64 = 0x5555_5555_5555_5555; + const ODD_BITS: u64 = !EVEN_BITS; + + let bs_bits: u64 = cmp_mask_against_input(&input, b'\\'); + let start_edges: u64 = bs_bits & !(bs_bits << 1); + // flip lowest if we have an odd-length run at the end of the prior + // iteration + let even_start_mask: u64 = EVEN_BITS ^ *prev_iter_ends_odd_backslash; + let even_starts: u64 = start_edges & even_start_mask; + let odd_starts: u64 = start_edges & !even_start_mask; + let even_carries: u64 = bs_bits.wrapping_add(even_starts); + + // must record the carry-out of our odd-carries out of bit 63; this + // indicates whether the sense of any edge going to the next iteration + // should be flipped + let (mut odd_carries, iter_ends_odd_backslash) = bs_bits.overflowing_add(odd_starts); + + odd_carries |= *prev_iter_ends_odd_backslash; + // push in bit zero as a potential end + // if we had an odd-numbered run at the + // end of the previous iteration + *prev_iter_ends_odd_backslash = if iter_ends_odd_backslash { 0x1 } else { 0x0 }; + let even_carry_ends: u64 = even_carries & !bs_bits; + let odd_carry_ends: u64 = odd_carries & !bs_bits; + let even_start_odd_end: u64 = even_carry_ends & ODD_BITS; + let odd_start_even_end: u64 = odd_carry_ends & EVEN_BITS; + let odd_ends: u64 = even_start_odd_end | odd_start_even_end; + odd_ends +} + +// return both the quote mask (which is a half-open mask that covers the first +// quote in an unescaped quote pair and everything in the quote pair) and the +// quote bits, which are the simple unescaped quoted bits. +// +// We also update the prev_iter_inside_quote value to tell the next iteration +// whether we finished the final iteration inside a quote pair; if so, this +// inverts our behavior of whether we're inside quotes for the next iteration. +// +// Note that we don't do any error checking to see if we have backslash +// sequences outside quotes; these +// backslash sequences (of any length) will be detected elsewhere. +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_quote_mask_and_bits( + input: &SimdInput, + odd_ends: u64, + prev_iter_inside_quote: &mut u64, + quote_bits: &mut u64, + error_mask: &mut u64, +) -> u64 { + *quote_bits = cmp_mask_against_input(&input, b'"'); + *quote_bits &= !odd_ends; + // remove from the valid quoted region the unescapted characters. + let mut quote_mask: u64 = compute_quote_mask(*quote_bits); + + quote_mask ^= *prev_iter_inside_quote; + // All Unicode characters may be placed within the + // quotation marks, except for the characters that MUST be escaped: + // quotation mark, reverse solidus, and the control characters (U+0000 + //through U+001F). + // https://tools.ietf.org/html/rfc8259 + let unescaped: u64 = unsigned_lteq_against_input(input, vmovq_n_u8(0x1F)); + *error_mask |= quote_mask & unescaped; + // right shift of a signed value expected to be well-defined and standard + // compliant as of C++20, + // John Regher from Utah U. 
says this is fine code + *prev_iter_inside_quote = static_cast_u64!(static_cast_i64!(quote_mask) >> 63); + quote_mask +} + +#[cfg_attr(not(feature = "no-inline"), inline(always))] +unsafe fn find_whitespace_and_structurals( + input: &SimdInput, + whitespace: &mut u64, + structurals: &mut u64, +) { + // do a 'shufti' to detect structural JSON characters + // they are + // * `{` 0x7b + // * `}` 0x7d + // * `:` 0x3a + // * `[` 0x5b + // * `]` 0x5d + // * `,` 0x2c + // these go into the first 3 buckets of the comparison (1/2/4) + + // we are also interested in the four whitespace characters: + // * space 0x20 + // * linefeed 0x0a + // * horizontal tab 0x09 + // * carriage return 0x0d + // these go into the next 2 buckets of the comparison (8/16) + + // TODO: const? + let low_nibble_mask: uint8x16_t = uint8x16_t::new( + 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0, + ); + // TODO: const? + let high_nibble_mask: uint8x16_t = uint8x16_t::new( + 8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0, + ); + + let structural_shufti_mask: uint8x16_t = vmovq_n_u8(0x7); + let whitespace_shufti_mask: uint8x16_t = vmovq_n_u8(0x18); + let low_nib_and_mask: uint8x16_t = vmovq_n_u8(0xf); + + let nib_0_lo: uint8x16_t = vandq_u8(input.v0, low_nib_and_mask); + let nib_0_hi: uint8x16_t = vshrq_n_u8(input.v0, 4); + let shuf_0_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_0_lo); + let shuf_0_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_0_hi); + let v_0: uint8x16_t = vandq_u8(shuf_0_lo, shuf_0_hi); + + let nib_1_lo: uint8x16_t = vandq_u8(input.v1, low_nib_and_mask); + let nib_1_hi: uint8x16_t = vshrq_n_u8(input.v1, 4); + let shuf_1_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_1_lo); + let shuf_1_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_1_hi); + let v_1: uint8x16_t = vandq_u8(shuf_1_lo, shuf_1_hi); + + let nib_2_lo: uint8x16_t = vandq_u8(input.v2, low_nib_and_mask); + let nib_2_hi: uint8x16_t = vshrq_n_u8(input.v2, 4); + let shuf_2_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_2_lo); + let shuf_2_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_2_hi); + let v_2: uint8x16_t = vandq_u8(shuf_2_lo, shuf_2_hi); + + let nib_3_lo: uint8x16_t = vandq_u8(input.v3, low_nib_and_mask); + let nib_3_hi: uint8x16_t = vshrq_n_u8(input.v3, 4); + let shuf_3_lo: uint8x16_t = vqtbl1q_u8(low_nibble_mask, nib_3_lo); + let shuf_3_hi: uint8x16_t = vqtbl1q_u8(high_nibble_mask, nib_3_hi); + let v_3: uint8x16_t = vandq_u8(shuf_3_lo, shuf_3_hi); + + let tmp_0: uint8x16_t = vtstq_u8(v_0, structural_shufti_mask); + let tmp_1: uint8x16_t = vtstq_u8(v_1, structural_shufti_mask); + let tmp_2: uint8x16_t = vtstq_u8(v_2, structural_shufti_mask); + let tmp_3: uint8x16_t = vtstq_u8(v_3, structural_shufti_mask); + *structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3); + + let tmp_ws_v0: uint8x16_t = vtstq_u8(v_0, whitespace_shufti_mask); + let tmp_ws_v1: uint8x16_t = vtstq_u8(v_1, whitespace_shufti_mask); + let tmp_ws_v2: uint8x16_t = vtstq_u8(v_2, whitespace_shufti_mask); + let tmp_ws_v3: uint8x16_t = vtstq_u8(v_3, whitespace_shufti_mask); + *whitespace = neon_movemask_bulk(tmp_ws_v0, tmp_ws_v1, tmp_ws_v2, tmp_ws_v3); +} + +// flatten out values in 'bits' assuming that they are are to have values of idx +// plus their position in the bitvector, and store these indexes at +// base_ptr[base] incrementing base as we go +// will potentially store extra values beyond end of valid bits, so base_ptr +// needs to be large enough to handle this +//TODO: usize was u32 here does this matter? 
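+// Worked example (added for illustration): with idx = 64 and bits = 0b1001,
+// cnt is 2, so the length grows by two; the single SIMD store below writes
+// the lanes 0, 3, 64, 64, and only the first two land inside the new length -
+// the trailing lanes fall into the extra space reserved right below.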
+// flatten out values in 'bits' assuming that they are to have values of idx
+// plus their position in the bitvector, and store these indexes at
+// base_ptr[base] incrementing base as we go
+// will potentially store extra values beyond end of valid bits, so base_ptr
+// needs to be large enough to handle this
+// TODO: usize was u32 here - does this matter?
+#[cfg_attr(not(feature = "no-inline"), inline(always))]
+fn flatten_bits(base: &mut Vec<u32>, idx: u32, mut bits: u64) {
+    let cnt: usize = bits.count_ones() as usize;
+    let mut l = base.len();
+    let idx_minus_64 = idx.wrapping_sub(64);
+    let idx_64_v = unsafe {
+        int32x4_t::new(
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+            static_cast_i32!(idx_minus_64),
+        )
+    };
+
+    // We're doing some trickery here.
+    // We reserve 64 extra entries, since we have at most 64 bits to set,
+    // then we set the length to the final value (calculated above) up front.
+    // We later write indiscriminately past the old length, but that's OK
+    // since we made sure to reserve the needed space.
+    base.reserve(64);
+    unsafe {
+        base.set_len(l + cnt);
+    }
+
+    while bits != 0 {
+        unsafe {
+            let v0 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v1 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v2 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+            let v3 = bits.trailing_zeros() as i32;
+            bits &= bits.wrapping_sub(1);
+
+            let v: int32x4_t = int32x4_t::new(v0, v1, v2, v3);
+            let v: int32x4_t = vaddq_s32(idx_64_v, v);
+            #[allow(clippy::cast_ptr_alignment)]
+            std::ptr::write(base.as_mut_ptr().add(l) as *mut int32x4_t, v);
+        }
+        l += 4;
+    }
+}
+
+// return an updated structural bit vector with quoted contents cleared out and
+// pseudo-structural characters added to the mask
+// updates prev_iter_ends_pseudo_pred which tells us whether the previous
+// iteration ended on a whitespace or a structural character (which means that
+// the next iteration will have a pseudo-structural character at its start)
+#[cfg_attr(not(feature = "no-inline"), inline(always))]
+fn finalize_structurals(
+    mut structurals: u64,
+    whitespace: u64,
+    quote_mask: u64,
+    quote_bits: u64,
+    prev_iter_ends_pseudo_pred: &mut u64,
+) -> u64 {
+    // mask off anything inside quotes
+    structurals &= !quote_mask;
+    // add the real quote bits back into our bitmask as well, so we can
+    // quickly traverse the strings we've spent all this trouble gathering
+    structurals |= quote_bits;
+    // Now, establish "pseudo-structural characters". These are non-whitespace
+    // characters that are (a) outside quotes and (b) have a predecessor that's
+    // either whitespace or a structural character. This means that subsequent
+    // passes will get a chance to encounter the first character of every string
+    // of non-whitespace and, if we're parsing an atom like true/false/null or a
+    // number we can stop at the first whitespace or structural character
+    // following it.
+
+    // a qualified predecessor is something that can happen one position before
+    // a pseudo-structural character
+    let pseudo_pred: u64 = structurals | whitespace;
+
+    let shifted_pseudo_pred: u64 = (pseudo_pred << 1) | *prev_iter_ends_pseudo_pred;
+    *prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
+    let pseudo_structurals: u64 = shifted_pseudo_pred & (!whitespace) & (!quote_mask);
+    structurals |= pseudo_structurals;
+
+    // now, we've used our close quotes all we need to, so let's switch them
+    // off: they will be off in the quote mask and on in the quote bits.
+    structurals &= !(quote_bits & !quote_mask);
+    structurals
+}
+
+pub fn find_bs_bits_and_quote_bits(v0: uint8x16_t, v1: uint8x16_t) -> ParseStringHelper {
+    let quote_mask = vmovq_n_u8(b'"');
+    let bs_mask = vmovq_n_u8(b'\\');
+    let bit_mask = bit_mask!();
+
+    let cmp_bs_0: uint8x16_t = vceqq_u8(v0, bs_mask);
+    let cmp_bs_1: uint8x16_t = vceqq_u8(v1, bs_mask);
+    let cmp_qt_0: uint8x16_t = vceqq_u8(v0, quote_mask);
+    let cmp_qt_1: uint8x16_t = vceqq_u8(v1, quote_mask);
+
+    let cmp_bs_0 = vandq_u8(cmp_bs_0, bit_mask);
+    let cmp_bs_1 = vandq_u8(cmp_bs_1, bit_mask);
+    let cmp_qt_0 = vandq_u8(cmp_qt_0, bit_mask);
+    let cmp_qt_1 = vandq_u8(cmp_qt_1, bit_mask);
+
+    let sum0: uint8x16_t = vpaddq_u8(cmp_bs_0, cmp_bs_1);
+    let sum1: uint8x16_t = vpaddq_u8(cmp_qt_0, cmp_qt_1);
+    let sum0 = vpaddq_u8(sum0, sum1);
+    let sum0 = vpaddq_u8(sum0, sum0);
+
+    ParseStringHelper {
+        bs_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 0) },
+        quote_bits: unsafe { vgetq_lane_u32(vreinterpretq_u32_u8(sum0), 1) },
+    }
+}
+
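NEON has no direct equivalent of SSE's movemask, so find_bs_bits_and_quote_bits masks each byte-wise comparison down to one bit per byte (bit_mask!) and folds neighbouring bytes together with three rounds of vpaddq_u8 until the 32 backslash bits and 32 quote bits land in the first two u32 lanes. Stated as a scalar contract (hypothetical helper, not part of the patch):

    // What the SIMD code computes: bit i of bs_bits (resp. quote_bits) is
    // set iff byte i of the 32-byte block is a backslash (resp. a quote).
    fn bs_and_quote_bits_scalar(block: &[u8; 32]) -> (u32, u32) {
        let mut bs_bits = 0u32;
        let mut quote_bits = 0u32;
        for (i, &b) in block.iter().enumerate() {
            if b == b'\\' {
                bs_bits |= 1 << i;
            }
            if b == b'"' {
                quote_bits |= 1 << i;
            }
        }
        (bs_bits, quote_bits)
    }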
+impl<'de> Deserializer<'de> {
+    //#[inline(never)]
+    pub unsafe fn find_structural_bits(input: &[u8]) -> std::result::Result<Vec<u32>, ErrorType> {
+        let len = input.len();
+        // 6 is a heuristic: it turns out that a ratio of roughly one structural
+        // character per 6 input bytes almost never leads to reallocations.
+        let mut structural_indexes = Vec::with_capacity(len / 6);
+        structural_indexes.push(0); // push extra root element
+
+        let mut utf8_state: Utf8CheckingState = Utf8CheckingState::default();
+
+        // we have padded the input out to a 64 byte multiple with the remainder
+        // being zeros
+
+        // persistent state across loop
+        // does the last iteration end with an odd-length sequence of backslashes?
+        // either 0 or 1, but a 64-bit value
+        let mut prev_iter_ends_odd_backslash: u64 = 0;
+        // does the previous iteration end inside a double-quote pair?
+        // either all zeros or all ones
+        let mut prev_iter_inside_quote: u64 = 0;
+        // does the previous iteration end on something that is a predecessor of a
+        // pseudo-structural character - i.e. whitespace or a structural character;
+        // effectively the very first char is considered to follow "whitespace" for
+        // the purposes of pseudo-structural character detection, so we initialize
+        // to 1
+        let mut prev_iter_ends_pseudo_pred: u64 = 1;
+
+        // structurals are persistent state across the loop, as we flatten them on
+        // the subsequent iteration into our array pointed to by base_ptr.
+        // This is harmless on the first iteration as structurals == 0
+        // and is done for performance reasons; we can hide some of the latency of
+        // the expensive carryless multiply in the previous step with this work
+        let mut structurals: u64 = 0;
+
+        let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
+        let mut idx: usize = 0;
+        // for unescaped characters within strings (ASCII code points < 0x20)
+        let mut error_mask: u64 = 0;
+
+        while idx < lenminus64 {
+            /*
+            #ifndef _MSC_VER
+              __builtin_prefetch(buf + idx + 128);
+            #endif
+            */
+            let input: SimdInput = fill_input(input.get_unchecked(idx..));
+            check_utf8(&input, &mut utf8_state);
+            // detect odd sequences of backslashes
+            let odd_ends: u64 =
+                find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash);
+
+            // detect insides of quote pairs ("quote_mask") and also our quote_bits
+            // themselves
+            let mut quote_bits: u64 = 0;
+            let quote_mask: u64 = find_quote_mask_and_bits(
+                &input,
+                odd_ends,
+                &mut prev_iter_inside_quote,
+                &mut quote_bits,
+                &mut error_mask,
+            );
+
+            // take the previous iteration's structural bits, not our current
+            // iteration's, and flatten
+            flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+            let mut whitespace: u64 = 0;
+            find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals);
+
+            // fixup structurals to reflect quotes and add pseudo-structural characters
+            structurals = finalize_structurals(
+                structurals,
+                whitespace,
+                quote_mask,
+                quote_bits,
+                &mut prev_iter_ends_pseudo_pred,
+            );
+            idx += 64;
+        }
+
+        // we use a giant copy-paste which is ugly,
+        // but otherwise the string needs to be properly padded or else we
+        // risk invalidating the UTF-8 checks.
+        if idx < len {
+            let mut tmpbuf: [u8; 64] = [0x20; 64];
+            tmpbuf
+                .as_mut_ptr()
+                .copy_from(input.as_ptr().add(idx), len - idx);
+            let input: SimdInput = fill_input(&tmpbuf);
+
+            check_utf8(&input, &mut utf8_state);
+
+            // detect odd sequences of backslashes
+            let odd_ends: u64 =
+                find_odd_backslash_sequences(&input, &mut prev_iter_ends_odd_backslash);
+
+            // detect insides of quote pairs ("quote_mask") and also our quote_bits
+            // themselves
+            let mut quote_bits: u64 = 0;
+            let quote_mask: u64 = find_quote_mask_and_bits(
+                &input,
+                odd_ends,
+                &mut prev_iter_inside_quote,
+                &mut quote_bits,
+                &mut error_mask,
+            );
+
+            // take the previous iteration's structural bits, not our current
+            // iteration's, and flatten
+            flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+            let mut whitespace: u64 = 0;
+            find_whitespace_and_structurals(&input, &mut whitespace, &mut structurals);
+
+            // fixup structurals to reflect quotes and add pseudo-structural characters
+            structurals = finalize_structurals(
+                structurals,
+                whitespace,
+                quote_mask,
+                quote_bits,
+                &mut prev_iter_ends_pseudo_pred,
+            );
+            idx += 64;
+        }
+        // This check isn't in upstream: an input that ends inside an
+        // unterminated string is not flagged by the error mask, so we catch
+        // it here.
+        if prev_iter_inside_quote != 0 {
+            return Err(ErrorType::Syntax);
+        }
+        // finally, flatten out the remaining structurals from the last iteration
+        flatten_bits(&mut structural_indexes, idx as u32, structurals);
+
+        // a valid JSON file cannot have zero structural indexes - we should have
+        // found something (note that we compare to 1 as we always add the root!)
+ if structural_indexes.len() == 1 { + return Err(ErrorType::EOF); + } + + if structural_indexes.last() > Some(&(len as u32)) { + return Err(ErrorType::InternalError); + } + + if error_mask != 0 { + return Err(ErrorType::Syntax); + } + + if is_utf8_status_ok(utf8_state.has_error) { + Ok(structural_indexes) + } else { + Err(ErrorType::InvalidUTF8) + } + } +} diff --git a/src/neon/utf8check.rs b/src/neon/utf8check.rs new file mode 100644 index 00000000..082183b1 --- /dev/null +++ b/src/neon/utf8check.rs @@ -0,0 +1,253 @@ +use crate::neon::intrinsics::*; + +/* + * legal utf-8 byte sequence + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Code Points 1st 2s 3s 4s + * U+0000..U+007F 00..7F + * U+0080..U+07FF C2..DF 80..BF + * U+0800..U+0FFF E0 A0..BF 80..BF + * U+1000..U+CFFF E1..EC 80..BF 80..BF + * U+D000..U+D7FF ED 80..9F 80..BF + * U+E000..U+FFFF EE..EF 80..BF 80..BF + * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + * + */ + +/*****************************/ +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_byte_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 1) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn push_last_2bytes_of_a_to_b(a: int8x16_t, b: int8x16_t) -> int8x16_t { + unsafe { + vextq_s8(a, b, 16 - 2) + } +} + +// all byte values must be no larger than 0xF4 +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_smaller_than_0xf4(current_bytes: int8x16_t, has_error: &mut int8x16_t) { + // unsigned, saturates to 0 below max + *has_error = unsafe { + vorrq_s8( + *has_error, + vqsubq_s8(current_bytes, vdupq_n_s8(-12 /* 0xF4 */)) + ) + }; +} + +macro_rules! nibbles_tbl { + () => { + int8x16_t::new( + 1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII) + 0, 0, 0, 0, // 10xx (continuation) + 2, 2, // 110x + 3, // 1110 + 4, // 1111, next should be 0 (not checked here) + ) + }; +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn continuation_lengths(high_nibbles: int8x16_t) -> int8x16_t { + unsafe { + vqtbl1q_s8( + nibbles_tbl!(), + vreinterpretq_u8_s8(high_nibbles), + ) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn carry_continuations(initial_lengths: int8x16_t, previous_carries: int8x16_t) -> int8x16_t { + unsafe { + let right1: int8x16_t = vqsubq_s8( + push_last_byte_of_a_to_b(previous_carries, initial_lengths), + vdupq_n_s8(1), + ); + let sum: int8x16_t = vaddq_s8(initial_lengths, right1); + let right2: int8x16_t = vqsubq_s8( + push_last_2bytes_of_a_to_b(previous_carries, sum), + vdupq_n_s8(2), + ); + vaddq_s8(sum, right2) + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_continuations(initial_lengths: int8x16_t, carries: int8x16_t, has_error: &mut int8x16_t) { + // overlap || underlap + // carry > length && length > 0 || !(carry > length) && !(length > 0) + // (carries > length) == (lengths > 0) + { + let overunder: uint8x16_t = vceqq_u8( + vcgtq_s8(carries, initial_lengths), + vcgtq_s8(initial_lengths, vdupq_n_s8(0)), + ); + + *has_error = vorrq_s8(*has_error, vreinterpretq_s8_u8(overunder)); + } +} + +// when 0xED is found, next byte must be no larger than 0x9F +// when 0xF4 is found, next byte must be no larger than 0x8F +// next byte must be continuation, ie sign bit is set, so signed < is ok +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_first_continuation_max( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + has_error: &mut int8x16_t, +) { 
+ { + let mask_ed: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-19 /* 0xED */), + ); + let mask_f4: uint8x16_t = vceqq_s8( + off1_current_bytes, + vdupq_n_s8(-12 /* 0xF4 */), + ); + + let badfollow_ed: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-97 /* 0x9F */)), + mask_ed, + ); + let badfollow_f4: uint8x16_t = vandq_u8( + vcgtq_s8(current_bytes, vdupq_n_s8(-113 /* 0x8F */)), + mask_f4, + ); + + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vorrq_u8(badfollow_ed, badfollow_f4)), + ); + } +} + +macro_rules! initial_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + -62 /* 0xC2 */, -128, // 110x + -31 /* 0xE1 */, // 1110 + -15 /*0xF1 */, // 1111 + ) + }; +} + +macro_rules! second_mins_tbl { + () => { + int8x16_t::new( + -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, // 10xx => false + 127, 127, // 110x => true + -96 /* 0xA0 */, // 1110 + -112 /* 0x90 */, // 1111 + ) + }; +} + +// map off1_hibits => error condition +// hibits off1 cur +// C => < C2 && true +// E => < E1 && < A0 +// F => < F1 && < 90 +// else false && false +#[cfg_attr(not(feature = "no-inline"), inline)] +fn check_overlong( + current_bytes: int8x16_t, + off1_current_bytes: int8x16_t, + hibits: int8x16_t, + previous_hibits: int8x16_t, + has_error: &mut int8x16_t, +) { + unsafe { + let off1_hibits: int8x16_t = push_last_byte_of_a_to_b(previous_hibits, hibits); + let initial_mins: int8x16_t = vqtbl1q_s8( + initial_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + + let initial_under: uint8x16_t = vcgtq_s8(initial_mins, off1_current_bytes); + + let second_mins: int8x16_t = vqtbl1q_s8( + second_mins_tbl!(), + vreinterpretq_u8_s8(off1_hibits) + ); + let second_under: uint8x16_t = vcgtq_s8(second_mins, current_bytes); + *has_error = vorrq_s8( + *has_error, + vreinterpretq_s8_u8(vandq_u8(initial_under, second_under)) + ); + } +} + +pub struct ProcessedUtfBytes { + rawbytes: int8x16_t, + high_nibbles: int8x16_t, + pub carried_continuations: int8x16_t, +} + +impl Default for ProcessedUtfBytes { + #[cfg_attr(not(feature = "no-inline"), inline)] + fn default() -> Self { + ProcessedUtfBytes { + rawbytes: vdupq_n_s8(0x00), + high_nibbles: vdupq_n_s8(0x00), + carried_continuations: vdupq_n_s8(0x00), + } + } +} + +#[cfg_attr(not(feature = "no-inline"), inline)] +fn count_nibbles(bytes: int8x16_t, answer: &mut ProcessedUtfBytes) { + answer.rawbytes = bytes; + answer.high_nibbles = unsafe { + vandq_s8( + vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(bytes), 4)), + vmovq_n_s8(0x0F) + ) + }; +} + +// check whether the current bytes are valid UTF-8 +// at the end of the function, previous gets updated +#[cfg_attr(not(feature = "no-inline"), inline)] +pub fn check_utf8_bytes( + current_bytes: int8x16_t, + previous: &mut ProcessedUtfBytes, + has_error: &mut int8x16_t, +) -> ProcessedUtfBytes { + let mut pb = ProcessedUtfBytes::default(); + count_nibbles(current_bytes, &mut pb); + + check_smaller_than_0xf4(current_bytes, has_error); + + let initial_lengths: int8x16_t = continuation_lengths(pb.high_nibbles); + + pb.carried_continuations = + carry_continuations(initial_lengths, previous.carried_continuations); + + check_continuations(initial_lengths, pb.carried_continuations, has_error); + + let off1_current_bytes: int8x16_t = push_last_byte_of_a_to_b(previous.rawbytes, pb.rawbytes); + check_first_continuation_max(current_bytes, off1_current_bytes, has_error); + + check_overlong( + 
+        current_bytes,
+        off1_current_bytes,
+        pb.high_nibbles,
+        previous.high_nibbles,
+        has_error,
+    );
+    pb
+}
diff --git a/src/numberparse.rs b/src/numberparse.rs
index 03880d56..01af460e 100644
--- a/src/numberparse.rs
+++ b/src/numberparse.rs
@@ -1,6 +1,7 @@
 use crate::charutils::*;
 use crate::unlikely;
 use crate::*;
+
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
@@ -133,6 +134,7 @@ pub enum Number {
 }
 
 #[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
     unsafe {
         // this actually computes *16* values so we are being wasteful.
@@ -143,7 +145,7 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
         let mul_1_10000: __m128i = _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
         // We know what we're doing right? :P
         #[allow(clippy::cast_ptr_alignment)]
-        let input: __m128i = _mm_sub_epi8(
+        let input: __m128i = _mm_sub_epi8(
            _mm_loadu_si128(chars.get_unchecked(0..16).as_ptr() as *const __m128i),
             ascii0,
         );
@@ -155,6 +157,17 @@ fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
     }
 }
 
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_feature = "neon")]
+fn parse_eight_digits_unrolled(chars: &[u8]) -> u32 {
+    // the equivalent of memcpy(&val, chars, sizeof(u64)): an explicit
+    // unaligned read, since `chars` carries no alignment guarantee
+    let val: u64 = unsafe { std::ptr::read_unaligned(chars.as_ptr() as *const u64) };
+    let val = (val & 0x0F0F_0F0F_0F0F_0F0F).wrapping_mul(2561) >> 8;
+    let val = (val & 0x00FF_00FF_00FF_00FF).wrapping_mul(6_553_601) >> 16;
+    ((val & 0x0000_FFFF_0000_FFFF).wrapping_mul(42_949_672_960_001) >> 32) as u32
+}
+
 impl<'de> Deserializer<'de> {
     /// called by parse_number when we know that the output is a float,
     /// but where there might be some integer overflow. The trick here is to
@@ -215,7 +228,7 @@ impl<'de> Deserializer<'de> {
             digit = unsafe { *p.get_unchecked(digitcount) } - b'0';
             digitcount += 1;
             fraction_weight *= 10.0;
-            fraction += f64::from(digit) / fraction_weight;;
+            fraction += f64::from(digit) / fraction_weight;
         }
         i += fraction;
     }
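The NEON variant of parse_eight_digits_unrolled is a SWAR (SIMD-within-a-register) trick: after masking the ASCII digits down to 0..9, each multiply-and-shift merges adjacent digit groups — pairs, then 4-digit groups, then the two halves. A standalone version with a worked check (hypothetical helper name, not part of the patch; u64::from_le_bytes makes the unaligned little-endian load explicit):

    fn parse_eight_digits_swar(chars: &[u8]) -> u32 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&chars[..8]);
        let val = u64::from_le_bytes(raw);
        // 2561 = 256*10 + 1: merges digit pairs into 2-digit numbers
        let val = (val & 0x0F0F_0F0F_0F0F_0F0F).wrapping_mul(2561) >> 8;
        // 6553601 = 65536*100 + 1: merges 2-digit pairs into 4-digit numbers
        let val = (val & 0x00FF_00FF_00FF_00FF).wrapping_mul(6_553_601) >> 16;
        // 42949672960001 = 2^32*10000 + 1: merges the two 4-digit halves
        ((val & 0x0000_FFFF_0000_FFFF).wrapping_mul(42_949_672_960_001) >> 32) as u32
    }

    // parse_eight_digits_swar(b"12345678") == 12_345_678

Note the trick assumes a little-endian byte order, which holds on aarch64 (and x86) targets this crate supports.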
diff --git a/src/portability.rs b/src/portability.rs
new file mode 100644
index 00000000..69481a85
--- /dev/null
+++ b/src/portability.rs
@@ -0,0 +1,32 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool {
+    unsafe { _addcarry_u64(0, value1, value2, result) != 0 }
+}
+
+//TODO: static?
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn hamming(input_num: u64) -> u32 {
+    unsafe { _popcnt64(input_num as i64) as u32 }
+}
+
+// 32-bit x86 variant: count each half separately; the cfg must be
+// target_arch = "x86", or this would collide with the definition above
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86")]
+pub fn hamming(input_num: u64) -> u32 {
+    unsafe { (_popcnt32(input_num as i32) + _popcnt32((input_num >> 32) as i32)) as u32 }
+}
+
+#[cfg_attr(not(feature = "no-inline"), inline)]
+#[cfg(target_arch = "x86_64")]
+pub fn trailingzeroes(input_num: u64) -> u32 {
+    unsafe { _tzcnt_u64(input_num) as u32 }
+}
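These wrappers are only defined for x86 targets, yet the NEON code path also needs them. One possible set of fallbacks (a sketch under that assumption, not part of the patch) is to lean on Rust's built-in integer intrinsics, which lower to POPCNT/TZCNT-class instructions where the hardware has them:

    // Possible portable fallbacks for non-x86 targets (illustrative only).
    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    pub fn hamming(input_num: u64) -> u32 {
        input_num.count_ones()
    }

    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
    pub fn trailingzeroes(input_num: u64) -> u32 {
        input_num.trailing_zeros()
    }

    #[cfg(not(target_arch = "x86_64"))]
    pub fn add_overflow(value1: u64, value2: u64, result: &mut u64) -> bool {
        let (res, overflowed) = value1.overflowing_add(value2);
        *result = res;
        overflowed
    }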
diff --git a/src/sse42/generator.rs b/src/sse42/generator.rs
new file mode 100644
index 00000000..e6636585
--- /dev/null
+++ b/src/sse42/generator.rs
@@ -0,0 +1,51 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+use crate::value::generator::ESCAPED;
+use std::io;
+
+#[inline(always)]
+pub unsafe fn write_str_simd<W>(writer: &mut W, string: &mut &[u8], len: &mut usize, idx: &mut usize) -> io::Result<()> where W: std::io::Write {
+    let zero = _mm_set1_epi8(0);
+    let lower_quote_range = _mm_set1_epi8(0x1F as i8);
+    let quote = _mm_set1_epi8(b'"' as i8);
+    let backslash = _mm_set1_epi8(b'\\' as i8);
+    while *len - *idx > 16 {
+        // Load 16 bytes of data;
+        #[allow(clippy::cast_ptr_alignment)]
+        let data: __m128i = _mm_loadu_si128(string.as_ptr().add(*idx) as *const __m128i);
+        // Test the data against being backslash and quote.
+        let bs_or_quote = _mm_or_si128(
+            _mm_cmpeq_epi8(data, backslash),
+            _mm_cmpeq_epi8(data, quote)
+        );
+        // Now mask the data with the quote range (0x1F).
+        let in_quote_range = _mm_and_si128(data, lower_quote_range);
+        // then test if the data is unchanged, i.e. XOR it with the
+        // masked value; any byte that was inside the quote range will
+        // be zero now.
+        let is_unchanged = _mm_xor_si128(data, in_quote_range);
+        let in_range = _mm_cmpeq_epi8(is_unchanged, zero);
+        let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range));
+        if quote_bits != 0 {
+            let quote_dist = quote_bits.trailing_zeros() as usize;
+            stry!(writer.write_all(&string[0..*idx + quote_dist]));
+            let ch = string[*idx + quote_dist];
+            match ESCAPED[ch as usize] {
+                b'u' => stry!(write!(writer, "\\u{:04x}", ch)),
+
+                escape => stry!(writer.write_all(&[b'\\', escape])),
+            };
+            *string = &string[*idx + quote_dist + 1..];
+            *idx = 0;
+            *len = string.len();
+        } else {
+            *idx += 16;
+        }
+    }
+    stry!(writer.write_all(&string[0..*idx]));
+    *string = &string[*idx..];
+    Ok(())
+}
diff --git a/src/sse42/mod.rs b/src/sse42/mod.rs
index 30c55c86..ac608ae2 100644
--- a/src/sse42/mod.rs
+++ b/src/sse42/mod.rs
@@ -1,3 +1,4 @@
 pub mod deser;
 pub mod stage1;
-pub mod utf8check;
\ No newline at end of file
+pub mod utf8check;
+pub mod generator;
\ No newline at end of file
diff --git a/src/stage2.rs b/src/stage2.rs
index aaa51ad8..3bbd71c1 100644
--- a/src/stage2.rs
+++ b/src/stage2.rs
@@ -1,9 +1,11 @@
 #![allow(dead_code)]
+use crate::charutils::*;
 #[cfg(target_feature = "avx2")]
 use crate::avx2::stage1::SIMDJSON_PADDING;
-use crate::charutils::*;
-#[cfg(not(target_feature = "avx2"))]
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))]
 use crate::sse42::stage1::SIMDJSON_PADDING;
+#[cfg(target_feature = "neon")]
+use crate::neon::stage1::SIMDJSON_PADDING;
 use crate::{Deserializer, Error, ErrorType, Result};
 
 #[cfg_attr(not(feature = "no-inline"), inline(always))]
diff --git a/src/stringparse.rs b/src/stringparse.rs
index 0ff8a078..9a7e3b8f 100644
--- a/src/stringparse.rs
+++ b/src/stringparse.rs
@@ -73,3 +73,9 @@ pub fn handle_unicode_codepoint(
     let offset: usize = codepoint_to_utf8(code_point, dst_ptr);
     Ok((offset, src_offset))
 }
+
+// Holds the locations of backslashes and quotes within a block.
+pub struct ParseStringHelper {
+    pub bs_bits: u32,
+    pub quote_bits: u32,
+}
diff --git a/src/value.rs b/src/value.rs
index 06245878..c5d5aa87 100644
--- a/src/value.rs
+++ b/src/value.rs
@@ -11,7 +11,7 @@
 /// we do not require prior knowledge sbout string comtent to to take advantage
 /// of it.
pub mod borrowed; -mod generator; +pub(crate) mod generator; pub mod owned; pub use self::borrowed::{to_value as to_borrowed_value, Value as BorrowedValue}; diff --git a/src/value/generator.rs b/src/value/generator.rs index 55824f57..ebdb0a88 100644 --- a/src/value/generator.rs +++ b/src/value/generator.rs @@ -5,19 +5,21 @@ // https://github.com/maciejhirsz/json-rust/blob/master/src/codegen.rs use crate::value::ValueTrait; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; use std::io; use std::io::Write; use std::marker::PhantomData; use std::ptr; +use crate::*; + #[cfg(target_feature = "avx2")] -const AVX2_PRESENT : bool = true; -#[cfg(not(target_feature = "avx2"))] -const AVX2_PRESENT : bool = false; +use crate::avx2::generator::*; + +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(target_feature = "avx2")))] +use crate::sse42::generator::*; + +#[cfg(target_feature = "neon")] +use crate::neon::generator::*; const QU: u8 = b'"'; const BS: u8 = b'\\'; @@ -30,7 +32,7 @@ const UU: u8 = b'u'; const __: u8 = 0; // Look up table for characters that need escaping in a product string -static ESCAPED: [u8; 256] = [ +pub(crate) static ESCAPED: [u8; 256] = [ // 0 1 2 3 4 5 6 7 8 9 A B C D E F UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 @@ -106,85 +108,7 @@ pub trait BaseGenerator { // quote characters that gives us a bitmask of 0x1f for that // region, only quote (`"`) and backslash (`\`) are not in // this range. - if AVX2_PRESENT { - let zero = _mm256_set1_epi8(0); - let lower_quote_range = _mm256_set1_epi8(0x1F as i8); - let quote = _mm256_set1_epi8(b'"' as i8); - let backslash = _mm256_set1_epi8(b'\\' as i8); - while len - idx >= 32 { - // Load 32 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m256i = _mm256_loadu_si256(string.as_ptr().add(idx) as *const __m256i); - // Test the data against being backslash and quote. - let bs_or_quote = _mm256_or_si256( - _mm256_cmpeq_epi8(data, backslash), - _mm256_cmpeq_epi8(data, quote), - ); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm256_and_si256(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. 
- let is_unchanged = _mm256_xor_si256(data, in_quote_range); - let in_range = _mm256_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm256_movemask_epi8(_mm256_or_si256(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 32; - } - } - } - // The case where we have a 16+ byte block - // we repeate the same logic as above but with - // only 16 bytes - let zero = _mm_set1_epi8(0); - let lower_quote_range = _mm_set1_epi8(0x1F as i8); - let quote = _mm_set1_epi8(b'"' as i8); - let backslash = _mm_set1_epi8(b'\\' as i8); - while len - idx > 16 { - // Load 16 bytes of data; - #[allow(clippy::cast_ptr_alignment)] - let data: __m128i = _mm_loadu_si128(string.as_ptr().add(idx) as *const __m128i); - // Test the data against being backslash and quote. - let bs_or_quote = - _mm_or_si128(_mm_cmpeq_epi8(data, backslash), _mm_cmpeq_epi8(data, quote)); - // Now mask the data with the quote range (0x1F). - let in_quote_range = _mm_and_si128(data, lower_quote_range); - // then test of the data is unchanged. aka: xor it with the - // Any field that was inside the quote range it will be zero - // now. - let is_unchanged = _mm_xor_si128(data, in_quote_range); - let in_range = _mm_cmpeq_epi8(is_unchanged, zero); - let quote_bits = _mm_movemask_epi8(_mm_or_si128(bs_or_quote, in_range)); - if quote_bits != 0 { - let quote_dist = quote_bits.trailing_zeros() as usize; - stry!(self.get_writer().write_all(&string[0..idx + quote_dist])); - let ch = string[idx + quote_dist]; - match ESCAPED[ch as usize] { - b'u' => stry!(write!(self.get_writer(), "\\u{:04x}", ch)), - - escape => stry!(self.write(&[b'\\', escape])), - }; - string = &string[idx + quote_dist + 1..]; - idx = 0; - len = string.len(); - } else { - idx += 16; - } - } - stry!(self.get_writer().write_all(&string[0..idx])); - string = &string[idx..]; + stry!(write_str_simd(self.get_writer(), &mut string, &mut len, &mut idx)); } // Legacy code to handle the remainder of the code for (index, ch) in string.iter().enumerate() {