From f759a7862be30f8e334a9b529ecab5169ba56c1a Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 26 May 2021 08:13:09 +0200 Subject: [PATCH 01/69] refactor: check_remainder() --- src/implementation/algorithm.rs | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 85d881c4..c68ddd75 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -200,6 +200,21 @@ macro_rules! algorithm_simd { panic!("Unsupported number of chunks"); } } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + #[allow(const_err)] // the same, but for Rust 1.38.0 + unsafe fn check_remainder(&mut self, input: *const u8, len: usize) { + let mut tmpbuf = TempSimdChunk::new(); + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + input, + tmpbuf.0.as_mut_ptr(), + len, + ); + let simd_input = SimdInput::new(&tmpbuf.0); + self.check_utf8(simd_input); + } } /// Validation implementation for CPUs supporting the SIMD extension (see module). @@ -241,14 +256,7 @@ macro_rules! algorithm_simd { } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx); } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -331,15 +339,7 @@ macro_rules! algorithm_simd { } } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx) } algorithm.check_incomplete_pending(); if algorithm.has_error() { From e562d487e3d12d406d555161b6562fd723fca7f7 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 26 May 2021 08:29:34 +0200 Subject: [PATCH 02/69] bit mor idiomatic --- src/implementation/algorithm.rs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index c68ddd75..3969e349 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -186,18 +186,22 @@ macro_rules! algorithm_simd { unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 - if input.vals.len() == 2 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.incomplete = Self::is_incomplete(input.vals[1]); - } else if input.vals.len() == 4 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.check_bytes(input.vals[2]); - self.check_bytes(input.vals[3]); - self.incomplete = Self::is_incomplete(input.vals[3]); - } else { - panic!("Unsupported number of chunks"); + match input.vals.len() { + 2 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.incomplete = Self::is_incomplete(input.vals[1]); + } + 4 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.check_bytes(input.vals[2]); + self.check_bytes(input.vals[3]); + self.incomplete = Self::is_incomplete(input.vals[3]); + } + _ => { + panic!("Unsupported number of chunks"); + } } } From 1edb3b27786e765844d64aba0e75caabaa23a214 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 26 May 2021 12:02:49 +0200 Subject: [PATCH 03/69] WIP: remainder optimization --- src/implementation/aarch64/neon.rs | 12 +++++++ src/implementation/algorithm.rs | 55 ++++++++++++++++++++++-------- src/implementation/helpers.rs | 31 +++++++++++++++++ 3 files changed, 83 insertions(+), 15 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 8fb05057..b932b596 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -100,6 +100,17 @@ impl SimdU8Value { Self::from(dst.assume_init()) } + #[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0u8; 16]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[inline] #[allow(clippy::too_many_arguments)] unsafe fn lookup_16( @@ -233,6 +244,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("not_used"); algorithm_simd!("not_used"); diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 3969e349..f0eab573 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -209,15 +209,36 @@ macro_rules! algorithm_simd { #[inline] #[allow(unconditional_panic)] // does not panic because len is checked #[allow(const_err)] // the same, but for Rust 1.38.0 - unsafe fn check_remainder(&mut self, input: *const u8, len: usize) { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input, - tmpbuf.0.as_mut_ptr(), - len, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - self.check_utf8(simd_input); + unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) { + let orig_len = len; + let mut len = len; + const SIMD_SIZE: usize = core::mem::size_of::(); + while len > SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if simd_val.is_ascii() { + if orig_len == len { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = SimdU8Value::load_partial(input, len); + if simd_val.is_ascii() { + if orig_len < SIMD_SIZE { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + } } } @@ -241,13 +262,17 @@ macro_rules! algorithm_simd { let mut idx: usize = 0; let iter_lim = len - (len % SIMD_CHUNK_SIZE); - while idx < iter_lim { - let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); - idx += SIMD_CHUNK_SIZE; - if !simd_input.is_ascii() { - algorithm.check_block(simd_input); - break; + 'outer: loop { + while idx < iter_lim { + let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); + idx += SIMD_CHUNK_SIZE; + if !simd_input.is_ascii() { + algorithm.check_block(simd_input); + break 'outer; + } } + // TODO: check remainder ASCII + break; } while idx < iter_lim { diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index a6bd693a..0a8bb36d 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -37,6 +37,37 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er validate_utf8_at_offset(input, offset).unwrap_err() } +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + mut src: *const u8, + mut dest: *mut u8, + mut len: usize, +) { + #[inline] + unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + .write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(4); + *dest = dest.offset(4); + } + if len >= 8 { + memcpy_u32(&mut src, &mut dest); + memcpy_u32(&mut src, &mut dest); + len -= 8; + } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} #[allow(dead_code)] #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( From a7efda182145d8da039a16f372a7f3755e805f85 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 29 May 2021 21:29:32 +0200 Subject: [PATCH 04/69] aarch64 remainder processing: use lane loads --- src/implementation/aarch64/neon.rs | 69 +++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index b932b596..a215012c 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -102,13 +102,68 @@ impl SimdU8Value { #[inline] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { - let mut tmpbuf = [0u8; 16]; - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( - ptr, - tmpbuf.as_mut_ptr(), - len, - ); - Self::load_from(tmpbuf.as_ptr()) + let mut res = Self::splat0(); + if len == 0 { + } else if len < 4 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); + if len > 1 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(1), res.0, 1); + } + if len > 2 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); + } + } else if len < 8 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + if len > 4 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); + } + if len > 5 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(5), res.0, 5); + } + if len > 6 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); + } + } else if len < 12 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + if len > 8 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); + } + if len > 9 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(9), res.0, 9); + } + if len > 10 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); + } + } else { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + if len > 12 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); + } + if len > 13 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(13), res.0, 13); + } + if len > 14 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); + } + } + res } #[inline] From b3c763680b736d2bff0a415ad5e6355dc03ba5a0 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 29 May 2021 21:30:02 +0200 Subject: [PATCH 05/69] fix --- src/implementation/algorithm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index f0eab573..e601ab56 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -213,7 +213,7 @@ macro_rules! algorithm_simd { let orig_len = len; let mut len = len; const SIMD_SIZE: usize = core::mem::size_of::(); - while len > SIMD_SIZE { + while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); if simd_val.is_ascii() { From defd4d28f5bf4c47c3205952b49f4ace0fa409db Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 29 May 2021 21:34:26 +0200 Subject: [PATCH 06/69] SimdInput::new_partial() implementation --- src/implementation/algorithm.rs | 43 +++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index e601ab56..5b5dc516 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -567,6 +567,49 @@ macro_rules! simd_input_128_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + if len < 16 { + Self { + vals: [ + SimdU8Value::load_partial(ptr, len), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(16), len - 16), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_partial(ptr.add(32), len - 32), + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + SimdU8Value::load_partial(ptr.add(48), len - 48), + ], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { From 5df7b381840bf9a0f63da6c57e6e2e31dac3c520 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 29 May 2021 21:35:18 +0200 Subject: [PATCH 07/69] ascii special-casing --- src/implementation/algorithm.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 5b5dc516..2a2f5814 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -271,8 +271,13 @@ macro_rules! algorithm_simd { break 'outer; } } - // TODO: check remainder ASCII - break; + if idx < len { + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); + if !simd_input.is_ascii() { + break; + } + } + return Ok(()); } while idx < iter_lim { @@ -286,6 +291,8 @@ macro_rules! algorithm_simd { if idx < len { algorithm.check_remainder(input.as_ptr().add(idx), len - idx); + // let input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); + // algorithm.check_utf8(input); } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -339,7 +346,13 @@ macro_rules! algorithm_simd { } idx += SIMD_CHUNK_SIZE; } - break; + if idx < len { + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); + if !simd_input.is_ascii() { + break; + } + } + return Ok(()); } else { while idx < iter_lim { if PREFETCH { From 7f0cf3b5e3556ac067508039d976ccf17108df52 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 12:37:28 +0200 Subject: [PATCH 08/69] ascii-optimized remainder checking --- src/implementation/algorithm.rs | 36 ++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 2a2f5814..28ada5c9 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -240,6 +240,30 @@ macro_rules! algorithm_simd { } } } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + #[allow(const_err)] // the same, but for Rust 1.38.0 + unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); + while len >= SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = SimdU8Value::load_partial(input, len); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + } + } } /// Validation implementation for CPUs supporting the SIMD extension (see module). @@ -347,12 +371,14 @@ macro_rules! algorithm_simd { idx += SIMD_CHUNK_SIZE; } if idx < len { - let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); - if !simd_input.is_ascii() { - break; - } + algorithm.check_remainder_ascii(input.as_ptr().add(idx), len - idx); + algorithm.check_incomplete_pending(); } - return Ok(()); + return if algorithm.has_error() { + Err(idx) + } else { + Ok(()) + }; } else { while idx < iter_lim { if PREFETCH { From 2cbf8a9c9d4b11b7bf1cf6819c4859dbf8f2b616 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 12:37:53 +0200 Subject: [PATCH 09/69] test-bed for different partial load implementations --- src/implementation/aarch64/neon.rs | 269 ++++++++++++++++++++++++++++- 1 file changed, 268 insertions(+), 1 deletion(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index a215012c..8e6534ae 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -101,7 +101,23 @@ impl SimdU8Value { } #[inline] - unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::load_partial_direct_16(ptr, len) + } + + #[inline] + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0u8; 16]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + + #[inline] + unsafe fn load_partial_direct(ptr: *const u8, len: usize) -> Self { let mut res = Self::splat0(); if len == 0 { } else if len < 4 { @@ -166,6 +182,257 @@ impl SimdU8Value { res } + #[inline] + unsafe fn load_partial_direct_16(ptr: *const u8, len: usize) -> Self { + let mut res = Self::splat0(); + if len == 0 { + } else if len < 4 { + if len == 1 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); + } else { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + if len > 2 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); + } + } + } else if len < 8 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + if len == 5 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); + } else if len > 5 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + if len > 6 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); + } + } + } else if len < 12 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + if len == 9 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); + } else if len > 9 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + if len > 10 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); + } + } + } else { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + if len == 13 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); + } else if len > 13 { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + if len > 14 { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); + } + } + } + res + } + + #[inline] + unsafe fn load_partial_direct_match(ptr: *const u8, len: usize) -> Self { + let mut res = Self::splat0(); + match len { + 0 => {} + 1 => { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); + } + 2 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 3 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); + } + 4 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 5 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); + } + 6 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 7 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); + } + 8 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 9 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); + } + 10 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + } + 11 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); + } + 12 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 13 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); + } + 14 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + } + 15 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); + } + _ => { + // not allowed + debug_assert!(false); + } + } + res + } + #[inline] #[allow(clippy::too_many_arguments)] unsafe fn lookup_16( From 42f1acde033abab919a9448aed7c00da84d7f7e9 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 13:44:08 +0200 Subject: [PATCH 10/69] aarch64: select partial load implementation --- src/implementation/aarch64/neon.rs | 162 ----------------------------- 1 file changed, 162 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 8e6534ae..80724ab8 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -102,168 +102,6 @@ impl SimdU8Value { #[inline] pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { - Self::load_partial_direct_16(ptr, len) - } - - #[inline] - unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { - let mut tmpbuf = [0u8; 16]; - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( - ptr, - tmpbuf.as_mut_ptr(), - len, - ); - Self::load_from(tmpbuf.as_ptr()) - } - - #[inline] - unsafe fn load_partial_direct(ptr: *const u8, len: usize) -> Self { - let mut res = Self::splat0(); - if len == 0 { - } else if len < 4 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); - if len > 1 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(1), res.0, 1); - } - if len > 2 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); - } - } else if len < 8 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - if len > 4 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); - } - if len > 5 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(5), res.0, 5); - } - if len > 6 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); - } - } else if len < 12 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - if len > 8 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); - } - if len > 9 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(9), res.0, 9); - } - if len > 10 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); - } - } else { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( - ptr.add(8).cast(), - core::mem::transmute(res.0), - 2, - )); - if len > 12 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); - } - if len > 13 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(13), res.0, 13); - } - if len > 14 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); - } - } - res - } - - #[inline] - unsafe fn load_partial_direct_16(ptr: *const u8, len: usize) -> Self { - let mut res = Self::splat0(); - if len == 0 { - } else if len < 4 { - if len == 1 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); - } else { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - if len > 2 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); - } - } - } else if len < 8 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - if len == 5 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); - } else if len > 5 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( - ptr.add(4).cast(), - core::mem::transmute(res.0), - 2, - )); - if len > 6 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); - } - } - } else if len < 12 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - if len == 9 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); - } else if len > 9 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( - ptr.add(8).cast(), - core::mem::transmute(res.0), - 4, - )); - if len > 10 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); - } - } - } else { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( - ptr.cast(), - core::mem::transmute(res.0), - 0, - )); - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( - ptr.add(8).cast(), - core::mem::transmute(res.0), - 2, - )); - if len == 13 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); - } else if len > 13 { - res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( - ptr.add(12).cast(), - core::mem::transmute(res.0), - 6, - )); - if len > 14 { - res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); - } - } - } - res - } - - #[inline] - unsafe fn load_partial_direct_match(ptr: *const u8, len: usize) -> Self { let mut res = Self::splat0(); match len { 0 => {} From 8e2706f516216a9f647b0a0d29f475da2955ff01 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 15:18:49 +0200 Subject: [PATCH 11/69] cleanup --- src/implementation/algorithm.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 28ada5c9..e0eb8c63 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -315,8 +315,6 @@ macro_rules! algorithm_simd { if idx < len { algorithm.check_remainder(input.as_ptr().add(idx), len - idx); - // let input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); - // algorithm.check_utf8(input); } algorithm.check_incomplete_pending(); if algorithm.has_error() { From 695c4f8357db3613de2ca8629c8380cf8c492489 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 17:54:44 +0200 Subject: [PATCH 12/69] remove unnecessary pub --- src/implementation/aarch64/neon.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 80724ab8..17966445 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -101,7 +101,7 @@ impl SimdU8Value { } #[inline] - pub unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { let mut res = Self::splat0(); match len { 0 => {} From fa25b4676ef3dc3ae9b303dc3e10f0b8caecf710 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 17:55:57 +0200 Subject: [PATCH 13/69] basic x86 partial load impl. --- src/implementation/algorithm.rs | 18 ++++++++++++++++++ src/implementation/x86/avx2.rs | 12 ++++++++++++ src/implementation/x86/sse42.rs | 12 ++++++++++++ 3 files changed, 42 insertions(+) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index e0eb8c63..8cc197cb 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -679,6 +679,24 @@ macro_rules! simd_input_256_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + if len < 32 { + Self { + vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(32), len - 32), + ], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 8232f571..9bdcb0c0 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -103,6 +103,18 @@ impl SimdU8Value { Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>())) } + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 32]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "avx2")] #[inline] unsafe fn lookup_16( diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index f9140d50..496361fe 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -99,6 +99,18 @@ impl SimdU8Value { Self::from(_mm_loadu_si128(ptr.cast::<__m128i>())) } + #[target_feature(enable = "sse4.2")] + #[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 16]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "sse4.2")] #[inline] unsafe fn lookup_16( From 3d718220c22e0564e5fd6dc19f280d2f657bdfbe Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 18:56:12 +0200 Subject: [PATCH 14/69] clippy --- src/implementation/algorithm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 8cc197cb..aa65766b 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -210,9 +210,9 @@ macro_rules! algorithm_simd { #[allow(unconditional_panic)] // does not panic because len is checked #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); let orig_len = len; let mut len = len; - const SIMD_SIZE: usize = core::mem::size_of::(); while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); From d4d3ccd97d7dd6191431acd35da07cf93bb42560 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 18:56:27 +0200 Subject: [PATCH 15/69] add missing helper fns --- src/implementation/helpers.rs | 68 ++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index 0a8bb36d..bbef8733 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -37,6 +37,24 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er validate_utf8_at_offset(input, offset).unwrap_err() } +#[inline] +unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + .write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(4); + *dest = dest.offset(4); +} + +#[inline] +unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + .write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(8); + *dest = dest.offset(8); +} + #[allow(dead_code)] #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( @@ -44,14 +62,6 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( mut dest: *mut u8, mut len: usize, ) { - #[inline] - unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) { - #[allow(clippy::cast_ptr_alignment)] - dest.cast::() - .write_unaligned(src.cast::().read_unaligned()); - *src = src.offset(4); - *dest = dest.offset(4); - } if len >= 8 { memcpy_u32(&mut src, &mut dest); memcpy_u32(&mut src, &mut dest); @@ -68,22 +78,42 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( len -= 1; } } + #[allow(dead_code)] #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const -pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_32( mut src: *const u8, mut dest: *mut u8, mut len: usize, ) { - // This gets properly auto-vectorized on AVX 2 and SSE 4.2 - #[inline] - unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { - #[allow(clippy::cast_ptr_alignment)] - dest.cast::() - .write_unaligned(src.cast::().read_unaligned()); - *src = src.offset(8); - *dest = dest.offset(8); + if len >= 16 { + memcpy_u64(&mut src, &mut dest); + memcpy_u64(&mut src, &mut dest); + len -= 16; + } + if len >= 8 { + memcpy_u64(&mut src, &mut dest); + len -= 8; + } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} + +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + mut src: *const u8, + mut dest: *mut u8, + mut len: usize, +) { if len >= 32 { memcpy_u64(&mut src, &mut dest); memcpy_u64(&mut src, &mut dest); @@ -100,6 +130,10 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( memcpy_u64(&mut src, &mut dest); len -= 8; } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } while len > 0 { *dest = *src; src = src.offset(1); From d08c88b3870dd438866b7934bbf6160e619f1a4b Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 30 May 2021 19:12:08 +0200 Subject: [PATCH 16/69] clippy --- src/implementation/x86/avx2.rs | 1 + src/implementation/x86/sse42.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 9bdcb0c0..e402e145 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -268,6 +268,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = true; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; simd_input_256_bit!("avx2"); algorithm_simd!("avx2"); diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index 496361fe..e0d08675 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -252,6 +252,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("sse4.2"); algorithm_simd!("sse4.2"); From 0c391490e7f6b99cacc3e6494f05acc1dc4d3f83 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 13:59:37 +0200 Subject: [PATCH 17/69] add unit test for AVX 2 load_partial() --- src/implementation/x86/avx2.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index e402e145..bc93f3a4 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -267,6 +267,31 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::(), _MM_HINT_T0); } +mod test { + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + let arr = [ + 1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + unsafe { + for len in 0..32 { + let loaded_arr: [u8; 32] = + core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = true; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; From 198e11ceb646ca926681d17926f4c61be5023bfa Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 14:01:11 +0200 Subject: [PATCH 18/69] Implement AVX 2 simd value Display and LowerHex traits for debugging --- src/implementation/x86/avx2.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index bc93f3a4..3e536bfd 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -248,6 +248,24 @@ impl From<__m256i> for SimdU8Value { } } +impl core::fmt::Display for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + let arr: [u8; 32] = core::mem::transmute(self.0); + write!(f, "{:?}", arr) + } + } +} + +impl core::fmt::LowerHex for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + let arr: [u8; 32] = core::mem::transmute(self.0); + write!(f, "{:x?}", arr) + } + } +} + impl Utf8CheckAlgorithm { #[target_feature(enable = "avx2")] #[inline] From 57b0790c778855ea3cb695d47090d3cc61135863 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 14:01:47 +0200 Subject: [PATCH 19/69] add load_partial_direct() method --- src/implementation/x86/avx2.rs | 64 +++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 3e536bfd..bc107f09 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -4,17 +4,21 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -103,9 +107,53 @@ impl SimdU8Value { Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>())) } + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn vecmask_from_bitmask(mask: u8) -> Self { + let vshift_count = _mm256_set_epi32(24, 25, 26, 27, 28, 29, 30, 31); + let bcast = _mm256_set1_epi32(i32::from(mask)); + let shifted = _mm256_sllv_epi32(bcast, vshift_count); // high bit of each element = corresponding bit of the mask + Self::from(shifted) + } + #[target_feature(enable = "avx2")] #[inline] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::load_partial_direct(ptr, len) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_direct(mut ptr: *const u8, len: usize) -> Self { + if len == 0 { + return Self::splat0(); + } + let sel_mask = 1 << (len / 4); + let mask = (sel_mask - 1) as u8; + let mut res = _mm256_maskload_epi32(ptr.cast(), Self::vecmask_from_bitmask(mask).0); + let remainder = len % 4; + if remainder != 0 { + ptr = ptr.add((len - len % 4) as usize); + let remaining_bytes = match remainder { + 1 => u32::from(*ptr), + 2 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8, + 3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16, + _ => 0, + }; + #[allow(clippy::cast_possible_wrap)] + let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32); + res = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(res), + _mm256_castsi256_ps(remaining_vec), + _mm256_castsi256_ps(Self::vecmask_from_bitmask(sel_mask).0), + )); + } + Self::from(res) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { let mut tmpbuf = [0_u8; 32]; crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32( ptr, From 38b58de365e5be84885635bb2bb2e06ece7bc04e Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 14:10:43 +0200 Subject: [PATCH 20/69] only run avx2 masked load test if AVX 2 is available --- src/implementation/x86/avx2.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index bc107f09..93ed101c 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -334,11 +334,18 @@ unsafe fn simd_prefetch(ptr: *const u8) { } mod test { + #[cfg(not(features = "std"))] + extern crate std; + #[allow(unused_imports)] use super::*; #[test] pub fn masked_load() { + if std::is_x86_feature_detected!("avx2") { + return; + } + let arr = [ 1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, From b6b23882b2f5be93cdc9098e59901a8db3329d01 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 14:21:31 +0200 Subject: [PATCH 21/69] fix avx2 detection --- src/implementation/x86/avx2.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 93ed101c..e0efeaa1 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -342,7 +342,7 @@ mod test { #[test] pub fn masked_load() { - if std::is_x86_feature_detected!("avx2") { + if !std::is_x86_feature_detected!("avx2") { return; } From 44f8fd101b3d3e01ff794110d8da2da347a19d5e Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 18:17:55 +0200 Subject: [PATCH 22/69] prevent remainder loop unrolling in sse 4.2 which caused the methods not to be inlined. --- src/implementation/aarch64/neon.rs | 1 + src/implementation/algorithm.rs | 11 +++++++++++ src/implementation/x86/avx2.rs | 1 + src/implementation/x86/sse42.rs | 1 + 4 files changed, 14 insertions(+) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 17966445..325386a0 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -404,6 +404,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("not_used"); diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index aa65766b..7e91d8c7 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -213,6 +213,12 @@ macro_rules! algorithm_simd { const SIMD_SIZE: usize = core::mem::size_of::(); let orig_len = len; let mut len = len; + + // necessary, otherwise the compiler needlessly unrolls the loop, + // the function becomes to big and is no longer inlined for SSE 4.2 + if PREVENT_REMAINDER_LOOP_UNROLLING { + assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); + } while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); @@ -247,6 +253,11 @@ macro_rules! algorithm_simd { #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { const SIMD_SIZE: usize = core::mem::size_of::(); + + // prevent loop unrolling which can cause the function to be too big for inlining + if PREVENT_REMAINDER_LOOP_UNROLLING { + assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); + } while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 93ed101c..c54db356 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -366,6 +366,7 @@ mod test { } const PREFETCH: bool = true; +const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; simd_input_256_bit!("avx2"); diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index e0d08675..99f11597 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -252,6 +252,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("sse4.2"); From cb7ed197bd91db6d40a50dbb1a5aa5ef079a2cad Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 1 Jun 2021 18:18:11 +0200 Subject: [PATCH 23/69] only implement Debug/LowerHex for tests --- src/implementation/x86/avx2.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index c54db356..f158069e 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -296,6 +296,7 @@ impl From<__m256i> for SimdU8Value { } } +#[cfg(test)] impl core::fmt::Display for SimdU8Value { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { unsafe { @@ -305,6 +306,7 @@ impl core::fmt::Display for SimdU8Value { } } +#[cfg(test)] impl core::fmt::LowerHex for SimdU8Value { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { unsafe { @@ -333,6 +335,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::(), _MM_HINT_T0); } +#[cfg(test)] mod test { #[cfg(not(features = "std"))] extern crate std; From 10df3366190d4a927cfd9c13e7e5a9ffba79549c Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 07:27:48 +0200 Subject: [PATCH 24/69] expand benchmarks --- bench/src/lib.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 218055b3..9aade97b 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -2,6 +2,7 @@ use criterion::{measurement::Measurement, BenchmarkGroup, BenchmarkId, Criterion use simdutf8::basic::from_utf8 as basic_from_utf8; use simdutf8::compat::from_utf8 as compat_from_utf8; +use std::collections::HashSet; use std::str::from_utf8 as std_from_utf8; #[cfg(feature = "simdjson")] @@ -113,7 +114,24 @@ fn get_valid_slice_of_len_or_more_aligned( fn bench(c: &mut Criterion, name: &str, bytes: &[u8], bench_fn: BenchFn) { let mut group = c.benchmark_group(name); - for i in [1, 8, 64, 512, 4096, 65536, 131072].iter() { + let mut sizes = HashSet::new(); + for i in 1..129 { + let alignment = Alignment { + boundary: 64, + offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case + }; + let (vec, offset) = get_valid_slice_of_len_or_more_aligned(bytes, i, alignment); + let slice = &vec[offset..]; + assert_eq!( + (slice.as_ptr() as usize) % alignment.boundary, + alignment.offset + ); + if !sizes.contains(&slice.len()) { + bench_input(&mut group, slice, true, true, bench_fn); + sizes.insert(slice.len()); + } + } + for i in [512, 4096, 65536, 131072].iter() { let alignment = Alignment { boundary: 64, offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case From bab2fbffde7f2c4bc743b4a8be3abbdf19df307e Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 07:28:27 +0200 Subject: [PATCH 25/69] x86: make delegation to std for small inputs in-code configurable --- src/implementation/x86/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 19954956..36108f09 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -9,6 +9,8 @@ use super::helpers::SIMD_CHUNK_SIZE; // validate_utf8_basic() std: implementation auto-selection +const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = false; + #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] pub(crate) unsafe fn validate_utf8_basic( @@ -27,7 +29,7 @@ pub(crate) unsafe fn validate_utf8_basic( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_basic_fallback(input); } @@ -53,7 +55,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_basic_fallback(input); } @@ -76,7 +78,7 @@ unsafe fn validate_utf8_basic_avx2( pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_basic_fallback(input); } @@ -123,7 +125,7 @@ pub(crate) unsafe fn validate_utf8_compat( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_compat_fallback(input); } @@ -149,7 +151,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_compat_fallback(input); } @@ -172,7 +174,7 @@ unsafe fn validate_utf8_compat_avx2( pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { return super::validate_utf8_compat_fallback(input); } From 0bb570dc8439e0b571b551ec4283c843e4698096 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 11:20:07 +0200 Subject: [PATCH 26/69] comment --- src/implementation/x86/avx2.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index a4c928d7..88e769e6 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -140,6 +140,7 @@ impl SimdU8Value { 3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16, _ => 0, }; + // blend vec with 4-byte chunks and last incomplete chunk together #[allow(clippy::cast_possible_wrap)] let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32); res = _mm256_castps_si256(_mm256_blendv_ps( From cbae8480df7f63eb6e523017a9b0a69bf5567fd1 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 11:20:29 +0200 Subject: [PATCH 27/69] x86: delegate inputs < 9 bytes to std --- src/implementation/x86/mod.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 36108f09..dfa41ddd 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -4,12 +4,10 @@ pub(crate) mod avx2; #[allow(dead_code)] pub(crate) mod sse42; -#[allow(unused_imports)] -use super::helpers::SIMD_CHUNK_SIZE; - // validate_utf8_basic() std: implementation auto-selection -const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = false; +const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; +const SMALL_STRING_LIMIT: usize = 9; #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] @@ -29,7 +27,7 @@ pub(crate) unsafe fn validate_utf8_basic( (fun)(input) } - if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -55,7 +53,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -78,7 +76,7 @@ unsafe fn validate_utf8_basic_avx2( pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -125,7 +123,7 @@ pub(crate) unsafe fn validate_utf8_compat( (fun)(input) } - if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -151,7 +149,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -174,7 +172,7 @@ unsafe fn validate_utf8_compat_avx2( pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } From 522949283565c0a9537b3bb044a15219e036dd3d Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 14:50:53 +0200 Subject: [PATCH 28/69] Dsiplay for SimdU8Value --- src/implementation/helpers.rs | 34 ++++++++++++++++++++++++++++++++++ src/implementation/x86/avx2.rs | 20 -------------------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index bbef8733..9aedb81a 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -180,3 +180,37 @@ impl TempSimdChunkA32 { pub(crate) struct SimdU8Value(pub(crate) T) where T: Copy; + +#[cfg(test)] +impl core::fmt::Display for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} + +#[cfg(test)] +impl core::fmt::LowerHex for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 88e769e6..b42e230c 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -297,26 +297,6 @@ impl From<__m256i> for SimdU8Value { } } -#[cfg(test)] -impl core::fmt::Display for SimdU8Value { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - unsafe { - let arr: [u8; 32] = core::mem::transmute(self.0); - write!(f, "{:?}", arr) - } - } -} - -#[cfg(test)] -impl core::fmt::LowerHex for SimdU8Value { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - unsafe { - let arr: [u8; 32] = core::mem::transmute(self.0); - write!(f, "{:x?}", arr) - } - } -} - impl Utf8CheckAlgorithm { #[target_feature(enable = "avx2")] #[inline] From 2952db3a4c11ef2325f6b1a5191378d5c6e93f6c Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 2 Jun 2021 14:51:21 +0200 Subject: [PATCH 29/69] SSE 4.2 load_partial() --- src/implementation/x86/sse42.rs | 187 +++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 6 deletions(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index 99f11597..e60deb3d 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -4,15 +4,17 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, + _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128, + _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, + _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, + _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128, + _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, + _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -101,7 +103,149 @@ impl SimdU8Value { #[target_feature(enable = "sse4.2")] #[inline] + #[allow(clippy::too_many_lines)] + #[allow(clippy::cast_ptr_alignment)] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::from(match len { + 1 => _mm_setr_epi8( + ptr.cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 2 => _mm_setr_epi16(ptr.cast::().read_unaligned(), 0, 0, 0, 0, 0, 0, 0), + 3 => _mm_setr_epi8( + ptr.cast::().read_unaligned(), + ptr.add(1).cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 4 => _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0), // assembly??? + 5 => { + let val = _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0); + _mm_insert_epi8(val, i32::from(ptr.add(4).cast::().read_unaligned()), 4) + } + 6 => _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ), + 7 => { + let val = _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ); + _mm_insert_epi8(val, i32::from(ptr.add(6).cast::().read_unaligned()), 6) + } + 8 => _mm_bsrli_si128(_mm_loadu_si64(ptr), 8), + 9 => { + let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + _mm_insert_epi8(val, i32::from(ptr.add(8).cast::().read_unaligned()), 8) + } + 10 => { + let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4) + } + 11 => { + let mut val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + val = + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4); + _mm_insert_epi8( + val, + i32::from(ptr.add(10).cast::().read_unaligned()), + 10, + ) + } + 12 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ), + 13 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi8( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 12, + ) + } + 14 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ) + } + 15 => { + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + val = _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ); + _mm_insert_epi8( + val, + i32::from(ptr.add(14).cast::().read_unaligned()), + 14, + ) + } + _ => Self::splat0().0, // _ => res = Self::load_partial_copy(ptr, len), + }) + } + + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { let mut tmpbuf = [0_u8; 16]; crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( ptr, @@ -251,6 +395,37 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::(), _MM_HINT_T0); } +#[cfg(test)] +mod test { + #[cfg(not(features = "std"))] + extern crate std; + + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + if !std::is_x86_feature_detected!("sse4.2") { + return; + } + + let arr = [1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + unsafe { + for len in 0..16 { + let loaded_arr: [u8; 16] = + core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + println!("{:?}", loaded_arr); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = false; const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true; #[allow(unused_imports)] From 2333e0ec19eb78f5ef6c0a38795c7a5c1dbe5d8b Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 3 Jun 2021 09:40:09 +0200 Subject: [PATCH 30/69] cleanup --- src/implementation/x86/sse42.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index e60deb3d..1ba07b59 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -241,7 +241,7 @@ impl SimdU8Value { 14, ) } - _ => Self::splat0().0, // _ => res = Self::load_partial_copy(ptr, len), + _ => Self::splat0().0, }) } From 54e774ebb30281885c1b60fd2c2dce168f4745a5 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 3 Jun 2021 09:40:34 +0200 Subject: [PATCH 31/69] testbed for partial load --- src/implementation/algorithm.rs | 74 +++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 7e91d8c7..75b73d98 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -619,6 +619,13 @@ macro_rules! simd_input_128_bit { #[inline] #[allow(clippy::cast_ptr_alignment)] unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { if len < 16 { Self { vals: [ @@ -658,6 +665,50 @@ macro_rules! simd_input_128_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 16 * 16), len % 16); + if len < 16 { + Self { + vals: [ + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + partial, + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + partial, + ], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { @@ -694,6 +745,13 @@ macro_rules! simd_input_256_bit { #[inline] #[allow(clippy::cast_ptr_alignment)] unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { if len < 32 { Self { vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()], @@ -708,6 +766,22 @@ macro_rules! simd_input_256_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 32 * 32), len % 32); + if len < 32 { + Self { + vals: [partial, SimdU8Value::splat0()], + } + } else { + Self { + vals: [SimdU8Value::load_from(ptr), partial], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { From 1969057d496d78a067c053b124264a96e20b8d6c Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 3 Jun 2021 09:46:48 +0200 Subject: [PATCH 32/69] AVX2 in-code config var --- src/implementation/x86/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index dfa41ddd..b960f96d 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -6,6 +6,7 @@ pub(crate) mod sse42; // validate_utf8_basic() std: implementation auto-selection +const ENABLE_AVX2: bool = true; const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; const SMALL_STRING_LIMIT: usize = 9; @@ -38,7 +39,7 @@ pub(crate) unsafe fn validate_utf8_basic( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_basic @@ -134,7 +135,7 @@ pub(crate) unsafe fn validate_utf8_compat( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_compat From 7e346a8831183dd28d905cf14a8f9ea29e4defc8 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 3 Jun 2021 12:51:50 +0200 Subject: [PATCH 33/69] Faster than std impl on Apple Silicon --- src/implementation/aarch64/mod.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/implementation/aarch64/mod.rs b/src/implementation/aarch64/mod.rs index 6b1bfb2f..a483ecbc 100644 --- a/src/implementation/aarch64/mod.rs +++ b/src/implementation/aarch64/mod.rs @@ -5,10 +5,6 @@ pub(crate) mod neon; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_basic_fallback(input); - } - validate_utf8_basic_neon(input) } @@ -24,10 +20,6 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_compat_fallback(input); - } - validate_utf8_compat_neon(input) } From fcbb14c13c53a8b5c0882b0a2506414dceee431d Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 3 Jun 2021 12:57:29 +0200 Subject: [PATCH 34/69] comment wording --- src/implementation/algorithm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 75b73d98..79ebb393 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -214,7 +214,7 @@ macro_rules! algorithm_simd { let orig_len = len; let mut len = len; - // necessary, otherwise the compiler needlessly unrolls the loop, + // necessary, otherwise the compiler excessively unrolls the loop, // the function becomes to big and is no longer inlined for SSE 4.2 if PREVENT_REMAINDER_LOOP_UNROLLING { assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); @@ -254,7 +254,7 @@ macro_rules! algorithm_simd { unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { const SIMD_SIZE: usize = core::mem::size_of::(); - // prevent loop unrolling which can cause the function to be too big for inlining + // prevent excessive loop unrolling which can cause the function to be too big for inlining if PREVENT_REMAINDER_LOOP_UNROLLING { assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); } From 74ecc4e3e685f563ede476e374c57f7ed241c372 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 17 Jun 2021 12:11:37 +0200 Subject: [PATCH 35/69] Update algorithm.rs Help the compiler to assert that partial_len < 16 --- src/implementation/algorithm.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 75b73d98..b1ddbde4 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -626,10 +626,11 @@ macro_rules! simd_input_128_bit { #[inline] #[allow(clippy::cast_ptr_alignment)] unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { + let partial_len = len % 16; if len < 16 { Self { vals: [ - SimdU8Value::load_partial(ptr, len), + SimdU8Value::load_partial(ptr, partial_len), SimdU8Value::splat0(), SimdU8Value::splat0(), SimdU8Value::splat0(), @@ -639,7 +640,7 @@ macro_rules! simd_input_128_bit { Self { vals: [ SimdU8Value::load_from(ptr), - SimdU8Value::load_partial(ptr.add(16), len - 16), + SimdU8Value::load_partial(ptr.add(16), partial_len), SimdU8Value::splat0(), SimdU8Value::splat0(), ], @@ -649,7 +650,7 @@ macro_rules! simd_input_128_bit { vals: [ SimdU8Value::load_from(ptr), SimdU8Value::load_from(ptr.add(16)), - SimdU8Value::load_partial(ptr.add(32), len - 32), + SimdU8Value::load_partial(ptr.add(32), partial_len), SimdU8Value::splat0(), ], } @@ -659,7 +660,7 @@ macro_rules! simd_input_128_bit { SimdU8Value::load_from(ptr), SimdU8Value::load_from(ptr.add(16)), SimdU8Value::load_from(ptr.add(32)), - SimdU8Value::load_partial(ptr.add(48), len - 48), + SimdU8Value::load_partial(ptr.add(48), partial_len), ], } } From 7df6fe7f9442d3e49873ab11ef381a2d3191a811 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 17 Jun 2021 13:31:29 +0200 Subject: [PATCH 36/69] Update mod.rs fix x86 build --- src/implementation/x86/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index b960f96d..c175e542 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -54,7 +54,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -77,7 +77,7 @@ unsafe fn validate_utf8_basic_avx2( pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -150,7 +150,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -173,7 +173,7 @@ unsafe fn validate_utf8_compat_avx2( pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if DELEGATE_SMALL_TO_STD && input.len() < SMALL_STRING_LIMIT { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } From 1458d7c999a3e269074acf82789ca1cc35e630ed Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 17 Jun 2021 15:45:07 +0200 Subject: [PATCH 37/69] Update mod.rs clippy --- src/implementation/x86/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index c175e542..65734d30 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -6,7 +6,9 @@ pub(crate) mod sse42; // validate_utf8_basic() std: implementation auto-selection +#[allow(dead_code)] const ENABLE_AVX2: bool = true; + const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; const SMALL_STRING_LIMIT: usize = 9; From 439dc90b3ee702e015215aa41c3a4cf108f3d20d Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 17 Jun 2021 15:59:58 +0200 Subject: [PATCH 38/69] Update mod.rs clippy --- src/implementation/x86/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 65734d30..09fb53b7 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -9,7 +9,10 @@ pub(crate) mod sse42; #[allow(dead_code)] const ENABLE_AVX2: bool = true; +#[allow(dead_code)] const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; + +#[allow(dead_code)] const SMALL_STRING_LIMIT: usize = 9; #[cfg(all(feature = "std", not(target_feature = "avx2")))] From d650782270af13aceb054ec4218b4b96e3d429bd Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 06:56:16 +0200 Subject: [PATCH 39/69] Rust impl for _mm_loadu_si64 intrinsic was wrong, see https://github.com/rust-lang/stdarch/issues/1166 has landed in stable --- src/implementation/x86/sse42.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index 1ba07b59..34774ba2 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -23,6 +23,8 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m128i>; +// _mm_loadu_si64 + impl SimdU8Value { #[target_feature(enable = "sse4.2")] #[inline] @@ -172,17 +174,17 @@ impl SimdU8Value { ); _mm_insert_epi8(val, i32::from(ptr.add(6).cast::().read_unaligned()), 6) } - 8 => _mm_bsrli_si128(_mm_loadu_si64(ptr), 8), + 8 => _mm_loadu_si64(ptr), 9 => { - let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + let val = _mm_loadu_si64(ptr); _mm_insert_epi8(val, i32::from(ptr.add(8).cast::().read_unaligned()), 8) } 10 => { - let val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + let val = _mm_loadu_si64(ptr); _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4) } 11 => { - let mut val = _mm_bsrli_si128(_mm_loadu_si64(ptr), 8); + let mut val = _mm_loadu_si64(ptr); val = _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4); _mm_insert_epi8( From 25b4272efa8663bd91a8caf0bb40b66d61707d0c Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 07:02:10 +0200 Subject: [PATCH 40/69] Rust 1.38.0 compat: _mm_loadu_si64() not available -> replace. Asm is the same. --- src/implementation/x86/sse42.rs | 40 ++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index 34774ba2..c76c67a3 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -5,16 +5,16 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, - _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128, - _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, - _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, + _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, + _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, - _mm_insert_epi8, _mm_loadu_si128, _mm_loadu_si64, _mm_movemask_epi8, _mm_or_si128, - _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, - _mm_shuffle_epi8, _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, + _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, + _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -174,17 +174,37 @@ impl SimdU8Value { ); _mm_insert_epi8(val, i32::from(ptr.add(6).cast::().read_unaligned()), 6) } - 8 => _mm_loadu_si64(ptr), + 8 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ), 9 => { - let val = _mm_loadu_si64(ptr); + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); _mm_insert_epi8(val, i32::from(ptr.add(8).cast::().read_unaligned()), 8) } 10 => { - let val = _mm_loadu_si64(ptr); + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4) } 11 => { - let mut val = _mm_loadu_si64(ptr); + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); val = _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4); _mm_insert_epi8( From 8f5b7584cdfc91c29a6f1f55962cac6871e9f301 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 07:04:50 +0200 Subject: [PATCH 41/69] clippy --- src/implementation/x86/sse42.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index c76c67a3..f8fa4be8 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -4,17 +4,17 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, - _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, - _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_bsrli_si128, _mm_cmpgt_epi8, _mm_insert_epi16, - _mm_insert_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, - _mm_setr_epi16, _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; From d1753c2fc2d4909f7f6ac1da128aa88af455f513 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 07:11:20 +0200 Subject: [PATCH 42/69] remove extra println!() --- src/implementation/x86/sse42.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index f8fa4be8..cdc4b1b4 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -436,7 +436,6 @@ mod test { for len in 0..16 { let loaded_arr: [u8; 16] = core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); - println!("{:?}", loaded_arr); for i in 0..len { assert_eq!(arr[i], loaded_arr[i]); } From 1b2d9aa73800ea79c988f700c2ba5fb6d57837ac Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 07:39:29 +0200 Subject: [PATCH 43/69] remove stray comment --- src/implementation/x86/sse42.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index cdc4b1b4..7f39ef49 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -23,8 +23,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m128i>; -// _mm_loadu_si64 - impl SimdU8Value { #[target_feature(enable = "sse4.2")] #[inline] From 150103b5250de0a94cd3cbc176539e3737be7b76 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 08:52:38 +0200 Subject: [PATCH 44/69] loop unrolling prevention no longer needed --- src/implementation/aarch64/neon.rs | 1 - src/implementation/algorithm.rs | 9 --------- src/implementation/x86/avx2.rs | 1 - src/implementation/x86/sse42.rs | 1 - 4 files changed, 12 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 325386a0..17966445 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -404,7 +404,6 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; -const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("not_used"); diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index da34e783..44125128 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -214,11 +214,6 @@ macro_rules! algorithm_simd { let orig_len = len; let mut len = len; - // necessary, otherwise the compiler excessively unrolls the loop, - // the function becomes to big and is no longer inlined for SSE 4.2 - if PREVENT_REMAINDER_LOOP_UNROLLING { - assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); - } while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); @@ -254,10 +249,6 @@ macro_rules! algorithm_simd { unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { const SIMD_SIZE: usize = core::mem::size_of::(); - // prevent excessive loop unrolling which can cause the function to be too big for inlining - if PREVENT_REMAINDER_LOOP_UNROLLING { - assert!(len < crate::implementation::helpers::SIMD_CHUNK_SIZE); - } while len >= SIMD_SIZE { let simd_val = SimdU8Value::load_from(input); input = input.add(SIMD_SIZE); diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index b42e230c..f20ea798 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -350,7 +350,6 @@ mod test { } const PREFETCH: bool = true; -const PREVENT_REMAINDER_LOOP_UNROLLING: bool = false; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; simd_input_256_bit!("avx2"); diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index 7f39ef49..af6de1db 100644 --- a/src/implementation/x86/sse42.rs +++ b/src/implementation/x86/sse42.rs @@ -446,7 +446,6 @@ mod test { } const PREFETCH: bool = false; -const PREVENT_REMAINDER_LOOP_UNROLLING: bool = true; #[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("sse4.2"); From 44cd43c42f305525127e681718e1e3e6e571c626 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 12:45:01 +0200 Subject: [PATCH 45/69] benchmark for small inputs of a random length in ranges --- bench/benches/small_compat.rs | 3 ++ bench/benches/small_std.rs | 3 ++ bench/src/lib.rs | 92 +++++++++++++++++++++++++++++++++++ bench/src/macros.rs | 28 ++++++++++- 4 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 bench/benches/small_compat.rs create mode 100644 bench/benches/small_std.rs diff --git a/bench/benches/small_compat.rs b/bench/benches/small_compat.rs new file mode 100644 index 00000000..0ae2af45 --- /dev/null +++ b/bench/benches/small_compat.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Compat); diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs new file mode 100644 index 00000000..5a6bacc7 --- /dev/null +++ b/bench/benches/small_std.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_throughput_benchmark; + +define_small_benchmark!(BenchFn::Std); diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 9aade97b..2aad990c 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -63,6 +63,37 @@ pub fn criterion_benchmark(c: &mut Criterion, bench_fn: Bench bench_late_error(c, bench_fn); } +pub fn criterion_benchmark_small(c: &mut Criterion, bench_fn: BenchFn) { + let core_ids = core_affinity::get_core_ids().unwrap(); + core_affinity::set_for_current(*core_ids.get(2).unwrap_or(&core_ids[0])); + + bench_small( + c, + "1-latin", + &scale_to_one_mib(include_bytes!("../data/Latin-Lipsum.txt")), + bench_fn, + ); + + bench_small( + c, + "2-cyrillic", + &scale_to_one_mib(include_bytes!("../data/Russian-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "3-chinese", + &scale_to_one_mib(include_bytes!("../data/Chinese-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "4-emoji", + &scale_to_one_mib(include_bytes!("../data/Emoji-Lipsum.txt")), + bench_fn, + ); +} + fn bench_empty(c: &mut Criterion, bench_fn: BenchFn) { let mut group = c.benchmark_group("0-empty"); bench_input(&mut group, b"", false, true, bench_fn); @@ -147,6 +178,67 @@ fn bench(c: &mut Criterion, name: &str, bytes: &[u8], bench_f group.finish(); } +fn bench_small(c: &mut Criterion, name: &str, bytes: &[u8], bench_fn: BenchFn) { + let mut group = c.benchmark_group(name); + bench_range(&mut group, bytes, 0, 16, bench_fn); + bench_range(&mut group, bytes, 16, 32, bench_fn); + bench_range(&mut group, bytes, 32, 64, bench_fn); + bench_range(&mut group, bytes, 65, 127, bench_fn); + group.finish(); +} + +fn gen_valid_in_range(bytes: &[u8], lower_limit: usize, upper_limit: usize) -> usize { + use rand::Rng; + let mut rng = rand::thread_rng(); + loop { + let x = rng.gen_range(lower_limit..upper_limit); + if std_from_utf8(&bytes[0..x]).is_ok() { + return x; + } + } +} + +fn bench_range( + group: &mut BenchmarkGroup, + bytes: &[u8], + lower_limit: usize, + upper_limit: usize, + bench_fn: BenchFn, +) { + match bench_fn { + BenchFn::Basic => { + group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + b.iter_batched( + || gen_valid_in_range(bytes, lower_limit, upper_limit), + |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Compat => { + group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + b.iter_batched( + || gen_valid_in_range(bytes, lower_limit, upper_limit), + |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Std => { + group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + b.iter_batched( + || gen_valid_in_range(bytes, lower_limit, upper_limit), + |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + _ => { + unimplemented!(); + } + } +} + #[inline(never)] fn basic_from_utf8_no_inline(v: &[u8]) -> bool { basic_from_utf8(v).is_ok() diff --git a/bench/src/macros.rs b/bench/src/macros.rs index 94421b41..745432eb 100644 --- a/bench/src/macros.rs +++ b/bench/src/macros.rs @@ -8,14 +8,38 @@ macro_rules! define_throughput_benchmark { use simdutf8_bench::*; - fn benchmark_compat(c: &mut Criterion) { + fn benchmark_throughput(c: &mut Criterion) { criterion_benchmark(c, $bench_fn); } criterion_group!( name = benches; config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); - targets = benchmark_compat + targets = benchmark_throughput + ); + + criterion_main!(benches); + }; +} + +#[macro_export] +macro_rules! define_small_benchmark { + ($bench_fn:expr) => { + use std::time::Duration; + + use criterion::measurement::Measurement; + use criterion::{criterion_group, criterion_main, Criterion}; + + use simdutf8_bench::*; + + fn benchmark_small(c: &mut Criterion) { + criterion_benchmark_small(c, $bench_fn); + } + + criterion_group!( + name = benches; + config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); + targets = benchmark_small ); criterion_main!(benches); From 97794b37ad48b9268c34656cf623a03bc844ba44 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 13:00:49 +0200 Subject: [PATCH 46/69] aarch64 partial load asm impl. --- src/implementation/aarch64/neon.rs | 250 ++++++++++++++++++++++++++++- src/lib.rs | 4 + 2 files changed, 253 insertions(+), 1 deletion(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 325386a0..0d604168 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -9,6 +9,247 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; // aarch64 SIMD primitives +#[inline(never)] +/// C ABI spec is necessary so that the loaded value is returned in a register +unsafe extern "C" fn load_partial_assembly_opt_call( + mut ptr: *const u8, + len: usize, +) -> core::arch::aarch64::uint8x16_t { + let res: core::arch::aarch64::uint8x16_t; + unsafe { + asm!( + "movi.2d v0, #0000000000000000", + "cmp {len}, 15", + "b.hi 99f", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "ret", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 2 + "ld1.h {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 3 + "ld1.h {{ v0 }}[0], [{ptr}], #2", + "ld1.b {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 4 + "ld1.s {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 5 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.b {{ v0 }}[4], [{ptr}]", + "ret", + "nop", + + // 6 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.h {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 7 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.h {{ v0 }}[2], [{ptr}], #2", + "ld1.b {{ v0 }}[6], [{ptr}]", + "ret", + + // 8 + "ld1.d {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 9 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.b {{ v0 }}[8], [{ptr}]", + "ret", + "nop", + + // 10 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.h {{ v0 }}[4], [{ptr}]", + "ret", + "nop", + + // 11 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.h {{ v0 }}[4], [{ptr}], #2", + "ld1.b {{ v0 }}[10], [{ptr}]", + "ret", + + // 12 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 13 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.b {{ v0 }}[12], [{ptr}]", + "ret", + + // 14 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.h {{ v0 }}[6], [{ptr}]", + "ret", + + // 15 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.h {{ v0 }}[6], [{ptr}], #2", + "ld1.b {{ v0 }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + lateout("v0") res, + options(pure, readonly, nostack) + ); + }; + res +} + +#[inline(always)] +fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { + assert!(len < 16); + let res: core::arch::aarch64::uint8x16_t; + unsafe { + asm!( + "movi.2d {res:v}, #0000000000000000", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "b 99f", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 2 + "ld1.h {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 3 + "ld1.h {{ {res:v} }}[0], [{ptr}], #2", + "ld1.b {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 4 + "ld1.s {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 5 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.b {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 6 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 7 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}], #2", + "ld1.b {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 8 + "ld1.d {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 9 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.b {{ {res:v} }}[8], [{ptr}]", + "b 99f", + "nop", + + // 10 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 11 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}], #2", + "ld1.b {{ {res:v} }}[10], [{ptr}]", + "b 99f", + + // 12 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 13 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.b {{ {res:v} }}[12], [{ptr}]", + "b 99f", + + // 14 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 15 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}], #2", + "ld1.b {{ {res:v} }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + res = lateout(vreg) res, + options(pure, readonly, nostack) + ); + }; + res +} + type SimdU8Value = crate::implementation::helpers::SimdU8Value; impl SimdU8Value { @@ -102,6 +343,13 @@ impl SimdU8Value { #[inline] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + SimdU8Value::from(load_partial_assembly(ptr, len)) + // SimdU8Value::from(load_partial_assembly_opt_call(ptr, len)) + // SimdU8Value::from(Self::load_partial_imp(ptr, len)) + } + + #[inline(always)] + unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t { let mut res = Self::splat0(); match len { 0 => {} @@ -268,7 +516,7 @@ impl SimdU8Value { debug_assert!(false); } } - res + res.0 } #[inline] diff --git a/src/lib.rs b/src/lib.rs index 51057298..076011eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,10 @@ all(feature = "aarch64_neon", target_arch = "aarch64"), feature(stdsimd) )] +#![cfg_attr( + all(feature = "aarch64_neon", target_arch = "aarch64"), + feature(asm) +)] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved. From 138ea37740261ec0468a194e1cc7f5147ed6e7fc Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 13:01:29 +0200 Subject: [PATCH 47/69] Update Cargo.toml update benchmark Cargo.toml --- bench/Cargo.toml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 24c7b5c7..6a502aab 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,6 +17,7 @@ core_affinity = "0.5" criterion = "0.3" simdutf8 = { version = "*", path = "..", features = ["aarch64_neon"] } simdjson-utf8 = { version = "*", path = "simdjson-utf8", optional = true } +rand = "0.8" [[bench]] name = "throughput_basic" @@ -37,4 +38,12 @@ harness = false [[bench]] name = "throughput_simdjson" harness = false -required-features = ["simdjson"] \ No newline at end of file +required-features = ["simdjson"] + +[[bench]] +name = "small_compat" +harness = false + +[[bench]] +name = "small_std" +harness = false \ No newline at end of file From b6304c2698b71bd5ee61777e384fb0d42ead92e5 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 13:12:05 +0200 Subject: [PATCH 48/69] fix/allow lint warnings --- src/implementation/aarch64/neon.rs | 227 +++++++++++++++-------------- 1 file changed, 114 insertions(+), 113 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index a4ab319f..3158b572 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -10,129 +10,130 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; // aarch64 SIMD primitives #[inline(never)] +#[allow(unused_assignments)] +#[allow(improper_ctypes_definitions)] /// C ABI spec is necessary so that the loaded value is returned in a register unsafe extern "C" fn load_partial_assembly_opt_call( mut ptr: *const u8, len: usize, ) -> core::arch::aarch64::uint8x16_t { let res: core::arch::aarch64::uint8x16_t; - unsafe { - asm!( - "movi.2d v0, #0000000000000000", - "cmp {len}, 15", - "b.hi 99f", - "adr {scratch}, #12", - "adds {scratch}, {scratch}, {len}, lsl #4", - "br {scratch}", - - // 0 - "ret", - "nop", - "nop", - "nop", - - // 1 - "ld1.b {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 2 - "ld1.h {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 3 - "ld1.h {{ v0 }}[0], [{ptr}], #2", - "ld1.b {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 4 - "ld1.s {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 5 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.b {{ v0 }}[4], [{ptr}]", - "ret", - "nop", - - // 6 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.h {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 7 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.h {{ v0 }}[2], [{ptr}], #2", - "ld1.b {{ v0 }}[6], [{ptr}]", - "ret", - - // 8 - "ld1.d {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 9 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.b {{ v0 }}[8], [{ptr}]", - "ret", - "nop", - - // 10 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.h {{ v0 }}[4], [{ptr}]", - "ret", - "nop", - - // 11 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.h {{ v0 }}[4], [{ptr}], #2", - "ld1.b {{ v0 }}[10], [{ptr}]", - "ret", - - // 12 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 13 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.b {{ v0 }}[12], [{ptr}]", - "ret", - - // 14 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.h {{ v0 }}[6], [{ptr}]", - "ret", - - // 15 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.h {{ v0 }}[6], [{ptr}], #2", - "ld1.b {{ v0 }}[14], [{ptr}]", - - "99:", - ptr = inout(reg) ptr, - len = in(reg) len, - scratch = out(reg) _, - lateout("v0") res, - options(pure, readonly, nostack) - ); - }; + asm!( + "movi.2d v0, #0000000000000000", + "cmp {len}, 15", + "b.hi 99f", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "ret", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 2 + "ld1.h {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 3 + "ld1.h {{ v0 }}[0], [{ptr}], #2", + "ld1.b {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 4 + "ld1.s {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 5 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.b {{ v0 }}[4], [{ptr}]", + "ret", + "nop", + + // 6 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.h {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 7 + "ld1.s {{ v0 }}[0], [{ptr}], #4", + "ld1.h {{ v0 }}[2], [{ptr}], #2", + "ld1.b {{ v0 }}[6], [{ptr}]", + "ret", + + // 8 + "ld1.d {{ v0 }}[0], [{ptr}]", + "ret", + "nop", + "nop", + + // 9 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.b {{ v0 }}[8], [{ptr}]", + "ret", + "nop", + + // 10 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.h {{ v0 }}[4], [{ptr}]", + "ret", + "nop", + + // 11 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.h {{ v0 }}[4], [{ptr}], #2", + "ld1.b {{ v0 }}[10], [{ptr}]", + "ret", + + // 12 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}]", + "ret", + "nop", + + // 13 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.b {{ v0 }}[12], [{ptr}]", + "ret", + + // 14 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.h {{ v0 }}[6], [{ptr}]", + "ret", + + // 15 + "ld1.d {{ v0 }}[0], [{ptr}], #8", + "ld1.s {{ v0 }}[2], [{ptr}], #4", + "ld1.h {{ v0 }}[6], [{ptr}], #2", + "ld1.b {{ v0 }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + lateout("v0") res, + options(pure, readonly, nostack) + ); res } #[inline(always)] +#[allow(unused_assignments)] fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { assert!(len < 16); let res: core::arch::aarch64::uint8x16_t; From 7fea54649bcf0913b4f42434ffd7c3179b32b773 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 13:12:28 +0200 Subject: [PATCH 49/69] example is only for x86-64 --- examples/streaming.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/streaming.rs b/examples/streaming.rs index 5db0764b..89725392 100644 --- a/examples/streaming.rs +++ b/examples/streaming.rs @@ -1,11 +1,9 @@ -#[cfg(feature = "public_imp")] -use simdutf8::basic::imp::Utf8Validator; - #[allow(unused_imports)] use std::io::{stdin, Read, Result}; -#[cfg(feature = "public_imp")] +#[cfg(all(feature = "public_imp", target_arch = "x86_64"))] fn main() -> Result<()> { + use simdutf8::basic::imp::Utf8Validator; unsafe { if !std::is_x86_feature_detected!("avx2") { panic!("This example only works with CPUs supporting AVX 2"); @@ -32,5 +30,5 @@ fn main() -> Result<()> { } /// Dummy main. This example requires the crate feature `public_imp`. -#[cfg(not(feature = "public_imp"))] +#[cfg(not(all(feature = "public_imp", target_arch = "x86_64")))] fn main() {} From b995b7f83c745792a1d1691fdd060e54e9cea51e Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 13:28:14 +0200 Subject: [PATCH 50/69] fix small std benchmark --- bench/benches/small_std.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs index 5a6bacc7..65c4ba8d 100644 --- a/bench/benches/small_std.rs +++ b/bench/benches/small_std.rs @@ -1,3 +1,3 @@ -use simdutf8_bench::define_throughput_benchmark; +use simdutf8_bench::define_small_benchmark; define_small_benchmark!(BenchFn::Std); From 114163647e623ef704aa65fb87efc845a5df08d8 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 18 Jun 2021 20:01:48 +0200 Subject: [PATCH 51/69] Update lib.rs fmt --- src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 076011eb..4c936424 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,10 +17,7 @@ all(feature = "aarch64_neon", target_arch = "aarch64"), feature(stdsimd) )] -#![cfg_attr( - all(feature = "aarch64_neon", target_arch = "aarch64"), - feature(asm) -)] +#![cfg_attr(all(feature = "aarch64_neon", target_arch = "aarch64"), feature(asm))] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved. From e1fd53cde48acd0e374ce90b2ad2ed3bb26b31fa Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 19 Jun 2021 14:08:27 +0200 Subject: [PATCH 52/69] use classic lengths for throughput benchmark again now that we have an extra benchmark for small inputs. --- bench/src/lib.rs | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 2aad990c..0e6c883a 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -145,24 +145,7 @@ fn get_valid_slice_of_len_or_more_aligned( fn bench(c: &mut Criterion, name: &str, bytes: &[u8], bench_fn: BenchFn) { let mut group = c.benchmark_group(name); - let mut sizes = HashSet::new(); - for i in 1..129 { - let alignment = Alignment { - boundary: 64, - offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case - }; - let (vec, offset) = get_valid_slice_of_len_or_more_aligned(bytes, i, alignment); - let slice = &vec[offset..]; - assert_eq!( - (slice.as_ptr() as usize) % alignment.boundary, - alignment.offset - ); - if !sizes.contains(&slice.len()) { - bench_input(&mut group, slice, true, true, bench_fn); - sizes.insert(slice.len()); - } - } - for i in [512, 4096, 65536, 131072].iter() { + for i in [1, 8, 64, 512, 4096, 65536, 131072].iter() { let alignment = Alignment { boundary: 64, offset: 8, // 8 is the default alignment on 64-bit, so this is what can be expected worst-case From b4ce94f0f89e6598ee3b8ab993e06e1d9c0b7bcb Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sat, 19 Jun 2021 14:30:05 +0200 Subject: [PATCH 53/69] add small_basic benchmark --- bench/Cargo.toml | 4 ++++ bench/benches/small_basic.rs | 3 +++ 2 files changed, 7 insertions(+) create mode 100644 bench/benches/small_basic.rs diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 6a502aab..2c1d14e5 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -40,6 +40,10 @@ name = "throughput_simdjson" harness = false required-features = ["simdjson"] +[[bench]] +name = "small_basic" +harness = false + [[bench]] name = "small_compat" harness = false diff --git a/bench/benches/small_basic.rs b/bench/benches/small_basic.rs new file mode 100644 index 00000000..5295b710 --- /dev/null +++ b/bench/benches/small_basic.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Basic); From 8babc9f280c7dbab571be30a74ce2d001ddec300 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Jun 2021 14:33:14 +0200 Subject: [PATCH 54/69] ARM64 CI --- .github/workflows/ci.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1f516f75..cf4355e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,24 @@ jobs: env: RUSTFLAGS: ${{ matrix.rustflags }} +jobs: + test: + runs-on: ARM64 + strategy: + matrix: + features: ["", "--features std" "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"] + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + profile: minimal + override: true + - name: Run tests + run: cargo test --no-default-features ${{ matrix.features }} --all-targets --verbose + env: + RUSTFLAGS: ${{ matrix.rustflags }} + test-inlining-x86: runs-on: ubuntu-latest strategy: From 4021faafced69e49953998515a2499179f9773ad Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Jun 2021 14:42:05 +0200 Subject: [PATCH 55/69] fix ci --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cf4355e4..7c5eff65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,7 @@ jobs: env: RUSTFLAGS: ${{ matrix.rustflags }} -jobs: - test: + test-arm64: runs-on: ARM64 strategy: matrix: From 80b0bc8d74c1de048bf323566ca56d607ed0e049 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Jun 2021 14:46:13 +0200 Subject: [PATCH 56/69] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7c5eff65..752145c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,7 +43,7 @@ jobs: runs-on: ARM64 strategy: matrix: - features: ["", "--features std" "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"] + features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"] steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 From d29695dc76401b399162a096a33d7b14f58a7273 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Jun 2021 14:48:43 +0200 Subject: [PATCH 57/69] fix ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 752145c9..644431d1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,7 +43,7 @@ jobs: runs-on: ARM64 strategy: matrix: - features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,publi_imp", "--features aarch64_neon,std,public_imp"] + features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,public_imp", "--features aarch64_neon,std,public_imp"] steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 From b2c033982180a449c0b57aa18548601170ab0e3b Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Jun 2021 15:54:48 +0200 Subject: [PATCH 58/69] more ARM64 ci --- .github/workflows/ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 644431d1..19c111cc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,12 +131,12 @@ jobs: env: RUSTDOCFLAGS: --cfg docsrs - cross-build-arm: + cross-build-arm32: runs-on: ubuntu-latest strategy: matrix: toolchain: ["1.38.0", stable, beta, nightly ] - target: [arm-unknown-linux-gnueabi, aarch64-unknown-linux-gnu] + target: [arm-unknown-linux-gnueabi] features: ["--features std", ""] include: - toolchain: nightly @@ -204,7 +204,10 @@ jobs: run: cargo fmt -- --check clippy_check: - runs-on: ubuntu-latest + runs-on: ${{ matrix.runner }} + strategy: + matrix: + os: [ubuntu-latest, ARM64] steps: - uses: actions/checkout@v1 - name: Update rustup From b1416f67c0f7a86f6fbdc10a1d284c25f999fc6b Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 25 Jun 2021 07:00:28 +0200 Subject: [PATCH 59/69] fix ci --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da380fbf..064bd146 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -192,7 +192,7 @@ jobs: runs-on: ${{ matrix.runner }} strategy: matrix: - os: [ubuntu-latest, ARM64] + runner: [ubuntu-latest, ARM64] steps: - uses: actions/checkout@v1 - uses: actions-rs/toolchain@v1 From 5753cad0ec035d80842c3386a16a688da1046d2d Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 25 Jun 2021 07:06:52 +0200 Subject: [PATCH 60/69] clippy --- src/implementation/aarch64/neon.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 3158b572..4190cc75 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -10,6 +10,7 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; // aarch64 SIMD primitives #[inline(never)] +#[allow(clippy::too_many_lines)] #[allow(unused_assignments)] #[allow(improper_ctypes_definitions)] /// C ABI spec is necessary so that the loaded value is returned in a register @@ -133,6 +134,8 @@ unsafe extern "C" fn load_partial_assembly_opt_call( } #[inline(always)] +#[allow(clippy::too_many_lines)] +#[allow(clippy::inline_always)] #[allow(unused_assignments)] fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { assert!(len < 16); @@ -344,12 +347,14 @@ impl SimdU8Value { #[inline] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { - SimdU8Value::from(load_partial_assembly(ptr, len)) - // SimdU8Value::from(load_partial_assembly_opt_call(ptr, len)) - // SimdU8Value::from(Self::load_partial_imp(ptr, len)) + Self::from(load_partial_assembly(ptr, len)) + // Self::from(load_partial_assembly_opt_call(ptr, len)) + // Self::from(Self::load_partial_imp(ptr, len)) } #[inline(always)] + #[allow(clippy::inline_always)] + #[allow(clippy::too_many_lines)] unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t { let mut res = Self::splat0(); match len { From b86d00fffffad764e6578628391b26d09a76a7d1 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 30 Jun 2021 08:21:30 +0200 Subject: [PATCH 61/69] add one more small benchmark --- bench/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 0e6c883a..3c08b325 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -167,6 +167,7 @@ fn bench_small(c: &mut Criterion, name: &str, bytes: &[u8], b bench_range(&mut group, bytes, 16, 32, bench_fn); bench_range(&mut group, bytes, 32, 64, bench_fn); bench_range(&mut group, bytes, 65, 127, bench_fn); + bench_range(&mut group, bytes, 129, 255, bench_fn); group.finish(); } From 8eb9ebf950ed47e45d2e4b4e57cb300df9846c29 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 30 Jun 2021 08:54:39 +0200 Subject: [PATCH 62/69] more consistent small basic benchmarks --- bench/src/lib.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 3c08b325..61021240 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -2,7 +2,6 @@ use criterion::{measurement::Measurement, BenchmarkGroup, BenchmarkId, Criterion use simdutf8::basic::from_utf8 as basic_from_utf8; use simdutf8::compat::from_utf8 as compat_from_utf8; -use std::collections::HashSet; use std::str::from_utf8 as std_from_utf8; #[cfg(feature = "simdjson")] @@ -166,8 +165,8 @@ fn bench_small(c: &mut Criterion, name: &str, bytes: &[u8], b bench_range(&mut group, bytes, 0, 16, bench_fn); bench_range(&mut group, bytes, 16, 32, bench_fn); bench_range(&mut group, bytes, 32, 64, bench_fn); - bench_range(&mut group, bytes, 65, 127, bench_fn); - bench_range(&mut group, bytes, 129, 255, bench_fn); + bench_range(&mut group, bytes, 64, 128, bench_fn); + bench_range(&mut group, bytes, 128, 256, bench_fn); group.finish(); } @@ -189,29 +188,31 @@ fn bench_range( upper_limit: usize, bench_fn: BenchFn, ) { + let bench_id = format!("rand_{:03}-{:03}", lower_limit, upper_limit); + let gen_fn = || gen_valid_in_range(bytes, lower_limit, upper_limit); match bench_fn { BenchFn::Basic => { - group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + group.bench_function(bench_id, |b| { b.iter_batched( - || gen_valid_in_range(bytes, lower_limit, upper_limit), + gen_fn, |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()), criterion::BatchSize::SmallInput, ) }); } BenchFn::Compat => { - group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + group.bench_function(bench_id, |b| { b.iter_batched( - || gen_valid_in_range(bytes, lower_limit, upper_limit), + gen_fn, |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()), criterion::BatchSize::SmallInput, ) }); } BenchFn::Std => { - group.bench_function(format!("rand_{}-{}", lower_limit, upper_limit), |b| { + group.bench_function(bench_id, |b| { b.iter_batched( - || gen_valid_in_range(bytes, lower_limit, upper_limit), + gen_fn, |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()), criterion::BatchSize::SmallInput, ) From 47588bc265039c03833a1ede47c417e7acbab928 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Wed, 30 Jun 2021 10:51:28 +0200 Subject: [PATCH 63/69] rename fn --- src/implementation/aarch64/neon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 4190cc75..75e714a1 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -349,13 +349,13 @@ impl SimdU8Value { unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { Self::from(load_partial_assembly(ptr, len)) // Self::from(load_partial_assembly_opt_call(ptr, len)) - // Self::from(Self::load_partial_imp(ptr, len)) + // Self::from(Self::load_partial_intrinsics(ptr, len)) } #[inline(always)] #[allow(clippy::inline_always)] #[allow(clippy::too_many_lines)] - unsafe fn load_partial_imp(ptr: *const u8, len: usize) -> uint8x16_t { + unsafe fn load_partial_intrinsics(ptr: *const u8, len: usize) -> uint8x16_t { let mut res = Self::splat0(); match len { 0 => {} From 119b6225bb687c8e98d268eda9b7e0703670e5cb Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 6 Jul 2021 08:08:33 +0200 Subject: [PATCH 64/69] make cargo test --all-features work on non-x86 --- src/basic.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/basic.rs b/src/basic.rs index 79aea46d..6b24dfba 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -77,6 +77,11 @@ pub mod imp { /// use simdutf8::basic::imp::Utf8Validator; /// use std::io::{stdin, Read, Result}; /// + /// # #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + /// # fn main() { + /// # } + /// + /// # #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] /// fn main() -> Result<()> { /// unsafe { /// if !std::is_x86_feature_detected!("avx2") { From bb58d71cb47b68e43f892b6b2ae4778dea1e56f8 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 6 Jul 2021 08:09:37 +0200 Subject: [PATCH 65/69] remove aarch64 partial load assembly impl. non-inlined fn --- src/implementation/aarch64/neon.rs | 124 ----------------------------- 1 file changed, 124 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 75e714a1..cc277e6a 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -9,130 +9,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; // aarch64 SIMD primitives -#[inline(never)] -#[allow(clippy::too_many_lines)] -#[allow(unused_assignments)] -#[allow(improper_ctypes_definitions)] -/// C ABI spec is necessary so that the loaded value is returned in a register -unsafe extern "C" fn load_partial_assembly_opt_call( - mut ptr: *const u8, - len: usize, -) -> core::arch::aarch64::uint8x16_t { - let res: core::arch::aarch64::uint8x16_t; - asm!( - "movi.2d v0, #0000000000000000", - "cmp {len}, 15", - "b.hi 99f", - "adr {scratch}, #12", - "adds {scratch}, {scratch}, {len}, lsl #4", - "br {scratch}", - - // 0 - "ret", - "nop", - "nop", - "nop", - - // 1 - "ld1.b {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 2 - "ld1.h {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 3 - "ld1.h {{ v0 }}[0], [{ptr}], #2", - "ld1.b {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 4 - "ld1.s {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 5 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.b {{ v0 }}[4], [{ptr}]", - "ret", - "nop", - - // 6 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.h {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 7 - "ld1.s {{ v0 }}[0], [{ptr}], #4", - "ld1.h {{ v0 }}[2], [{ptr}], #2", - "ld1.b {{ v0 }}[6], [{ptr}]", - "ret", - - // 8 - "ld1.d {{ v0 }}[0], [{ptr}]", - "ret", - "nop", - "nop", - - // 9 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.b {{ v0 }}[8], [{ptr}]", - "ret", - "nop", - - // 10 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.h {{ v0 }}[4], [{ptr}]", - "ret", - "nop", - - // 11 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.h {{ v0 }}[4], [{ptr}], #2", - "ld1.b {{ v0 }}[10], [{ptr}]", - "ret", - - // 12 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}]", - "ret", - "nop", - - // 13 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.b {{ v0 }}[12], [{ptr}]", - "ret", - - // 14 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.h {{ v0 }}[6], [{ptr}]", - "ret", - - // 15 - "ld1.d {{ v0 }}[0], [{ptr}], #8", - "ld1.s {{ v0 }}[2], [{ptr}], #4", - "ld1.h {{ v0 }}[6], [{ptr}], #2", - "ld1.b {{ v0 }}[14], [{ptr}]", - - "99:", - ptr = inout(reg) ptr, - len = in(reg) len, - scratch = out(reg) _, - lateout("v0") res, - options(pure, readonly, nostack) - ); - res -} - #[inline(always)] #[allow(clippy::too_many_lines)] #[allow(clippy::inline_always)] From 45befdbcb7aa46ed686e17c7e5167e0aab4d97bb Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 6 Jul 2021 08:15:28 +0200 Subject: [PATCH 66/69] asm fn -> method --- src/implementation/aarch64/neon.rs | 245 ++++++++++++++--------------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index cc277e6a..1669d356 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -9,127 +9,6 @@ use crate::implementation::helpers::Utf8CheckAlgorithm; // aarch64 SIMD primitives -#[inline(always)] -#[allow(clippy::too_many_lines)] -#[allow(clippy::inline_always)] -#[allow(unused_assignments)] -fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { - assert!(len < 16); - let res: core::arch::aarch64::uint8x16_t; - unsafe { - asm!( - "movi.2d {res:v}, #0000000000000000", - "adr {scratch}, #12", - "adds {scratch}, {scratch}, {len}, lsl #4", - "br {scratch}", - - // 0 - "b 99f", - "nop", - "nop", - "nop", - - // 1 - "ld1.b {{ {res:v} }}[0], [{ptr}]", - "b 99f", - "nop", - "nop", - - // 2 - "ld1.h {{ {res:v} }}[0], [{ptr}]", - "b 99f", - "nop", - "nop", - - // 3 - "ld1.h {{ {res:v} }}[0], [{ptr}], #2", - "ld1.b {{ {res:v} }}[2], [{ptr}]", - "b 99f", - "nop", - - // 4 - "ld1.s {{ {res:v} }}[0], [{ptr}]", - "b 99f", - "nop", - "nop", - - // 5 - "ld1.s {{ {res:v} }}[0], [{ptr}], #4", - "ld1.b {{ {res:v} }}[4], [{ptr}]", - "b 99f", - "nop", - - // 6 - "ld1.s {{ {res:v} }}[0], [{ptr}], #4", - "ld1.h {{ {res:v} }}[2], [{ptr}]", - "b 99f", - "nop", - - // 7 - "ld1.s {{ {res:v} }}[0], [{ptr}], #4", - "ld1.h {{ {res:v} }}[2], [{ptr}], #2", - "ld1.b {{ {res:v} }}[6], [{ptr}]", - "b 99f", - - // 8 - "ld1.d {{ {res:v} }}[0], [{ptr}]", - "b 99f", - "nop", - "nop", - - // 9 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.b {{ {res:v} }}[8], [{ptr}]", - "b 99f", - "nop", - - // 10 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.h {{ {res:v} }}[4], [{ptr}]", - "b 99f", - "nop", - - // 11 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.h {{ {res:v} }}[4], [{ptr}], #2", - "ld1.b {{ {res:v} }}[10], [{ptr}]", - "b 99f", - - // 12 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.s {{ {res:v} }}[2], [{ptr}]", - "b 99f", - "nop", - - // 13 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.s {{ {res:v} }}[2], [{ptr}], #4", - "ld1.b {{ {res:v} }}[12], [{ptr}]", - "b 99f", - - // 14 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.s {{ {res:v} }}[2], [{ptr}], #4", - "ld1.h {{ {res:v} }}[6], [{ptr}]", - "b 99f", - - // 15 - "ld1.d {{ {res:v} }}[0], [{ptr}], #8", - "ld1.s {{ {res:v} }}[2], [{ptr}], #4", - "ld1.h {{ {res:v} }}[6], [{ptr}], #2", - "ld1.b {{ {res:v} }}[14], [{ptr}]", - - "99:", - ptr = inout(reg) ptr, - len = in(reg) len, - scratch = out(reg) _, - res = lateout(vreg) res, - options(pure, readonly, nostack) - ); - }; - res -} - type SimdU8Value = crate::implementation::helpers::SimdU8Value; impl SimdU8Value { @@ -223,8 +102,7 @@ impl SimdU8Value { #[inline] unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { - Self::from(load_partial_assembly(ptr, len)) - // Self::from(load_partial_assembly_opt_call(ptr, len)) + Self::from(Self::load_partial_assembly(ptr, len)) // Self::from(Self::load_partial_intrinsics(ptr, len)) } @@ -401,6 +279,127 @@ impl SimdU8Value { res.0 } + #[inline(always)] + #[allow(clippy::too_many_lines)] + #[allow(clippy::inline_always)] + #[allow(unused_assignments)] + fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { + assert!(len < 16); + let res: core::arch::aarch64::uint8x16_t; + unsafe { + asm!( + "movi.2d {res:v}, #0000000000000000", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "b 99f", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 2 + "ld1.h {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 3 + "ld1.h {{ {res:v} }}[0], [{ptr}], #2", + "ld1.b {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 4 + "ld1.s {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 5 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.b {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 6 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 7 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}], #2", + "ld1.b {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 8 + "ld1.d {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 9 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.b {{ {res:v} }}[8], [{ptr}]", + "b 99f", + "nop", + + // 10 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 11 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}], #2", + "ld1.b {{ {res:v} }}[10], [{ptr}]", + "b 99f", + + // 12 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 13 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.b {{ {res:v} }}[12], [{ptr}]", + "b 99f", + + // 14 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 15 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}], #2", + "ld1.b {{ {res:v} }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + res = lateout(vreg) res, + options(pure, readonly, nostack) + ); + }; + res + } + #[inline] #[allow(clippy::too_many_arguments)] unsafe fn lookup_16( From 37f9198b20f89e870ef0259be35cf48a786b9100 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 11 Jul 2021 09:54:00 +0200 Subject: [PATCH 67/69] simplify --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4c936424..91798806 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,9 +15,9 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr( all(feature = "aarch64_neon", target_arch = "aarch64"), - feature(stdsimd) + feature(stdsimd), + feature(asm) )] -#![cfg_attr(all(feature = "aarch64_neon", target_arch = "aarch64"), feature(asm))] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from //! [simdjson](https://github.com/simdjson/simdjson). Originally ported to Rust by the developers of [simd-json.rs](https://simd-json.rs), but now heavily improved. From d3399cd9713f95b874fb0cb2a6abc0c3d9e32589 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Sun, 15 Aug 2021 22:39:59 +0200 Subject: [PATCH 68/69] Trigger GitHub actions From 82c5fee295f9753d7cc90f23c8e1e8736bf2bbd7 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 24 Oct 2024 05:16:14 +0200 Subject: [PATCH 69/69] fixes for current nightly --- src/implementation/aarch64/neon.rs | 1 + src/implementation/algorithm.rs | 3 --- src/lib.rs | 7 +++---- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 1669d356..c1cf3790 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -6,6 +6,7 @@ use core::arch::aarch64::{ }; use crate::implementation::helpers::Utf8CheckAlgorithm; +use core::arch::asm; // aarch64 SIMD primitives diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 44125128..52b50b83 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -182,7 +182,6 @@ macro_rules! algorithm_simd { #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] #[allow(unconditional_panic)] // does not panic because len is checked - #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 @@ -208,7 +207,6 @@ macro_rules! algorithm_simd { #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] #[allow(unconditional_panic)] // does not panic because len is checked - #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) { const SIMD_SIZE: usize = core::mem::size_of::(); let orig_len = len; @@ -245,7 +243,6 @@ macro_rules! algorithm_simd { #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] #[allow(unconditional_panic)] // does not panic because len is checked - #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { const SIMD_SIZE: usize = core::mem::size_of::(); diff --git a/src/lib.rs b/src/lib.rs index 91798806..cee27bc8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ -#![deny(warnings)] +#![warn(warnings)] #![warn(unused_extern_crates)] -#![deny( +#![warn( clippy::all, clippy::unwrap_used, clippy::unnecessary_unwrap, @@ -15,8 +15,7 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr( all(feature = "aarch64_neon", target_arch = "aarch64"), - feature(stdsimd), - feature(asm) + feature(stdarch_aarch64_prefetch) )] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from