diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b63455ca..064bd146 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,23 @@ jobs: env: RUSTFLAGS: ${{ matrix.rustflags }} + test-arm64: + runs-on: ARM64 + strategy: + matrix: + features: ["", "--features std", "--features aarch64_neon,std", "--features aarch64_neon,std,public_imp"] + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + profile: minimal + override: true + - name: Run tests + run: cargo test --no-default-features ${{ matrix.features }} --all-targets --verbose + env: + RUSTFLAGS: ${{ matrix.rustflags }} + test-inlining-x86: runs-on: ubuntu-latest strategy: @@ -106,12 +123,12 @@ jobs: env: RUSTDOCFLAGS: --cfg docsrs - cross-build-arm: + cross-build-arm32: runs-on: ubuntu-latest strategy: matrix: toolchain: ["1.38.0", stable, beta, nightly ] - target: [arm-unknown-linux-gnueabi, aarch64-unknown-linux-gnu] + target: [arm-unknown-linux-gnueabi] features: ["--features std", ""] include: - toolchain: nightly @@ -172,7 +189,10 @@ jobs: run: cargo fmt -- --check clippy_check: - runs-on: ubuntu-latest + runs-on: ${{ matrix.runner }} + strategy: + matrix: + runner: [ubuntu-latest, ARM64] steps: - uses: actions/checkout@v1 - uses: actions-rs/toolchain@v1 diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 24c7b5c7..2c1d14e5 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -17,6 +17,7 @@ core_affinity = "0.5" criterion = "0.3" simdutf8 = { version = "*", path = "..", features = ["aarch64_neon"] } simdjson-utf8 = { version = "*", path = "simdjson-utf8", optional = true } +rand = "0.8" [[bench]] name = "throughput_basic" harness = false @@ -37,4 +38,16 @@ harness = false [[bench]] name = "throughput_simdjson" harness = false -required-features = ["simdjson"] \ No newline at end of file +required-features = ["simdjson"] + +[[bench]] +name = "small_basic" +harness = false + 
+[[bench]] +name = "small_compat" +harness = false + +[[bench]] +name = "small_std" +harness = false \ No newline at end of file diff --git a/bench/benches/small_basic.rs b/bench/benches/small_basic.rs new file mode 100644 index 00000000..5295b710 --- /dev/null +++ b/bench/benches/small_basic.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Basic); diff --git a/bench/benches/small_compat.rs b/bench/benches/small_compat.rs new file mode 100644 index 00000000..0ae2af45 --- /dev/null +++ b/bench/benches/small_compat.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Compat); diff --git a/bench/benches/small_std.rs b/bench/benches/small_std.rs new file mode 100644 index 00000000..65c4ba8d --- /dev/null +++ b/bench/benches/small_std.rs @@ -0,0 +1,3 @@ +use simdutf8_bench::define_small_benchmark; + +define_small_benchmark!(BenchFn::Std); diff --git a/bench/src/lib.rs b/bench/src/lib.rs index 218055b3..61021240 100644 --- a/bench/src/lib.rs +++ b/bench/src/lib.rs @@ -62,6 +62,37 @@ pub fn criterion_benchmark(c: &mut Criterion, bench_fn: Bench bench_late_error(c, bench_fn); } +pub fn criterion_benchmark_small(c: &mut Criterion, bench_fn: BenchFn) { + let core_ids = core_affinity::get_core_ids().unwrap(); + core_affinity::set_for_current(*core_ids.get(2).unwrap_or(&core_ids[0])); + + bench_small( + c, + "1-latin", + &scale_to_one_mib(include_bytes!("../data/Latin-Lipsum.txt")), + bench_fn, + ); + + bench_small( + c, + "2-cyrillic", + &scale_to_one_mib(include_bytes!("../data/Russian-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "3-chinese", + &scale_to_one_mib(include_bytes!("../data/Chinese-Lipsum.txt")), + bench_fn, + ); + bench_small( + c, + "4-emoji", + &scale_to_one_mib(include_bytes!("../data/Emoji-Lipsum.txt")), + bench_fn, + ); +} + fn bench_empty(c: &mut Criterion, bench_fn: BenchFn) { let mut group = c.benchmark_group("0-empty"); bench_input(&mut group, b"", 
false, true, bench_fn); @@ -129,6 +160,70 @@ fn bench(c: &mut Criterion, name: &str, bytes: &[u8], bench_f group.finish(); } +fn bench_small(c: &mut Criterion, name: &str, bytes: &[u8], bench_fn: BenchFn) { + let mut group = c.benchmark_group(name); + bench_range(&mut group, bytes, 0, 16, bench_fn); + bench_range(&mut group, bytes, 16, 32, bench_fn); + bench_range(&mut group, bytes, 32, 64, bench_fn); + bench_range(&mut group, bytes, 64, 128, bench_fn); + bench_range(&mut group, bytes, 128, 256, bench_fn); + group.finish(); +} + +fn gen_valid_in_range(bytes: &[u8], lower_limit: usize, upper_limit: usize) -> usize { + use rand::Rng; + let mut rng = rand::thread_rng(); + loop { + let x = rng.gen_range(lower_limit..upper_limit); + if std_from_utf8(&bytes[0..x]).is_ok() { + return x; + } + } +} + +fn bench_range( + group: &mut BenchmarkGroup, + bytes: &[u8], + lower_limit: usize, + upper_limit: usize, + bench_fn: BenchFn, +) { + let bench_id = format!("rand_{:03}-{:03}", lower_limit, upper_limit); + let gen_fn = || gen_valid_in_range(bytes, lower_limit, upper_limit); + match bench_fn { + BenchFn::Basic => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(basic_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Compat => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(compat_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + BenchFn::Std => { + group.bench_function(bench_id, |b| { + b.iter_batched( + gen_fn, + |x| assert!(std_from_utf8(&bytes[0..x]).is_ok()), + criterion::BatchSize::SmallInput, + ) + }); + } + _ => { + unimplemented!(); + } + } +} + #[inline(never)] fn basic_from_utf8_no_inline(v: &[u8]) -> bool { basic_from_utf8(v).is_ok() diff --git a/bench/src/macros.rs b/bench/src/macros.rs index 94421b41..745432eb 100644 --- a/bench/src/macros.rs +++ b/bench/src/macros.rs @@ -8,14 +8,38 @@ macro_rules! 
define_throughput_benchmark { use simdutf8_bench::*; - fn benchmark_compat(c: &mut Criterion) { + fn benchmark_throughput(c: &mut Criterion) { criterion_benchmark(c, $bench_fn); } criterion_group!( name = benches; config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); - targets = benchmark_compat + targets = benchmark_throughput + ); + + criterion_main!(benches); + }; +} + +#[macro_export] +macro_rules! define_small_benchmark { + ($bench_fn:expr) => { + use std::time::Duration; + + use criterion::measurement::Measurement; + use criterion::{criterion_group, criterion_main, Criterion}; + + use simdutf8_bench::*; + + fn benchmark_small(c: &mut Criterion) { + criterion_benchmark_small(c, $bench_fn); + } + + criterion_group!( + name = benches; + config = Criterion::default().measurement_time(Duration::from_secs(1)).warm_up_time(Duration::from_secs(1)).sample_size(300); + targets = benchmark_small ); criterion_main!(benches); diff --git a/examples/streaming.rs b/examples/streaming.rs index 5db0764b..89725392 100644 --- a/examples/streaming.rs +++ b/examples/streaming.rs @@ -1,11 +1,9 @@ -#[cfg(feature = "public_imp")] -use simdutf8::basic::imp::Utf8Validator; - #[allow(unused_imports)] use std::io::{stdin, Read, Result}; -#[cfg(feature = "public_imp")] +#[cfg(all(feature = "public_imp", target_arch = "x86_64"))] fn main() -> Result<()> { + use simdutf8::basic::imp::Utf8Validator; unsafe { if !std::is_x86_feature_detected!("avx2") { panic!("This example only works with CPUs supporting AVX 2"); @@ -32,5 +30,5 @@ fn main() -> Result<()> { } /// Dummy main. This example requires the crate feature `public_imp`. 
-#[cfg(not(feature = "public_imp"))] +#[cfg(not(all(feature = "public_imp", target_arch = "x86_64")))] fn main() {} diff --git a/src/basic.rs b/src/basic.rs index 79aea46d..6b24dfba 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -77,6 +77,11 @@ pub mod imp { /// use simdutf8::basic::imp::Utf8Validator; /// use std::io::{stdin, Read, Result}; /// + /// # #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + /// # fn main() { + /// # } + /// + /// # #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] /// fn main() -> Result<()> { /// unsafe { /// if !std::is_x86_feature_detected!("avx2") { diff --git a/src/implementation/aarch64/mod.rs b/src/implementation/aarch64/mod.rs index 6b1bfb2f..a483ecbc 100644 --- a/src/implementation/aarch64/mod.rs +++ b/src/implementation/aarch64/mod.rs @@ -5,10 +5,6 @@ pub(crate) mod neon; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_basic(input: &[u8]) -> Result<(), crate::basic::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_basic_fallback(input); - } - validate_utf8_basic_neon(input) } @@ -24,10 +20,6 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; #[inline] #[cfg(all(feature = "aarch64_neon", target_feature = "neon"))] pub(crate) unsafe fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error> { - if input.len() < super::helpers::SIMD_CHUNK_SIZE { - return super::validate_utf8_compat_fallback(input); - } - validate_utf8_compat_neon(input) } diff --git a/src/implementation/aarch64/neon.rs b/src/implementation/aarch64/neon.rs index 8fb05057..c1cf3790 100644 --- a/src/implementation/aarch64/neon.rs +++ b/src/implementation/aarch64/neon.rs @@ -6,6 +6,7 @@ use core::arch::aarch64::{ }; use crate::implementation::helpers::Utf8CheckAlgorithm; +use core::arch::asm; // aarch64 SIMD primitives @@ -100,6 +101,306 @@ impl SimdU8Value { Self::from(dst.assume_init()) } + 
#[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::from(Self::load_partial_assembly(ptr, len)) + // Self::from(Self::load_partial_intrinsics(ptr, len)) + } + + #[inline(always)] + #[allow(clippy::inline_always)] + #[allow(clippy::too_many_lines)] + unsafe fn load_partial_intrinsics(ptr: *const u8, len: usize) -> uint8x16_t { + let mut res = Self::splat0(); + match len { + 0 => {} + 1 => { + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr, res.0, 0); + } + 2 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 3 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(2), res.0, 2); + } + 4 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 5 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(4), res.0, 4); + } + 6 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 7 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(4).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(6), res.0, 6); + } + 8 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + } + 9 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + 
ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(8), res.0, 8); + } + 10 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + } + 11 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 4, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(10), res.0, 10); + } + 12 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + } + 13 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(12), res.0, 12); + } + 14 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + } + 15 => { + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u64( + ptr.cast(), + core::mem::transmute(res.0), + 0, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u32( + ptr.add(8).cast(), + core::mem::transmute(res.0), + 
2, + )); + res.0 = core::mem::transmute(core::arch::aarch64::vld1q_lane_u16( + ptr.add(12).cast(), + core::mem::transmute(res.0), + 6, + )); + res.0 = core::arch::aarch64::vld1q_lane_u8(ptr.add(14), res.0, 14); + } + _ => { + // not allowed + debug_assert!(false); + } + } + res.0 + } + + #[inline(always)] + #[allow(clippy::too_many_lines)] + #[allow(clippy::inline_always)] + #[allow(unused_assignments)] + fn load_partial_assembly(mut ptr: *const u8, len: usize) -> core::arch::aarch64::uint8x16_t { + assert!(len < 16); + let res: core::arch::aarch64::uint8x16_t; + unsafe { + asm!( + "movi.2d {res:v}, #0000000000000000", + "adr {scratch}, #12", + "adds {scratch}, {scratch}, {len}, lsl #4", + "br {scratch}", + + // 0 + "b 99f", + "nop", + "nop", + "nop", + + // 1 + "ld1.b {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 2 + "ld1.h {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 3 + "ld1.h {{ {res:v} }}[0], [{ptr}], #2", + "ld1.b {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 4 + "ld1.s {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 5 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.b {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 6 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 7 + "ld1.s {{ {res:v} }}[0], [{ptr}], #4", + "ld1.h {{ {res:v} }}[2], [{ptr}], #2", + "ld1.b {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 8 + "ld1.d {{ {res:v} }}[0], [{ptr}]", + "b 99f", + "nop", + "nop", + + // 9 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.b {{ {res:v} }}[8], [{ptr}]", + "b 99f", + "nop", + + // 10 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}]", + "b 99f", + "nop", + + // 11 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.h {{ {res:v} }}[4], [{ptr}], #2", + "ld1.b {{ {res:v} }}[10], [{ptr}]", + "b 99f", + + // 12 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}]", + "b 99f", + "nop", + + // 13 + 
"ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.b {{ {res:v} }}[12], [{ptr}]", + "b 99f", + + // 14 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}]", + "b 99f", + + // 15 + "ld1.d {{ {res:v} }}[0], [{ptr}], #8", + "ld1.s {{ {res:v} }}[2], [{ptr}], #4", + "ld1.h {{ {res:v} }}[6], [{ptr}], #2", + "ld1.b {{ {res:v} }}[14], [{ptr}]", + + "99:", + ptr = inout(reg) ptr, + len = in(reg) len, + scratch = out(reg) _, + res = lateout(vreg) res, + options(pure, readonly, nostack) + ); + }; + res + } + #[inline] #[allow(clippy::too_many_arguments)] unsafe fn lookup_16( @@ -233,6 +534,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("not_used"); algorithm_simd!("not_used"); diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 85d881c4..52b50b83 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -182,22 +182,85 @@ macro_rules! 
algorithm_simd { #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] #[allow(unconditional_panic)] // does not panic because len is checked - #[allow(const_err)] // the same, but for Rust 1.38.0 unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 - if input.vals.len() == 2 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.incomplete = Self::is_incomplete(input.vals[1]); - } else if input.vals.len() == 4 { - self.check_bytes(input.vals[0]); - self.check_bytes(input.vals[1]); - self.check_bytes(input.vals[2]); - self.check_bytes(input.vals[3]); - self.incomplete = Self::is_incomplete(input.vals[3]); - } else { - panic!("Unsupported number of chunks"); + match input.vals.len() { + 2 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.incomplete = Self::is_incomplete(input.vals[1]); + } + 4 => { + self.check_bytes(input.vals[0]); + self.check_bytes(input.vals[1]); + self.check_bytes(input.vals[2]); + self.check_bytes(input.vals[3]); + self.incomplete = Self::is_incomplete(input.vals[3]); + } + _ => { + panic!("Unsupported number of chunks"); + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + unsafe fn check_remainder(&mut self, mut input: *const u8, len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); + let orig_len = len; + let mut len = len; + + while len >= SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if simd_val.is_ascii() { + if orig_len == len { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = 
SimdU8Value::load_partial(input, len); + if simd_val.is_ascii() { + if orig_len < SIMD_SIZE { + // first after last block, check if previous block is incomplete + self.check_incomplete_pending(); + } + } else { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(unconditional_panic)] // does not panic because len is checked + unsafe fn check_remainder_ascii(&mut self, mut input: *const u8, mut len: usize) { + const SIMD_SIZE: usize = core::mem::size_of::(); + + while len >= SIMD_SIZE { + let simd_val = SimdU8Value::load_from(input); + input = input.add(SIMD_SIZE); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } + len -= SIMD_SIZE; + } + if len > 0 { + let simd_val = SimdU8Value::load_partial(input, len); + if !simd_val.is_ascii() { + self.check_bytes(simd_val); + self.incomplete = Self::is_incomplete(simd_val); + } } } } @@ -222,13 +285,22 @@ macro_rules! algorithm_simd { let mut idx: usize = 0; let iter_lim = len - (len % SIMD_CHUNK_SIZE); - while idx < iter_lim { - let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); - idx += SIMD_CHUNK_SIZE; - if !simd_input.is_ascii() { - algorithm.check_block(simd_input); - break; + 'outer: loop { + while idx < iter_lim { + let simd_input = SimdInput::new(input.get_unchecked(idx as usize..)); + idx += SIMD_CHUNK_SIZE; + if !simd_input.is_ascii() { + algorithm.check_block(simd_input); + break 'outer; + } } + if idx < len { + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len - idx); + if !simd_input.is_ascii() { + break; + } + } + return Ok(()); } while idx < iter_lim { @@ -241,14 +313,7 @@ macro_rules! 
algorithm_simd { } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx); } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -302,7 +367,15 @@ macro_rules! algorithm_simd { } idx += SIMD_CHUNK_SIZE; } - break; + if idx < len { + algorithm.check_remainder_ascii(input.as_ptr().add(idx), len - idx); + algorithm.check_incomplete_pending(); + } + return if algorithm.has_error() { + Err(idx) + } else { + Ok(()) + }; } else { while idx < iter_lim { if PREFETCH { @@ -331,15 +404,7 @@ macro_rules! algorithm_simd { } } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(&tmpbuf.0); - - algorithm.check_utf8(simd_input); + algorithm.check_remainder(input.as_ptr().add(idx), len - idx) } algorithm.check_incomplete_pending(); if algorithm.has_error() { @@ -538,6 +603,101 @@ macro_rules! 
simd_input_128_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { + let partial_len = len % 16; + if len < 16 { + Self { + vals: [ + SimdU8Value::load_partial(ptr, partial_len), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(16), partial_len), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_partial(ptr.add(32), partial_len), + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + SimdU8Value::load_partial(ptr.add(48), partial_len), + ], + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 16 * 16), len % 16); + if len < 16 { + Self { + vals: [ + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 32 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + partial, + SimdU8Value::splat0(), + SimdU8Value::splat0(), + ], + } + } else if len < 48 { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + partial, + SimdU8Value::splat0(), + ], + } + } else { + Self { + vals: [ + 
SimdU8Value::load_from(ptr), + SimdU8Value::load_from(ptr.add(16)), + SimdU8Value::load_from(ptr.add(32)), + partial, + ], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { @@ -570,6 +730,47 @@ macro_rules! simd_input_256_bit { } } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self::new_partial_ordered(ptr, len) + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_ordered(ptr: *const u8, len: usize) -> Self { + if len < 32 { + Self { + vals: [SimdU8Value::load_partial(ptr, len), SimdU8Value::splat0()], + } + } else { + Self { + vals: [ + SimdU8Value::load_from(ptr), + SimdU8Value::load_partial(ptr.add(32), len - 32), + ], + } + } + } + + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] + #[inline] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn new_partial_small(ptr: *const u8, len: usize) -> Self { + let partial = SimdU8Value::load_partial(ptr.add(len / 32 * 32), len % 32); + if len < 32 { + Self { + vals: [partial, SimdU8Value::splat0()], + } + } else { + Self { + vals: [SimdU8Value::load_from(ptr), partial], + } + } + } + #[cfg_attr(not(target_arch="aarch64"), target_feature(enable = $feat))] #[inline] unsafe fn is_ascii(&self) -> bool { diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index a6bd693a..9aedb81a 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -37,22 +37,83 @@ pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8Er validate_utf8_at_offset(input, offset).unwrap_err() } +#[inline] +unsafe fn memcpy_u32(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + 
.write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(4); + *dest = dest.offset(4); +} + +#[inline] +unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { + #[allow(clippy::cast_ptr_alignment)] + dest.cast::() + .write_unaligned(src.cast::().read_unaligned()); + *src = src.offset(8); + *dest = dest.offset(8); +} + #[allow(dead_code)] #[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const -pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_16( mut src: *const u8, mut dest: *mut u8, mut len: usize, ) { - // This gets properly auto-vectorized on AVX 2 and SSE 4.2 - #[inline] - unsafe fn memcpy_u64(src: &mut *const u8, dest: &mut *mut u8) { - #[allow(clippy::cast_ptr_alignment)] - dest.cast::() - .write_unaligned(src.cast::().read_unaligned()); - *src = src.offset(8); - *dest = dest.offset(8); + if len >= 8 { + memcpy_u32(&mut src, &mut dest); + memcpy_u32(&mut src, &mut dest); + len -= 8; } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} + +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_32( + mut src: *const u8, + mut dest: *mut u8, + mut len: usize, +) { + if len >= 16 { + memcpy_u64(&mut src, &mut dest); + memcpy_u64(&mut src, &mut dest); + len -= 16; + } + if len >= 8 { + memcpy_u64(&mut src, &mut dest); + len -= 8; + } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } + while len > 0 { + *dest = *src; + src = src.offset(1); + dest = dest.offset(1); + len -= 1; + } +} + +#[allow(dead_code)] +#[allow(clippy::missing_const_for_fn)] // clippy is wrong, it cannot really be const +pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + mut src: *const 
u8, + mut dest: *mut u8, + mut len: usize, +) { if len >= 32 { memcpy_u64(&mut src, &mut dest); memcpy_u64(&mut src, &mut dest); @@ -69,6 +130,10 @@ pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64( memcpy_u64(&mut src, &mut dest); len -= 8; } + if len >= 4 { + memcpy_u32(&mut src, &mut dest); + len -= 4; + } while len > 0 { *dest = *src; src = src.offset(1); @@ -115,3 +180,37 @@ impl TempSimdChunkA32 { pub(crate) struct SimdU8Value(pub(crate) T) where T: Copy; + +#[cfg(test)] +impl core::fmt::Display for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} + +#[cfg(test)] +impl core::fmt::LowerHex for SimdU8Value { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + unsafe { + if core::mem::size_of::() == 16 { + let arr: [u8; 16] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else if core::mem::size_of::() == 32 { + let arr: [u8; 32] = core::mem::transmute_copy(&self.0); + write!(f, "{:x?}", arr) + } else { + Err(core::fmt::Error) + } + } + } +} diff --git a/src/implementation/x86/avx2.rs b/src/implementation/x86/avx2.rs index 8232f571..f20ea798 100644 --- a/src/implementation/x86/avx2.rs +++ b/src/implementation/x86/avx2.rs @@ -4,17 +4,21 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, 
_mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256, - _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8, - _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, - _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, + __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_blendv_ps, _mm256_castps_si256, + _mm256_castsi256_ps, _mm256_cmpgt_epi8, _mm256_loadu_si256, _mm256_maskload_epi32, + _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi32, + _mm256_set1_epi8, _mm256_set_epi32, _mm256_setr_epi8, _mm256_setzero_si256, + _mm256_shuffle_epi8, _mm256_sllv_epi32, _mm256_srli_epi16, _mm256_subs_epu8, + _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -103,6 +107,63 @@ impl SimdU8Value { Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>())) } + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn vecmask_from_bitmask(mask: u8) -> Self { + let vshift_count = _mm256_set_epi32(24, 25, 26, 27, 28, 29, 30, 31); + let bcast = _mm256_set1_epi32(i32::from(mask)); + let shifted = _mm256_sllv_epi32(bcast, vshift_count); // high bit of each element = corresponding bit of the mask + Self::from(shifted) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + 
Self::load_partial_direct(ptr, len) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_direct(mut ptr: *const u8, len: usize) -> Self { + if len == 0 { + return Self::splat0(); + } + let sel_mask = 1 << (len / 4); + let mask = (sel_mask - 1) as u8; + let mut res = _mm256_maskload_epi32(ptr.cast(), Self::vecmask_from_bitmask(mask).0); + let remainder = len % 4; + if remainder != 0 { + ptr = ptr.add((len - len % 4) as usize); + let remaining_bytes = match remainder { + 1 => u32::from(*ptr), + 2 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8, + 3 => u32::from(*ptr) | u32::from(*ptr.add(1)) << 8 | u32::from(*ptr.add(2)) << 16, + _ => 0, + }; + // blend vec with 4-byte chunks and last incomplete chunk together + #[allow(clippy::cast_possible_wrap)] + let remaining_vec = _mm256_set1_epi32(remaining_bytes as i32); + res = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(res), + _mm256_castsi256_ps(remaining_vec), + _mm256_castsi256_ps(Self::vecmask_from_bitmask(sel_mask).0), + )); + } + Self::from(res) + } + + #[target_feature(enable = "avx2")] + #[inline] + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 32]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_32( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "avx2")] #[inline] unsafe fn lookup_16( @@ -255,7 +316,41 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0); } +#[cfg(test)] +mod test { + #[cfg(not(feature = "std"))] + extern crate std; + + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + if !std::is_x86_feature_detected!("avx2") { + return; + } + + let arr = [ + 1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + unsafe { + for len in 0..32 { + let loaded_arr: [u8; 32] = + 
core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = true; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk; simd_input_256_bit!("avx2"); algorithm_simd!("avx2"); diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 19954956..09fb53b7 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -4,11 +4,17 @@ pub(crate) mod avx2; #[allow(dead_code)] pub(crate) mod sse42; -#[allow(unused_imports)] -use super::helpers::SIMD_CHUNK_SIZE; - // validate_utf8_basic() std: implementation auto-selection +#[allow(dead_code)] +const ENABLE_AVX2: bool = true; + +#[allow(dead_code)] +const DELEGATE_TO_STD_FOR_SMALL_INPUTS: bool = true; + +#[allow(dead_code)] +const SMALL_STRING_LIMIT: usize = 9; + #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] pub(crate) unsafe fn validate_utf8_basic( @@ -27,7 +33,7 @@ pub(crate) unsafe fn validate_utf8_basic( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -38,7 +44,7 @@ pub(crate) unsafe fn validate_utf8_basic( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_basic @@ -53,7 +59,7 @@ fn get_fastest_available_implementation_basic() -> super::ValidateUtf8Fn { pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if 
DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -76,7 +82,7 @@ unsafe fn validate_utf8_basic_avx2( pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_basic_fallback(input); } @@ -123,7 +129,7 @@ pub(crate) unsafe fn validate_utf8_compat( (fun)(input) } - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -134,7 +140,7 @@ pub(crate) unsafe fn validate_utf8_compat( #[cfg(all(feature = "std", not(target_feature = "avx2")))] #[inline] fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn { - if std::is_x86_feature_detected!("avx2") { + if ENABLE_AVX2 && std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_compat @@ -149,7 +155,7 @@ fn get_fastest_available_implementation_compat() -> super::ValidateUtf8CompatFn pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } @@ -172,7 +178,7 @@ unsafe fn validate_utf8_compat_avx2( pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { - if input.len() < SIMD_CHUNK_SIZE { + if DELEGATE_TO_STD_FOR_SMALL_INPUTS && input.len() < SMALL_STRING_LIMIT { return super::validate_utf8_compat_fallback(input); } diff --git a/src/implementation/x86/sse42.rs b/src/implementation/x86/sse42.rs index f9140d50..af6de1db 100644 --- a/src/implementation/x86/sse42.rs 
+++ b/src/implementation/x86/sse42.rs @@ -4,15 +4,17 @@ #[cfg(target_arch = "x86")] use core::arch::x86::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ - __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8, - _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, - _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, + __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_insert_epi16, _mm_insert_epi8, + _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi16, + _mm_setr_epi32, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16, + _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0, }; use crate::implementation::helpers::Utf8CheckAlgorithm; @@ -99,6 +101,180 @@ impl SimdU8Value { Self::from(_mm_loadu_si128(ptr.cast::<__m128i>())) } + #[target_feature(enable = "sse4.2")] + #[inline] + #[allow(clippy::too_many_lines)] + #[allow(clippy::cast_ptr_alignment)] + unsafe fn load_partial(ptr: *const u8, len: usize) -> Self { + Self::from(match len { + 1 => _mm_setr_epi8( + ptr.cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 2 => _mm_setr_epi16(ptr.cast::().read_unaligned(), 0, 0, 0, 0, 0, 0, 0), + 3 => _mm_setr_epi8( + 
ptr.cast::().read_unaligned(), + ptr.add(1).cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ), + 4 => _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0), // assembly??? + 5 => { + let val = _mm_setr_epi32(ptr.cast::().read_unaligned(), 0, 0, 0); + _mm_insert_epi8(val, i32::from(ptr.add(4).cast::().read_unaligned()), 4) + } + 6 => _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ), + 7 => { + let val = _mm_setr_epi16( + ptr.cast::().read_unaligned(), + ptr.add(2).cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + 0, + 0, + 0, + ); + _mm_insert_epi8(val, i32::from(ptr.add(6).cast::().read_unaligned()), 6) + } + 8 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ), + 9 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + _mm_insert_epi8(val, i32::from(ptr.add(8).cast::().read_unaligned()), 8) + } + 10 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4) + } + 11 => { + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + 0, + 0, + ); + val = + _mm_insert_epi16(val, i32::from(ptr.add(8).cast::().read_unaligned()), 4); + _mm_insert_epi8( + val, + i32::from(ptr.add(10).cast::().read_unaligned()), + 10, + ) + } + 12 => _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ), + 13 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi8( + 
val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 12, + ) + } + 14 => { + let val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ) + } + 15 => { + let mut val = _mm_setr_epi32( + ptr.cast::().read_unaligned(), + ptr.add(4).cast::().read_unaligned(), + ptr.add(8).cast::().read_unaligned(), + 0, + ); + val = _mm_insert_epi16( + val, + i32::from(ptr.add(12).cast::().read_unaligned()), + 6, + ); + _mm_insert_epi8( + val, + i32::from(ptr.add(14).cast::().read_unaligned()), + 14, + ) + } + _ => Self::splat0().0, + }) + } + + unsafe fn load_partial_copy(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = [0_u8; 16]; + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_16( + ptr, + tmpbuf.as_mut_ptr(), + len, + ); + Self::load_from(tmpbuf.as_ptr()) + } + #[target_feature(enable = "sse4.2")] #[inline] unsafe fn lookup_16( @@ -239,7 +415,38 @@ unsafe fn simd_prefetch(ptr: *const u8) { _mm_prefetch(ptr.cast::(), _MM_HINT_T0); } +#[cfg(test)] +mod test { + #[cfg(not(features = "std"))] + extern crate std; + + #[allow(unused_imports)] + use super::*; + + #[test] + pub fn masked_load() { + if !std::is_x86_feature_detected!("sse4.2") { + return; + } + + let arr = [1_u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + unsafe { + for len in 0..16 { + let loaded_arr: [u8; 16] = + core::mem::transmute(SimdU8Value::load_partial(arr.as_ptr(), len)); + for i in 0..len { + assert_eq!(arr[i], loaded_arr[i]); + } + for x in &loaded_arr[len..arr.len()] { + assert_eq!(*x, 0); + } + } + } + } +} + const PREFETCH: bool = false; +#[allow(unused_imports)] use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk; simd_input_128_bit!("sse4.2"); algorithm_simd!("sse4.2"); diff --git a/src/lib.rs b/src/lib.rs index 51057298..cee27bc8 100644 --- a/src/lib.rs +++ 
b/src/lib.rs @@ -1,6 +1,6 @@ -#![deny(warnings)] +#![warn(warnings)] #![warn(unused_extern_crates)] -#![deny( +#![warn( clippy::all, clippy::unwrap_used, clippy::unnecessary_unwrap, @@ -15,7 +15,7 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr( all(feature = "aarch64_neon", target_arch = "aarch64"), - feature(stdsimd) + feature(stdarch_aarch64_prefetch) )] //! Blazingly fast API-compatible UTF-8 validation for Rust using SIMD extensions, based on the implementation from