Skip to content

Commit 55914e1

Browse files
committed
Merge branch 'master' of github.com:Licenser/simdjson-rs
2 parents 852c275 + 32e5b5b commit 55914e1

22 files changed

+1350
-247
lines changed

.circleci/config.yml

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@ version: 2.1
33
#orbs:
44
# codecov: codecov/codecov@1.0.4
55

6-
jobs:
7-
test:
8-
docker:
9-
- image: rust:1
10-
steps:
6+
commands:
7+
run_tests:
8+
description: "A very simple command for demonstration purposes"
9+
steps:
1110
- checkout
1211
# - run:
1312
# name: Install CMAKE
@@ -55,8 +54,25 @@ jobs:
5554
# name: Run cycle benchmarks
5655
# command: make perf
5756

57+
jobs:
58+
test:
59+
docker:
60+
- image: rust:1
61+
environment:
62+
RUSTFLAGS: '-C target-cpu=native'
63+
steps:
64+
- run_tests: {}
65+
test-sse:
66+
docker:
67+
- image: rust:1
68+
environment:
69+
RUSTFLAGS: '-C target-cpu=native -C target-feature=-avx2'
70+
steps:
71+
- run_tests: {}
72+
5873
workflows:
5974
version: 2
6075
workflow:
6176
jobs:
6277
- test
78+
- test-sse

data/crash/crash000030.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"Neh":-333333333333333333333333333333.3}

data/crash/crash000031.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[ -9265394459000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0p003]

data/crash/crash000032.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[348253421170671280348253421170679.6171,-]

data/crash/crash000033.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[3,28033333333333333333333333333333333348253421170679.6170-]

data/crash/crash000034.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[3,2825342119.61701e-0,1,
2+
128253421170679.61701e-0,1,
3+
128253421170679.61701e-0,1,
4+
0.1e1,-1e-1,-128253421170679.61701e3,28253421170679.61701e-0,1,
5+
48253421170679.61701e-0,1,
6+
1266666666661,-128253421170679.61701e3,28253421170679.61701e-0,1,
7+
48253421170679.61701e-0,1,
8+
12666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666661170679.61701e3,28253421170679.61701e-0,1,
9+
48253421170679.61701e-0,16666666666666666666666666666666666666666666666666666666666666666666666679.6111-]

data/crash/crash000035.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-9223372036854775808

data/crash/crash000036.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[
2+
1066,
3+
1e1,
4+
0.1e1,
5+
1e-1,-1e-0, -88888888888888888888888888888888.1,"rod"]

simd-fuzz-target/Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
run: build
2-
RUSTFLAGS='-C codegen-units=1' cargo +nightly afl fuzz -i in -o out target/debug/simd-fuzz-target
2+
RUSTFLAGS='-C codegen-units=1 -C target-cpu=native' cargo +nightly afl fuzz -i in -o out target/debug/simd-fuzz-target
33
build:
44
RUSTFLAGS='-C codegen-units=1' cargo +nightly afl build
5+
run-sse: build-sse
6+
RUSTFLAGS='-C codegen-units=1 -C target-cpu=native -C target-feature=-avx2' cargo +nightly afl fuzz -i in -o out target/debug/simd-fuzz-target
7+
build-sse:
8+
RUSTFLAGS='-C codegen-units=1 -C target-cpu=native -C target-feature=-avx2' cargo +nightly afl build
59

610
copy:
711
for from in `ls out/crashes/id*`; do to=`echo $$from | sed -e 's;out/crashes/id:;crash;' -e 's;,.*;.json;'`; cp $$from ../simdjson-rs/data/crash/$$to; done

src/avx2/deser.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#[cfg(target_arch = "x86")]
2+
use std::arch::x86::*;
3+
#[cfg(target_arch = "x86_64")]
4+
use std::arch::x86_64::*;
5+
6+
use std::mem;
7+
8+
pub use crate::error::{Error, ErrorType};
9+
pub use crate::Deserializer;
10+
pub use crate::Result;
11+
pub use crate::avx2::utf8check::*;
12+
pub use crate::stringparse::*;
13+
14+
use crate::portability::trailingzeroes;
15+
16+
17+
impl<'de> Deserializer<'de> {
18+
#[cfg_attr(not(feature = "no-inline"), inline(always))]
19+
pub fn parse_str_(&mut self) -> Result<&'de str> {
20+
// Add 1 to skip the initial "
21+
let idx = self.iidx + 1;
22+
let mut padding = [0u8; 32];
23+
//let mut read: usize = 0;
24+
25+
// we include the terminal '"' so we know where to end
26+
// This is safe since we check sub's length in the range access above and only
27+
// create sub slices from sub up to `sub.len()`.
28+
29+
let src: &[u8] = unsafe { &self.input.get_unchecked(idx..) };
30+
let mut src_i: usize = 0;
31+
let mut len = src_i;
32+
loop {
33+
let v: __m256i = if src.len() >= src_i + 32 {
34+
// This is safe since we ensure src is at least 32 wide
35+
#[allow(clippy::cast_ptr_alignment)]
36+
unsafe {
37+
_mm256_loadu_si256(src.as_ptr().add(src_i) as *const __m256i)
38+
}
39+
} else {
40+
unsafe {
41+
padding
42+
.get_unchecked_mut(..src.len() - src_i)
43+
.clone_from_slice(src.get_unchecked(src_i..));
44+
// This is safe since padding is exactly 32 bytes wide
45+
#[allow(clippy::cast_ptr_alignment)]
46+
_mm256_loadu_si256(padding.as_ptr() as *const __m256i)
47+
}
48+
};
49+
50+
// store to dest unconditionally - we can overwrite the bits we don't like
51+
// later
52+
let bs_bits: u32 = unsafe {
53+
static_cast_u32!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
54+
v,
55+
_mm256_set1_epi8(b'\\' as i8)
56+
)))
57+
};
58+
let quote_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'"' as i8)) };
59+
let quote_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(quote_mask)) };
60+
if (bs_bits.wrapping_sub(1) & quote_bits) != 0 {
61+
// we encountered quotes first. Move dst to point to quotes and exit
62+
// find out where the quote is...
63+
let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32;
64+
65+
///////////////////////
66+
// Above, check for overflow in case someone has a crazy string (>=4GB?)
67+
// But only add the overflow check when the document itself exceeds 4GB
68+
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
69+
////////////////////////
70+
71+
// we advance the point, accounting for the fact that we have a NULL termination
72+
73+
len += quote_dist as usize;
74+
unsafe {
75+
let v = self.input.get_unchecked(idx..idx + len) as *const [u8] as *const str;
76+
return Ok(&*v);
77+
}
78+
79+
// we compare the pointers since we care if they are 'at the same spot'
80+
// not if they are the same value
81+
}
82+
if (quote_bits.wrapping_sub(1) & bs_bits) != 0 {
83+
// Move to the 'bad' character
84+
let bs_dist: u32 = trailingzeroes(u64::from(bs_bits));
85+
len += bs_dist as usize;
86+
src_i += bs_dist as usize;
87+
break;
88+
} else {
89+
// they are the same. Since they can't co-occur, it means we encountered
90+
// neither.
91+
src_i += 32;
92+
len += 32;
93+
}
94+
}
95+
96+
let mut dst_i: usize = 0;
97+
let dst: &mut [u8] = &mut self.strings;
98+
99+
loop {
100+
let v: __m256i = if src.len() >= src_i + 32 {
101+
// This is safe since we ensure src is at least 32 wide
102+
#[allow(clippy::cast_ptr_alignment)]
103+
unsafe {
104+
_mm256_loadu_si256(src.as_ptr().add(src_i) as *const __m256i)
105+
}
106+
} else {
107+
unsafe {
108+
padding
109+
.get_unchecked_mut(..src.len() - src_i)
110+
.clone_from_slice(src.get_unchecked(src_i..));
111+
// This is safe since padding is exactly 32 bytes wide
112+
#[allow(clippy::cast_ptr_alignment)]
113+
_mm256_loadu_si256(padding.as_ptr() as *const __m256i)
114+
}
115+
};
116+
117+
#[allow(clippy::cast_ptr_alignment)]
118+
unsafe {
119+
_mm256_storeu_si256(dst.as_mut_ptr().add(dst_i) as *mut __m256i, v)
120+
};
121+
122+
// store to dest unconditionally - we can overwrite the bits we don't like
123+
// later
124+
let bs_bits: u32 = unsafe {
125+
static_cast_u32!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
126+
v,
127+
_mm256_set1_epi8(b'\\' as i8)
128+
)))
129+
};
130+
let quote_mask = unsafe { _mm256_cmpeq_epi8(v, _mm256_set1_epi8(b'"' as i8)) };
131+
let quote_bits = unsafe { static_cast_u32!(_mm256_movemask_epi8(quote_mask)) };
132+
if (bs_bits.wrapping_sub(1) & quote_bits) != 0 {
133+
// we encountered quotes first. Move dst to point to quotes and exit
134+
// find out where the quote is...
135+
let quote_dist: u32 = trailingzeroes(u64::from(quote_bits)) as u32;
136+
137+
///////////////////////
138+
// Above, check for overflow in case someone has a crazy string (>=4GB?)
139+
// But only add the overflow check when the document itself exceeds 4GB
140+
// Currently unneeded because we refuse to parse docs larger or equal to 4GB.
141+
////////////////////////
142+
143+
// we advance the point, accounting for the fact that we have a NULL termination
144+
145+
dst_i += quote_dist as usize;
146+
unsafe {
147+
self.input
148+
.get_unchecked_mut(idx + len..idx + len + dst_i)
149+
.clone_from_slice(&self.strings.get_unchecked(..dst_i));
150+
let v = self.input.get_unchecked(idx..idx + len + dst_i) as *const [u8]
151+
as *const str;
152+
self.str_offset += dst_i as usize;
153+
return Ok(&*v);
154+
}
155+
156+
// we compare the pointers since we care if they are 'at the same spot'
157+
// not if they are the same value
158+
}
159+
if (quote_bits.wrapping_sub(1) & bs_bits) != 0 {
160+
// find out where the backslash is
161+
let bs_dist: u32 = trailingzeroes(u64::from(bs_bits));
162+
let escape_char: u8 = unsafe { *src.get_unchecked(src_i + bs_dist as usize + 1) };
163+
// we encountered backslash first. Handle backslash
164+
if escape_char == b'u' {
165+
// move src/dst up to the start; they will be further adjusted
166+
// within the unicode codepoint handling code.
167+
src_i += bs_dist as usize;
168+
dst_i += bs_dist as usize;
169+
let (o, s) = if let Ok(r) =
170+
handle_unicode_codepoint(unsafe { src.get_unchecked(src_i..) }, unsafe {
171+
dst.get_unchecked_mut(dst_i..)
172+
}) {
173+
r
174+
} else {
175+
return Err(self.error(ErrorType::InvlaidUnicodeCodepoint));
176+
};
177+
if o == 0 {
178+
return Err(self.error(ErrorType::InvlaidUnicodeCodepoint));
179+
};
180+
// We moved o steps forward at the destination and s steps on the source
181+
src_i += s;
182+
dst_i += o;
183+
} else {
184+
// simple 1:1 conversion. Will eat bs_dist+2 characters in input and
185+
// write bs_dist+1 characters to output
186+
// note this may reach beyond the part of the buffer we've actually
187+
// seen. I think this is ok
188+
let escape_result: u8 =
189+
unsafe { *ESCAPE_MAP.get_unchecked(escape_char as usize) };
190+
if escape_result == 0 {
191+
return Err(self.error(ErrorType::InvalidEscape));
192+
}
193+
unsafe {
194+
*dst.get_unchecked_mut(dst_i + bs_dist as usize) = escape_result;
195+
}
196+
src_i += bs_dist as usize + 2;
197+
dst_i += bs_dist as usize + 1;
198+
}
199+
} else {
200+
// they are the same. Since they can't co-occur, it means we encountered
201+
// neither.
202+
src_i += 32;
203+
dst_i += 32;
204+
}
205+
}
206+
}
207+
}

0 commit comments

Comments
 (0)