Skip to content

Commit 6709654

Browse files
committed
Fix bug that allowed invalid UTF8 to be parsed
Signed-off-by: Heinz N. Gies <heinz@licenser.net>
1 parent ad70738 commit 6709654

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

src/lib.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,14 @@ mod tests_serde {
13081308
//assert_eq!(v_serde, "\u{e}");
13091309
//assert_eq!(v_simd, v_serde);
13101310
}
1311+
#[test]
1312+
fn utf8_invalid_surrogates() {
1313+
// This is invalid UTF-8: the escape is a lone low surrogate (0xDE71 lies in 0xDC00..0xE000)
1314+
let mut d = String::from(r#""\uDE71""#);
1315+
let mut d = unsafe { d.as_bytes_mut() };
1316+
let v_simd: Result<serde_json::Value, _> = from_slice(&mut d);
1317+
assert!(v_simd.is_err());
1318+
}
13111319

13121320
#[test]
13131321
fn unicode() {

src/stringparse.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::ops::Range;
2+
13
use crate::charutils::{codepoint_to_utf8, hex_to_u32_nocheck};
24
use crate::error::ErrorType;
35

@@ -19,6 +21,9 @@ pub(crate) const ESCAPE_MAP: [u8; 256] = [
1921
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2022
];
2123

24+
const HIGH_SURROGATES: Range<u32> = 0xd800..0xdc00;
25+
const LOW_SURROGATES: Range<u32> = 0xdc00..0xe000;
26+
2227
/// handle a unicode codepoint
2328
/// write appropriate values into dest
2429
/// src will advance 6 bytes or 12 bytes
@@ -38,7 +43,7 @@ pub(crate) fn handle_unicode_codepoint(
3843
let mut src_offset = 6;
3944
// check for low surrogate for characters outside the Basic
4045
// Multilingual Plane.
41-
if (0xd800..0xdc00).contains(&code_point) {
46+
if HIGH_SURROGATES.contains(&code_point) {
4247
if (unsafe { *src_ptr.get_unchecked(0) } != b'\\')
4348
|| unsafe { *src_ptr.get_unchecked(1) } != b'u'
4449
{
@@ -67,6 +72,9 @@ pub(crate) fn handle_unicode_codepoint(
6772
};
6873
code_point = ((c1 << 10) | c2) + 0x10000;
6974
src_offset += 6;
75+
} else if LOW_SURROGATES.contains(&code_point) {
76+
// This is a low surrogate on its own, which is invalid.
77+
return Err(ErrorType::InvalidUtf8);
7078
}
7179
let offset: usize = codepoint_to_utf8(code_point, dst_ptr);
7280
Ok((offset, src_offset))

0 commit comments

Comments
 (0)