Skip to content

Commit 18d367d

Browse files
committed
Fixed the panic on input consisting of a single Unicode character (two bytes)
1 parent c075cee commit 18d367d

File tree

2 files changed

+40
-79
lines changed

2 files changed

+40
-79
lines changed

src/parser.rs

Lines changed: 31 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
// This makes for some ugly code, but it is faster. Hopefully in the future
1818
// with MIR support the compiler will get smarter about this.
1919

20-
use std::{ str, slice, char };
20+
use std::{ str, slice };
2121
use object::Object;
2222
use number::Number;
2323
use { JsonValue, Error, Result };
@@ -31,16 +31,6 @@ const MAX_PRECISION: u64 = 576460752303423500;
3131
const DEPTH_LIMIT: usize = 512;
3232

3333

34-
// Position is only used when we stumble upon an unexpected character. We don't
35-
// track lines during parsing, as that would mean doing unnecessary work.
36-
// Instead, if an error occurs, we figure out the line and column from the
37-
// current index position of the parser.
38-
struct Position {
39-
pub line: usize,
40-
pub column: usize,
41-
}
42-
43-
4434
// The `Parser` struct keeps track of indexing over our buffer. All niceness
4535
// has been abandoned in favor of raw pointer magic. Does that make you feel
4636
// dirty? _Good._
@@ -91,7 +81,7 @@ macro_rules! expect_sequence {
9181
$(
9282
match expect_byte!($parser) {
9383
$ch => {},
94-
ch => return $parser.unexpected_character(ch),
84+
_ => return $parser.unexpected_character(),
9585
}
9686
)*
9787
}
@@ -131,9 +121,9 @@ macro_rules! expect_eof {
131121
while !$parser.is_eof() {
132122
match $parser.read_byte() {
133123
9 ... 13 | 32 => $parser.bump(),
134-
ch => {
124+
_ => {
135125
$parser.bump();
136-
return $parser.unexpected_character(ch);
126+
return $parser.unexpected_character();
137127
}
138128
}
139129
}
@@ -147,7 +137,7 @@ macro_rules! expect {
147137
let ch = expect_byte_ignore_whitespace!($parser);
148138

149139
if ch != $byte {
150-
return $parser.unexpected_character(ch)
140+
return $parser.unexpected_character()
151141
}
152142
});
153143

@@ -158,7 +148,7 @@ macro_rules! expect {
158148
$(
159149
$byte => $then,
160150
)*
161-
_ => return $parser.unexpected_character(ch)
151+
_ => return $parser.unexpected_character()
162152
}
163153

164154
})
@@ -222,7 +212,7 @@ macro_rules! expect_string {
222212
break;
223213
}
224214

225-
return $parser.unexpected_character(ch);
215+
return $parser.unexpected_character();
226216
}
227217

228218
result
@@ -324,7 +314,7 @@ macro_rules! expect_fraction {
324314
}
325315
}
326316
},
327-
_ => return $parser.unexpected_character(ch)
317+
_ => return $parser.unexpected_character()
328318
}
329319

330320
loop {
@@ -403,66 +393,28 @@ impl<'a> Parser<'a> {
403393
self.index = self.index.wrapping_add(1);
404394
}
405395

406-
// Figure out the `Position` in the source. This doesn't look like it's
407-
// very fast - it probably isn't, and it doesn't really have to be.
408-
// This method is only called when an unexpected character error occurs.
409-
fn source_position_from_index(&self, index: usize) -> Position {
410-
let (bytes, _) = self.source.split_at(index-1);
411-
412-
Position {
413-
line: bytes.lines().count(),
414-
column: bytes.lines().last().map_or(1, |line| {
415-
line.chars().count() + 1
416-
})
417-
}
418-
}
419-
420396
// So we got an unexpected character, now what? Well, figure out where
421397
// it is, and throw an error!
422-
fn unexpected_character<T: Sized>(&mut self, byte: u8) -> Result<T> {
423-
let pos = self.source_position_from_index(self.index);
424-
425-
// If the first byte is non ASCII (> 127), attempt to read the
426-
// codepoint from the following UTF-8 sequence. This can lead
427-
// to a fun scenario where an unexpected character error can
428-
// produce an end of json or UTF-8 failure error first :).
429-
let ch = if byte & 0x80 != 0 {
430-
let mut buf = [byte,0,0,0];
431-
let mut len = 0usize;
432-
433-
if byte & 0xE0 == 0xCE {
434-
// 2 bytes, 11 bits
435-
len = 2;
436-
buf[1] = expect_byte!(self);
437-
} else if byte & 0xF0 == 0xE0 {
438-
// 3 bytes, 16 bits
439-
len = 3;
440-
buf[1] = expect_byte!(self);
441-
buf[2] = expect_byte!(self);
442-
} else if byte & 0xF8 == 0xF0 {
443-
// 4 bytes, 21 bits
444-
len = 4;
445-
buf[1] = expect_byte!(self);
446-
buf[2] = expect_byte!(self);
447-
buf[3] = expect_byte!(self);
448-
}
398+
fn unexpected_character<T: Sized>(&mut self) -> Result<T> {
399+
let at = self.index - 1;
449400

450-
let slice = try!(
451-
str::from_utf8(&buf[0..len])
452-
.map_err(|_| Error::FailedUtf8Parsing)
453-
);
401+
let ch = self.source[at..]
402+
.chars()
403+
.next()
404+
.expect("Must have a character");
454405

455-
slice.chars().next().unwrap()
456-
} else {
406+
let (lineno, col) = self.source[..at]
407+
.lines()
408+
.enumerate()
409+
.last()
410+
.unwrap_or((0, ""));
457411

458-
// codepoints < 128 are safe ASCII compatibles
459-
unsafe { char::from_u32_unchecked(byte as u32) }
460-
};
412+
let colno = col.chars().count();
461413

462414
Err(Error::UnexpectedCharacter {
463415
ch: ch,
464-
line: pos.line,
465-
column: pos.column,
416+
line: lineno + 1,
417+
column: colno + 1,
466418
})
467419
}
468420

@@ -473,7 +425,7 @@ impl<'a> Parser<'a> {
473425
b'0' ... b'9' => (ch - b'0'),
474426
b'a' ... b'f' => (ch + 10 - b'a'),
475427
b'A' ... b'F' => (ch + 10 - b'A'),
476-
ch => return self.unexpected_character(ch),
428+
_ => return self.unexpected_character(),
477429
} as u32)
478430
}
479431

@@ -575,11 +527,11 @@ impl<'a> Parser<'a> {
575527
b't' => b'\t',
576528
b'r' => b'\r',
577529
b'n' => b'\n',
578-
_ => return self.unexpected_character(escaped)
530+
_ => return self.unexpected_character()
579531
};
580532
self.buffer.push(escaped);
581533
},
582-
_ => return self.unexpected_character(ch)
534+
_ => return self.unexpected_character()
583535
}
584536
ch = expect_byte!(self);
585537
}
@@ -656,7 +608,7 @@ impl<'a> Parser<'a> {
656608

657609
let mut e = match ch {
658610
b'0' ... b'9' => (ch - b'0') as i16,
659-
_ => return self.unexpected_character(ch),
611+
_ => return self.unexpected_character(),
660612
};
661613

662614
loop {
@@ -708,7 +660,7 @@ impl<'a> Parser<'a> {
708660
let mut object = Object::with_capacity(3);
709661

710662
if ch != b'"' {
711-
return self.unexpected_character(ch)
663+
return self.unexpected_character()
712664
}
713665

714666
object.insert(expect_string!(self), JsonValue::Null);
@@ -733,7 +685,7 @@ impl<'a> Parser<'a> {
733685
JsonValue::Number(- match ch {
734686
b'0' => allow_number_extensions!(self),
735687
b'1' ... b'9' => expect_number!(self, ch),
736-
_ => return self.unexpected_character(ch)
688+
_ => return self.unexpected_character()
737689
})
738690
}
739691
b't' => {
@@ -748,7 +700,7 @@ impl<'a> Parser<'a> {
748700
expect_sequence!(self, b'u', b'l', b'l');
749701
JsonValue::Null
750702
},
751-
_ => return self.unexpected_character(ch)
703+
_ => return self.unexpected_character()
752704
};
753705

754706
'popping: loop {
@@ -776,7 +728,7 @@ impl<'a> Parser<'a> {
776728
value = JsonValue::Array(array);
777729
continue 'popping;
778730
},
779-
_ => return self.unexpected_character(ch)
731+
_ => return self.unexpected_character()
780732
}
781733
},
782734

@@ -802,7 +754,7 @@ impl<'a> Parser<'a> {
802754

803755
continue 'popping;
804756
},
805-
_ => return self.unexpected_character(ch)
757+
_ => return self.unexpected_character()
806758
}
807759
},
808760
}

tests/parse.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,3 +318,12 @@ fn parse_error_after_depth_limit() {
318318
assert_eq!(parse(&text), Err(json::Error::ExceededDepthLimit));
319319
}
320320

321+
#[test]
322+
fn does_not_panic_on_single_unicode_char() {
323+
let bytes = vec![0xC3, 0xA5];
324+
325+
let string = String::from_utf8(bytes).unwrap();
326+
327+
assert!(parse(&string).is_err());
328+
}
329+

0 commit comments

Comments
 (0)