Skip to content

Commit bdc1428

Browse files
cmyr authored and stepancheg committed
[text-format] Fix parsing of string literals
This renames `next_byte_value` to `next_str_lit_bytes`, which may now return between 1 and 4 bytes per call, reflecting the variable-length nature of the UTF-8 encoding.
1 parent 82e76bc commit bdc1428

File tree

3 files changed

+53
-26
lines changed

3 files changed

+53
-26
lines changed

protobuf-support/src/lexer/lexer_impl.rs

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,34 @@ impl From<ProtobufFloatParseError> for LexerError {
6767
}
6868
}
6969

70+
/// The raw bytes for a single char or escape sequence in a string literal
71+
///
72+
/// The raw bytes are available via the `bytes` method.
73+
pub(crate) struct DecodedBytes {
74+
// a single char can be up to 4-bytes when encoded in utf-8
75+
buf: [u8; 4],
76+
len: usize,
77+
}
78+
79+
impl DecodedBytes {
80+
fn byte(b: u8) -> DecodedBytes {
81+
DecodedBytes {
82+
buf: [b, 0, 0, 0],
83+
len: 1,
84+
}
85+
}
86+
87+
fn char(value: char) -> Self {
88+
let mut buf = [0; 4];
89+
let len = value.encode_utf8(&mut buf).len();
90+
DecodedBytes { buf, len }
91+
}
92+
93+
pub(crate) fn bytes(&self) -> &[u8] {
94+
&self.buf[..self.len]
95+
}
96+
}
97+
7098
#[derive(Copy, Clone)]
7199
pub struct Lexer<'a> {
72100
language: ParserLanguage,
@@ -440,24 +468,24 @@ impl<'a> Lexer<'a> {
440468
// octEscape = '\' octalDigit octalDigit octalDigit
441469
// charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
442470
// quote = "'" | '"'
443-
pub fn next_byte_value(&mut self) -> LexerResult<u8> {
471+
pub(crate) fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> {
444472
match self.next_char()? {
445473
'\\' => {
446474
match self.next_char()? {
447-
'\'' => Ok(b'\''),
448-
'"' => Ok(b'"'),
449-
'\\' => Ok(b'\\'),
450-
'a' => Ok(b'\x07'),
451-
'b' => Ok(b'\x08'),
452-
'f' => Ok(b'\x0c'),
453-
'n' => Ok(b'\n'),
454-
'r' => Ok(b'\r'),
455-
't' => Ok(b'\t'),
456-
'v' => Ok(b'\x0b'),
475+
'\'' => Ok(DecodedBytes::byte(b'\'')),
476+
'"' => Ok(DecodedBytes::byte(b'"')),
477+
'\\' => Ok(DecodedBytes::byte(b'\\')),
478+
'a' => Ok(DecodedBytes::byte(b'\x07')),
479+
'b' => Ok(DecodedBytes::byte(b'\x08')),
480+
'f' => Ok(DecodedBytes::byte(b'\x0c')),
481+
'n' => Ok(DecodedBytes::byte(b'\n')),
482+
'r' => Ok(DecodedBytes::byte(b'\r')),
483+
't' => Ok(DecodedBytes::byte(b'\t')),
484+
'v' => Ok(DecodedBytes::byte(b'\x0b')),
457485
'x' => {
458486
let d1 = self.next_hex_digit()? as u8;
459487
let d2 = self.next_hex_digit()? as u8;
460-
Ok(((d1 << 4) | d2) as u8)
488+
Ok(DecodedBytes::byte((d1 << 4) | d2))
461489
}
462490
d if d >= '0' && d <= '7' => {
463491
let mut r = d as u8 - b'0';
@@ -467,16 +495,14 @@ impl<'a> Lexer<'a> {
467495
Ok(d) => r = (r << 3) + d as u8,
468496
}
469497
}
470-
Ok(r)
498+
Ok(DecodedBytes::byte(r))
471499
}
472500
// https://github.com/google/protobuf/issues/4562
473-
// TODO: overflow
474-
c => Ok(c as u8),
501+
c => Ok(DecodedBytes::char(c)),
475502
}
476503
}
477504
'\n' | '\0' => Err(LexerError::IncorrectInput),
478-
// TODO: check overflow
479-
c => Ok(c as u8),
505+
c => Ok(DecodedBytes::char(c)),
480506
}
481507
}
482508

@@ -530,7 +556,7 @@ impl<'a> Lexer<'a> {
530556
};
531557
first = false;
532558
while self.lookahead_char() != Some(q) {
533-
self.next_byte_value()?;
559+
self.next_str_lit_bytes()?;
534560
}
535561
self.next_char_expect_eq(q)?;
536562

protobuf-support/src/lexer/str_lit.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,11 @@ impl StrLit {
3232
let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
3333
let mut r = Vec::new();
3434
while !lexer.eof() {
35-
r.push(
35+
r.extend(
3636
lexer
37-
.next_byte_value()
38-
.map_err(|_| StrLitDecodeError::OtherError)?,
37+
.next_str_lit_bytes()
38+
.map_err(|_| StrLitDecodeError::OtherError)?
39+
.bytes(),
3940
);
4041
}
4142
Ok(String::from_utf8(r)?)
@@ -45,10 +46,11 @@ impl StrLit {
4546
let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json);
4647
let mut r = Vec::new();
4748
while !lexer.eof() {
48-
r.push(
49+
r.extend(
4950
lexer
50-
.next_byte_value()
51-
.map_err(|_| StrLitDecodeError::OtherError)?,
51+
.next_str_lit_bytes()
52+
.map_err(|_| StrLitDecodeError::OtherError)?
53+
.bytes(),
5254
);
5355
}
5456
Ok(r)

test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,7 @@ fn test_string_bytes() {
117117
#[test]
118118
fn non_ascii_strings() {
119119
test_text_format_str_descriptor("string_singular: \"À\"", &TestTypes::descriptor());
120-
// TODO: fix this.
121-
// test_text_format_str_descriptor("string_singular: \"日月\"", &TestTypes::descriptor());
120+
test_text_format_str_descriptor("string_singular: \"日月\"", &TestTypes::descriptor());
122121
}
123122

124123
#[test]

0 commit comments

Comments
 (0)