Skip to content

Commit ce5c7e7

Browse files
Simplify StringTable decoding after switching to 5-byte string ref encoding.
1 parent ea03582 commit ce5c7e7

File tree

1 file changed

+16
-71
lines changed

1 file changed

+16
-71
lines changed

analyzeme/src/stringtable.rs

Lines changed: 16 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ use measureme::{
1010
stringtable::STRING_REF_TAG,
1111
};
1212
use measureme::{Addr, StringId};
13-
use memchr::memchr;
13+
use memchr::{memchr, memchr2};
1414
use rustc_hash::FxHashMap;
1515
use std::borrow::Cow;
1616
use std::convert::TryInto;
@@ -34,6 +34,10 @@ pub struct StringRef<'st> {
3434
// be resolved.
3535
const UNKNOWN_STRING: &str = "<unknown>";
3636

37+
// This is the text we emit when we encounter string data that does not have a
38+
// proper terminator.
39+
const INVALID_STRING: &str = "<invalid>";
40+
3741
impl<'st> StringRef<'st> {
3842
/// Expands the StringRef into an actual string. This method will
3943
/// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -110,9 +114,18 @@ impl<'st> StringRef<'st> {
110114

111115
pos += STRING_REF_ENCODED_SIZE;
112116
} else {
113-
while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
114-
output.push(c);
117+
// This is a literal UTF-8 string value. Find its end by looking
118+
// for either of the two possible terminator bytes.
119+
let remaining_data = &self.table.string_data[pos..];
120+
if let Some(len) = memchr2(0xFF, 0xFE, remaining_data) {
121+
let value = String::from_utf8_lossy(&remaining_data[..len]);
122+
output.push_str(&value);
115123
pos += len;
124+
} else {
125+
// The grammar does not allow unterminated raw strings. We
126+
// have to stop decoding.
127+
output.push_str(INVALID_STRING);
128+
return;
116129
}
117130
}
118131
}
@@ -141,58 +154,6 @@ fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
141154
StringId::new(id)
142155
}
143156

144-
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
145-
// Returns the decoded `char` and its size in bytes if it succeeds.
146-
// Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
147-
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
148-
// encoding.
149-
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
150-
use std::convert::TryFrom;
151-
let first_byte = bytes[0] as u32;
152-
let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 {
153-
// The highest bit is zero, so this is a single-byte char
154-
(first_byte, 1)
155-
} else if (first_byte & 0b1110_0000) == 0b1100_0000 {
156-
// This is a two byte character
157-
let bits0 = first_byte & 0b0001_1111;
158-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
159-
160-
(bits0 << 6 | bits1, 2)
161-
} else if (first_byte & 0b1111_0000) == 0b1110_0000 {
162-
// This is a three byte character
163-
let bits0 = first_byte & 0b0000_1111;
164-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
165-
let bits2 = (bytes[2] & 0b0011_1111) as u32;
166-
167-
((bits0 << 12) | (bits1 << 6) | bits2, 3)
168-
} else if (first_byte & 0b1111_1000) == 0b1111_0000 {
169-
// This is a four byte character
170-
let bits0 = first_byte & 0b0000_0111;
171-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
172-
let bits2 = (bytes[2] & 0b0011_1111) as u32;
173-
let bits3 = (bytes[3] & 0b0011_1111) as u32;
174-
175-
((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4)
176-
} else {
177-
return None;
178-
};
179-
180-
match char::try_from(codepoint) {
181-
Ok(c) => {
182-
debug_assert!({
183-
let test_bytes = &mut [0u8; 8];
184-
c.encode_utf8(test_bytes);
185-
&test_bytes[..len] == &bytes[..len]
186-
});
187-
188-
Some((c, len))
189-
}
190-
Err(e) => {
191-
panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
192-
}
193-
}
194-
}
195-
196157
/// Read-only version of the string table
197158
#[derive(Debug)]
198159
pub struct StringTable {
@@ -343,20 +304,4 @@ mod tests {
343304
assert_eq!(str_ref.to_string(), write_to);
344305
}
345306
}
346-
347-
#[test]
348-
fn utf8_char_decoding() {
349-
use std::convert::TryFrom;
350-
351-
// Let's just test all possible codepoints because there are not that
352-
// many actually.
353-
for codepoint in 0..=0x10FFFFu32 {
354-
if let Ok(expected_char) = char::try_from(codepoint) {
355-
let buffer = &mut [0; 4];
356-
let expected_len = expected_char.encode_utf8(buffer).len();
357-
let expected = Some((expected_char, expected_len));
358-
assert_eq!(expected, decode_utf8_char(&buffer[..]));
359-
}
360-
}
361-
}
362307
}

0 commit comments

Comments
 (0)