Skip to content

Commit 6792001

Browse files
authored
Merge pull request #137 from michaelwoerister/more-address-space2
Allow StringTable to use the full 32-bit address space
2 parents 186be86 + f80765f commit 6792001

File tree

3 files changed

+66
-124
lines changed

3 files changed

+66
-124
lines changed

analyzeme/src/stringtable.rs

Lines changed: 38 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
//! See module-level documentation `measureme::stringtable`.
22
3-
use measureme::file_header::{
4-
strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
5-
FILE_MAGIC_STRINGTABLE_INDEX,
3+
use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
4+
use measureme::{
5+
file_header::{
6+
strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
7+
FILE_MAGIC_STRINGTABLE_INDEX,
8+
},
9+
stringtable::STRING_REF_ENCODED_SIZE,
10+
stringtable::STRING_REF_TAG,
611
};
7-
use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
812
use measureme::{Addr, StringId};
9-
use memchr::memchr;
13+
use memchr::{memchr, memchr2};
1014
use rustc_hash::FxHashMap;
1115
use std::borrow::Cow;
1216
use std::convert::TryInto;
@@ -30,6 +34,10 @@ pub struct StringRef<'st> {
3034
// be resolved.
3135
const UNKNOWN_STRING: &str = "<unknown>";
3236

37+
// This is the text we emit when we encounter string data that does not have a
38+
// proper terminator.
39+
const INVALID_STRING: &str = "<invalid>";
40+
3341
impl<'st> StringRef<'st> {
3442
/// Expands the StringRef into an actual string. This method will
3543
/// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -55,9 +63,8 @@ impl<'st> StringRef<'st> {
5563

5664
// Check if this is a string containing a single StringId component
5765
let first_byte = self.table.string_data[pos];
58-
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
59-
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
60-
let id = decode_string_id_from_data(&self.table.string_data[pos..pos + STRING_ID_SIZE]);
66+
if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
67+
let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
6168
return StringRef {
6269
id,
6370
table: self.table,
@@ -97,19 +104,28 @@ impl<'st> StringRef<'st> {
97104

98105
if byte == TERMINATOR {
99106
return;
100-
} else if is_utf8_continuation_byte(byte) {
107+
} else if byte == STRING_REF_TAG {
101108
let string_ref = StringRef {
102-
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
109+
id: decode_string_ref_from_data(&self.table.string_data[pos..]),
103110
table: self.table,
104111
};
105112

106113
string_ref.write_to_string(output);
107114

108-
pos += 4;
115+
pos += STRING_REF_ENCODED_SIZE;
109116
} else {
110-
while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
111-
output.push(c);
117+
// This is a literal UTF-8 string value. Find its end by looking
118+
// for either of the two possible terminator bytes.
119+
let remaining_data = &self.table.string_data[pos..];
120+
if let Some(len) = memchr2(0xFF, 0xFE, remaining_data) {
121+
let value = String::from_utf8_lossy(&remaining_data[..len]);
122+
output.push_str(&value);
112123
pos += len;
124+
} else {
125+
// The grammar does not allow unterminated raw strings. We
126+
// have to stop decoding.
127+
output.push_str(INVALID_STRING);
128+
return;
113129
}
114130
}
115131
}
@@ -129,71 +145,17 @@ impl<'st> StringRef<'st> {
129145
}
130146
}
131147

132-
fn is_utf8_continuation_byte(byte: u8) -> bool {
133-
// See module-level documentation for more information on the encoding.
134-
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
135-
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
136-
(byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
137-
}
138-
139148
// String refs in the table data are encoded as a `STRING_REF_TAG` byte followed
140149
// by a little endian u32 string ID, the same byte order used in the index.
141-
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
142-
let id = u32::from_be_bytes(bytes[0..4].try_into().unwrap());
143-
// Mask off the `0b10` prefix
144-
StringId::new(id & STRING_ID_MASK)
145-
}
146-
147-
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
148-
// Returns the decoded `char` and its size in bytes if it succeeds.
149-
// Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
150-
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
151-
// encoding.
152-
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
153-
use std::convert::TryFrom;
154-
let first_byte = bytes[0] as u32;
155-
let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 {
156-
// The highest bit is zero, so this is a single-byte char
157-
(first_byte, 1)
158-
} else if (first_byte & 0b1110_0000) == 0b1100_0000 {
159-
// This is a two byte character
160-
let bits0 = first_byte & 0b0001_1111;
161-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
162-
163-
(bits0 << 6 | bits1, 2)
164-
} else if (first_byte & 0b1111_0000) == 0b1110_0000 {
165-
// This is a three byte character
166-
let bits0 = first_byte & 0b0000_1111;
167-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
168-
let bits2 = (bytes[2] & 0b0011_1111) as u32;
169-
170-
((bits0 << 12) | (bits1 << 6) | bits2, 3)
171-
} else if (first_byte & 0b1111_1000) == 0b1111_0000 {
172-
// This is a four byte character
173-
let bits0 = first_byte & 0b0000_0111;
174-
let bits1 = (bytes[1] & 0b0011_1111) as u32;
175-
let bits2 = (bytes[2] & 0b0011_1111) as u32;
176-
let bits3 = (bytes[3] & 0b0011_1111) as u32;
177-
178-
((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4)
179-
} else {
180-
return None;
181-
};
182-
183-
match char::try_from(codepoint) {
184-
Ok(c) => {
185-
debug_assert!({
186-
let test_bytes = &mut [0u8; 8];
187-
c.encode_utf8(test_bytes);
188-
&test_bytes[..len] == &bytes[..len]
189-
});
190-
191-
Some((c, len))
192-
}
193-
Err(e) => {
194-
panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
195-
}
196-
}
150+
fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
151+
// The code below assumes we use a 5-byte encoding for string
152+
// refs, where the first byte is STRING_REF_TAG and the
153+
// following 4 bytes are a little-endian u32 string ID value.
154+
assert!(bytes[0] == STRING_REF_TAG);
155+
assert!(STRING_REF_ENCODED_SIZE == 5);
156+
157+
let id = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
158+
StringId::new(id)
197159
}
198160

199161
/// Read-only version of the string table
@@ -346,20 +308,4 @@ mod tests {
346308
assert_eq!(str_ref.to_string(), write_to);
347309
}
348310
}
349-
350-
#[test]
351-
fn utf8_char_decoding() {
352-
use std::convert::TryFrom;
353-
354-
// Let's just test all possible codepoints because there are not that
355-
// many actually.
356-
for codepoint in 0..=0x10FFFFu32 {
357-
if let Ok(expected_char) = char::try_from(codepoint) {
358-
let buffer = &mut [0; 4];
359-
let expected_len = expected_char.encode_utf8(buffer).len();
360-
let expected = Some((expected_char, expected_len));
361-
assert_eq!(expected, decode_utf8_char(&buffer[..]));
362-
}
363-
}
364-
}
365311
}

measureme/src/file_header.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::convert::TryInto;
55
use std::error::Error;
66
use std::path::Path;
77

8-
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 6;
8+
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 7;
99

1010
pub const FILE_MAGIC_TOP_LEVEL: &[u8; 4] = b"MMPD";
1111
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";

measureme/src/stringtable.rs

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,24 @@
1515
//! The byte-level encoding of component lists uses the structure of UTF-8 in
1616
//! order to save space:
1717
//!
18-
//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit
19-
//! prefix is reserved for bytes in the middle of a UTF-8 codepoint byte
20-
//! sequence. We make use of this fact by letting all string ID components
21-
//! start with this `10` prefix. Thus when we parse the contents of a value
22-
//! we know to stop if the start byte of the next codepoint has this prefix.
18+
//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
19+
//! of this fact by letting all string ID components start with this `0xFE`
20+
//! prefix. Thus when we parse the contents of a value we know to stop if
21+
//! we encounter this byte.
2322
//!
24-
//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs
25-
//! start with `10` as described above, they also cannot start with a `0xFF`
26-
//! byte. Thus we can safely use `0xFF` as our component list terminator.
23+
//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
24+
//! use `0xFF` as our component list terminator.
2725
//!
2826
//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
2927
//! encoded as:
3028
//!
3129
//! ```ignore
32-
//! ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255]
33-
//! ^^^^^^^^^^^^^ ^^^
34-
//! string ID 42 with 0b10 prefix terminator (0xFF)
30+
//! ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
31+
//! ^^^^^^^^^^^^^^^^ ^^^
32+
//! string ID with 0xFE prefix terminator (0xFF)
3533
//! ```
3634
//!
37-
//! As you can see string IDs are encoded in big endian format so that highest
38-
//! order bits show up in the first byte we encounter.
35+
//! As you can see string IDs are encoded in little endian format.
3936
//!
4037
//! ----------------------------------------------------------------------------
4138
//!
@@ -58,10 +55,10 @@
5855
//! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
5956
//!
6057
//! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
61-
//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used
62-
//! internally by `measureme` to record additional metadata about the profiling session.
63-
//! After `METADATA_STRING_ID` are all other `StringId` values.
64-
//!
58+
//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
59+
//! which is used internally by `measureme` to record additional metadata about
60+
//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
61+
//! values.
6562
6663
use crate::file_header::{
6764
write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
@@ -84,7 +81,6 @@ impl StringId {
8481

8582
#[inline]
8683
pub fn new(id: u32) -> StringId {
87-
assert!(id <= MAX_STRING_ID);
8884
StringId(id)
8985
}
9086

@@ -106,23 +102,20 @@ impl StringId {
106102

107103
#[inline]
108104
pub fn from_addr(addr: Addr) -> StringId {
109-
let id = addr.0 + FIRST_REGULAR_STRING_ID;
105+
let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
110106
StringId::new(id)
111107
}
112108

113109
#[inline]
114110
pub fn to_addr(self) -> Addr {
115-
assert!(self.0 >= FIRST_REGULAR_STRING_ID);
116-
Addr(self.0 - FIRST_REGULAR_STRING_ID)
111+
Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
117112
}
118113
}
119114

120115
// See module-level documentation for more information on the encoding.
121116
pub const TERMINATOR: u8 = 0xFF;
122-
123-
// All 1s except for the two highest bits.
124-
pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF;
125-
pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF;
117+
pub const STRING_REF_TAG: u8 = 0xFE;
118+
pub const STRING_REF_ENCODED_SIZE: usize = 5;
126119

127120
/// The maximum id value a virtual string may be.
128121
const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
@@ -175,7 +168,7 @@ impl<'s> StringComponent<'s> {
175168
fn serialized_size(&self) -> usize {
176169
match *self {
177170
StringComponent::Value(s) => s.len(),
178-
StringComponent::Ref(_) => 4,
171+
StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
179172
}
180173
}
181174

@@ -187,11 +180,14 @@ impl<'s> StringComponent<'s> {
187180
&mut bytes[s.len()..]
188181
}
189182
StringComponent::Ref(string_id) => {
190-
assert!(string_id.0 == string_id.0 & STRING_ID_MASK);
191-
let tagged = string_id.0 | (1u32 << 31);
192-
193-
&mut bytes[0..4].copy_from_slice(&tagged.to_be_bytes());
194-
&mut bytes[4..]
183+
// The code below assumes we use a 5-byte encoding for string
184+
// refs, where the first byte is STRING_REF_TAG and the
185+
// following 4 bytes are a little-endian u32 string ID value.
186+
assert!(STRING_REF_ENCODED_SIZE == 5);
187+
188+
bytes[0] = STRING_REF_TAG;
189+
&mut bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
190+
&mut bytes[5..]
195191
}
196192
}
197193
}

0 commit comments

Comments
 (0)