Skip to content

Commit ea03582

Browse files
Use 5-byte encoding for string-refs so we can use the full 32 bit address space.
1 parent ad0b58d commit ea03582

File tree

2 files changed

+41
-52
lines changed

2 files changed

+41
-52
lines changed

analyzeme/src/stringtable.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
//! See module-level documentation `measureme::stringtable`.
22
3-
use measureme::file_header::{
4-
strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
5-
FILE_MAGIC_STRINGTABLE_INDEX,
3+
use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
4+
use measureme::{
5+
file_header::{
6+
strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
7+
FILE_MAGIC_STRINGTABLE_INDEX,
8+
},
9+
stringtable::STRING_REF_ENCODED_SIZE,
10+
stringtable::STRING_REF_TAG,
611
};
7-
use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
812
use measureme::{Addr, StringId};
913
use memchr::memchr;
1014
use rustc_hash::FxHashMap;
@@ -55,9 +59,8 @@ impl<'st> StringRef<'st> {
5559

5660
// Check if this is a string containing a single StringId component
5761
let first_byte = self.table.string_data[pos];
58-
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
59-
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
60-
let id = decode_string_id_from_data(&self.table.string_data[pos..pos + STRING_ID_SIZE]);
62+
if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
63+
let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
6164
return StringRef {
6265
id,
6366
table: self.table,
@@ -97,15 +100,15 @@ impl<'st> StringRef<'st> {
97100

98101
if byte == TERMINATOR {
99102
return;
100-
} else if is_utf8_continuation_byte(byte) {
103+
} else if byte == STRING_REF_TAG {
101104
let string_ref = StringRef {
102-
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
105+
id: decode_string_ref_from_data(&self.table.string_data[pos..]),
103106
table: self.table,
104107
};
105108

106109
string_ref.write_to_string(output);
107110

108-
pos += 4;
111+
pos += STRING_REF_ENCODED_SIZE;
109112
} else {
110113
while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
111114
output.push(c);
@@ -129,19 +132,13 @@ impl<'st> StringRef<'st> {
129132
}
130133
}
131134

132-
fn is_utf8_continuation_byte(byte: u8) -> bool {
133-
// See module-level documentation for more information on the encoding.
134-
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
135-
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
136-
(byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
137-
}
138-
139135
// String IDs in the table data follow the `STRING_REF_TAG` byte and are encoded
140136
// in little endian format, the same byte order used for string IDs in the index.
141-
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
142-
let id = u32::from_be_bytes(bytes[0..4].try_into().unwrap());
143-
// Mask off the `0b10` prefix
144-
StringId::new(id & STRING_ID_MASK)
137+
fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
138+
assert!(bytes[0] == STRING_REF_TAG);
139+
assert!(STRING_REF_ENCODED_SIZE == 5);
140+
let id = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
141+
StringId::new(id)
145142
}
146143

147144
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.

measureme/src/stringtable.rs

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,24 @@
1515
//! The byte-level encoding of component lists uses the structure of UTF-8 in
1616
//! order to save space:
1717
//!
18-
//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit
19-
//! prefix is reserved for bytes in the middle of a UTF-8 codepoint byte
20-
//! sequence. We make use of this fact by letting all string ID components
21-
//! start with this `10` prefix. Thus when we parse the contents of a value
22-
//! we know to stop if the start byte of the next codepoint has this prefix.
18+
//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
19+
//! of this fact by letting all string ID components start with this `0xFE`
20+
//! prefix. Thus when we parse the contents of a value we know to stop if
21+
//! we encounter this byte.
2322
//!
24-
//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs
25-
//! start with `10` as described above, they also cannot start with a `0xFF`
26-
//! byte. Thus we can safely use `0xFF` as our component list terminator.
23+
//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
24+
//! use `0xFF` as our component list terminator.
2725
//!
2826
//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
2927
//! encoded as:
3028
//!
3129
//! ```ignore
32-
//! ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255]
33-
//! ^^^^^^^^^^^^^ ^^^
34-
//! string ID 42 with 0b10 prefix terminator (0xFF)
30+
//! ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
31+
//! ^^^^^^^^^^^^^^^^ ^^^
32+
//! string ID with 0xFE prefix terminator (0xFF)
3533
//! ```
3634
//!
37-
//! As you can see string IDs are encoded in big endian format so that highest
38-
//! order bits show up in the first byte we encounter.
35+
//! As you can see string IDs are encoded in little endian format.
3936
//!
4037
//! ----------------------------------------------------------------------------
4138
//!
@@ -58,10 +55,10 @@
5855
//! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
5956
//!
6057
//! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
61-
//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used
62-
//! internally by `measureme` to record additional metadata about the profiling session.
63-
//! After `METADATA_STRING_ID` are all other `StringId` values.
64-
//!
58+
//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
59+
//! which is used internally by `measureme` to record additional metadata about
60+
//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
61+
//! values.
6562
6663
use crate::file_header::{
6764
write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
@@ -84,7 +81,6 @@ impl StringId {
8481

8582
#[inline]
8683
pub fn new(id: u32) -> StringId {
87-
assert!(id <= MAX_STRING_ID);
8884
StringId(id)
8985
}
9086

@@ -106,23 +102,20 @@ impl StringId {
106102

107103
#[inline]
108104
pub fn from_addr(addr: Addr) -> StringId {
109-
let id = addr.0 + FIRST_REGULAR_STRING_ID;
105+
let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
110106
StringId::new(id)
111107
}
112108

113109
#[inline]
114110
pub fn to_addr(self) -> Addr {
115-
assert!(self.0 >= FIRST_REGULAR_STRING_ID);
116-
Addr(self.0 - FIRST_REGULAR_STRING_ID)
111+
Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
117112
}
118113
}
119114

120115
// See module-level documentation for more information on the encoding.
121116
pub const TERMINATOR: u8 = 0xFF;
122-
123-
// All 1s except for the two highest bits.
124-
pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF;
125-
pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF;
117+
pub const STRING_REF_TAG: u8 = 0xFE;
118+
pub const STRING_REF_ENCODED_SIZE: usize = 5;
126119

127120
/// The maximum id value a virtual string may be.
128121
const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
@@ -175,7 +168,7 @@ impl<'s> StringComponent<'s> {
175168
fn serialized_size(&self) -> usize {
176169
match *self {
177170
StringComponent::Value(s) => s.len(),
178-
StringComponent::Ref(_) => 4,
171+
StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
179172
}
180173
}
181174

@@ -187,11 +180,10 @@ impl<'s> StringComponent<'s> {
187180
&mut bytes[s.len()..]
188181
}
189182
StringComponent::Ref(string_id) => {
190-
assert!(string_id.0 == string_id.0 & STRING_ID_MASK);
191-
let tagged = string_id.0 | (1u32 << 31);
192-
193-
&mut bytes[0..4].copy_from_slice(&tagged.to_be_bytes());
194-
&mut bytes[4..]
183+
assert!(STRING_REF_ENCODED_SIZE == 5);
184+
bytes[0] = STRING_REF_TAG;
185+
&mut bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
186+
&mut bytes[5..]
195187
}
196188
}
197189
}

0 commit comments

Comments
 (0)