Skip to content

Commit 47ca752

Browse files
StringTable: Replace the concept of reserved StringIds with the concept of virtual StringIds.
With this commit, only "virtual" StringIds get an entry in the index table, and regular StringIds store an actual address instead of an index table key. That makes the index data a lot smaller and removes the need to do a table lookup for regular StringIds.
1 parent e1127aa commit 47ca752

File tree

6 files changed

+218
-97
lines changed

6 files changed

+218
-97
lines changed

analyzeme/src/stringtable.rs

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,9 @@ use std::borrow::Cow;
1212
use std::error::Error;
1313
use memchr::memchr;
1414

15-
// See module-level documentation for more information on the encoding.
16-
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
17-
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
18-
1915
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
2016
(
21-
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
17+
StringId::new(LittleEndian::read_u32(&bytes[0..4])),
2218
Addr(LittleEndian::read_u32(&bytes[4..8])),
2319
)
2420
}
@@ -29,12 +25,29 @@ pub struct StringRef<'st> {
2925
table: &'st StringTable,
3026
}
3127

28+
// This is the text we emit when encountering a virtual string ID that cannot
29+
// be resolved.
30+
const UNKNOWN_STRING: &str = "<unknown>";
31+
3232
impl<'st> StringRef<'st> {
33+
34+
/// Expands the StringRef into an actual string. This method will
35+
/// avoid allocating a `String` if it can instead return a `&str` pointing
36+
/// into the raw string table data.
3337
pub fn to_string(&self) -> Cow<'st, str> {
3438

35-
// Try to avoid the allocation, which we can do if this is a
36-
// [value, 0xFF] entry.
37-
let addr = self.table.index[&self.id];
39+
let addr = match self.get_addr() {
40+
Ok(addr) => addr,
41+
Err(_) => {
42+
return Cow::from(UNKNOWN_STRING)
43+
}
44+
};
45+
46+
// Try to avoid the allocation, which we can do if this is
47+
//
48+
// - a string with a single value component (`[value, 0xFF]`) or
49+
// - a string with a single reference component (`[string_id, 0xFF]`)
50+
3851
let pos = addr.as_usize();
3952
let slice_to_search = &self.table.string_data[pos..];
4053

@@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
4356
// is super fast.
4457
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();
4558

59+
// Check if this is a string containing a single StringId component
60+
let first_byte = self.table.string_data[pos];
61+
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
62+
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
63+
let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]);
64+
return StringRef {
65+
id,
66+
table: self.table,
67+
}.to_string();
68+
}
69+
4670
// Decode the bytes until the terminator. If there is a string id in
4771
// between somewhere this will fail, and we fall back to the allocating
4872
// path.
4973
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
5074
Cow::from(s)
5175
} else {
76+
// This is the slow path where we actually allocate a `String` on
77+
// the heap and expand into that. If you suspect that there is a
78+
// bug in the fast path above, you can easily check if always taking
79+
// the slow path fixes the issue.
5280
let mut output = String::new();
5381
self.write_to_string(&mut output);
5482
Cow::from(output)
5583
}
5684
}
5785

5886
pub fn write_to_string(&self, output: &mut String) {
59-
let addr = self.table.index[&self.id];
87+
88+
let addr = match self.get_addr() {
89+
Ok(addr) => addr,
90+
Err(_) => {
91+
output.push_str(UNKNOWN_STRING);
92+
return
93+
}
94+
};
95+
6096
let mut pos = addr.as_usize();
6197

6298
loop {
6399
let byte = self.table.string_data[pos];
64100

65101
if byte == TERMINATOR {
66102
return;
67-
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
68-
// This is a string-id
69-
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);
70-
71-
// Mask off the `0b10` prefix
72-
let id = id & STRING_ID_MASK;
73-
103+
} else if is_utf8_continuation_byte(byte) {
74104
let string_ref = StringRef {
75-
id: StringId::reserved(id),
105+
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
76106
table: self.table,
77107
};
78108

@@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
87117
}
88118
}
89119
}
120+
121+
fn get_addr(&self) -> Result<Addr, ()> {
122+
if self.id.is_virtual() {
123+
match self.table.index.get(&self.id) {
124+
Some(&addr) => Ok(addr),
125+
None => Err(()),
126+
}
127+
} else {
128+
Ok(self.id.to_addr())
129+
}
130+
}
131+
}
132+
133+
fn is_utf8_continuation_byte(byte: u8) -> bool {
134+
// See module-level documentation for more information on the encoding.
135+
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
136+
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
137+
(byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
138+
}
139+
140+
// String IDs in the table data are encoded in big endian format, while string
141+
// IDs in the index are encoded in little endian format. Don't mix the two up.
142+
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
143+
let id = BigEndian::read_u32(&bytes[0..4]);
144+
// Mask off the `0b10` prefix
145+
StringId::new(id & STRING_ID_MASK)
90146
}
91147

92148
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
@@ -181,7 +237,7 @@ impl StringTable {
181237
}
182238

183239
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
184-
let id = StringId::reserved(METADATA_STRING_ID);
240+
let id = StringId::new(METADATA_STRING_ID);
185241
self.get(id)
186242
}
187243
}

analyzeme/src/testing_common.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,14 @@ fn generate_profiling_data<S: SerializationSink>(
2626
) -> Vec<Event<'static>> {
2727
let profiler = Arc::new(Profiler::<S>::new(Path::new(filestem)).unwrap());
2828

29-
let event_id_reserved = StringId::reserved(42);
29+
let event_id_virtual = StringId::new_virtual(42);
3030

3131
let event_ids = vec![
3232
(
3333
profiler.alloc_string("Generic"),
3434
profiler.alloc_string("SomeGenericActivity"),
3535
),
36-
(profiler.alloc_string("Query"), event_id_reserved),
36+
(profiler.alloc_string("Query"), event_id_virtual),
3737
];
3838

3939
// This and event_ids have to match!
@@ -73,7 +73,10 @@ fn generate_profiling_data<S: SerializationSink>(
7373

7474
// An example of allocating the string contents of an event id that has
7575
// already been used
76-
profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery");
76+
profiler.map_virtual_to_concrete_string(
77+
event_id_virtual,
78+
profiler.alloc_string("SomeQuery")
79+
);
7780

7881
expected_events
7982
}

measureme/src/file_header.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
66
use byteorder::{ByteOrder, LittleEndian};
77
use std::error::Error;
88

9-
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
9+
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4;
1010
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
1111
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
1212
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";

measureme/src/profiler.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,21 @@ impl<S: SerializationSink> Profiler<S> {
6868
}
6969

7070
#[inline(always)]
71-
pub fn alloc_string_with_reserved_id<STR: SerializableString + ?Sized>(
71+
pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
72+
self.string_table
73+
.map_virtual_to_concrete_string(virtual_id, concrete_id);
74+
}
75+
76+
#[inline(always)]
77+
pub fn bulk_map_virtual_to_single_concrete_string<I>(
7278
&self,
73-
id: StringId,
74-
s: &STR,
75-
) -> StringId {
76-
self.string_table.alloc_with_reserved_id(id, s)
79+
virtual_ids: I,
80+
concrete_id: StringId,
81+
) where
82+
I: Iterator<Item = StringId> + ExactSizeIterator,
83+
{
84+
self.string_table
85+
.bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id);
7786
}
7887

7988
#[inline(always)]
@@ -92,6 +101,7 @@ impl<S: SerializationSink> Profiler<S> {
92101

93102
/// Creates a "start" event and returns a `TimingGuard` that will create
94103
/// the corresponding "end" event when it is dropped.
104+
#[inline]
95105
pub fn start_recording_interval_event<'a>(
96106
&'a self,
97107
event_kind: StringId,

0 commit comments

Comments
 (0)