Skip to content

Commit 8d2d4fd

Browse files
authored
Merge pull request #98 from michaelwoerister/virtual-instead-of-reserved-string-ids
Virtual instead of reserved string ids
2 parents (e1127aa + 47ca752), commit 8d2d4fd

File tree

6 files changed

+218
-97
lines changed

6 files changed

+218
-97
lines changed

analyzeme/src/stringtable.rs

Lines changed: 74 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,9 @@ use std::borrow::Cow;
1212
use std::error::Error;
1313
use memchr::memchr;
1414

15-
// See module-level documentation for more information on the encoding.
16-
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
17-
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
18-
1915
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
2016
(
21-
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
17+
StringId::new(LittleEndian::read_u32(&bytes[0..4])),
2218
Addr(LittleEndian::read_u32(&bytes[4..8])),
2319
)
2420
}
@@ -29,12 +25,29 @@ pub struct StringRef<'st> {
2925
table: &'st StringTable,
3026
}
3127

28+
// This is the text we emit when encountering a virtual string ID that cannot
29+
// be resolved.
30+
const UNKNOWN_STRING: &str = "<unknown>";
31+
3232
impl<'st> StringRef<'st> {
33+
34+
/// Expands the StringRef into an actual string. This method will
35+
/// avoid allocating a `String` if it can instead return a `&str` pointing
36+
/// into the raw string table data.
3337
pub fn to_string(&self) -> Cow<'st, str> {
3438

35-
// Try to avoid the allocation, which we can do if this is a
36-
// [value, 0xFF] entry.
37-
let addr = self.table.index[&self.id];
39+
let addr = match self.get_addr() {
40+
Ok(addr) => addr,
41+
Err(_) => {
42+
return Cow::from(UNKNOWN_STRING)
43+
}
44+
};
45+
46+
// Try to avoid the allocation, which we can do if this is
47+
//
48+
// - a string with a single value component (`[value, 0xFF]`) or
49+
// - a string with a single reference component (`[string_id, 0xFF]`)
50+
3851
let pos = addr.as_usize();
3952
let slice_to_search = &self.table.string_data[pos..];
4053

@@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
4356
// is super fast.
4457
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();
4558

59+
// Check if this is a string containing a single StringId component
60+
let first_byte = self.table.string_data[pos];
61+
const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
62+
if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
63+
let id = decode_string_id_from_data(&self.table.string_data[pos..pos+STRING_ID_SIZE]);
64+
return StringRef {
65+
id,
66+
table: self.table,
67+
}.to_string();
68+
}
69+
4670
// Decode the bytes until the terminator. If there is a string id in
4771
// between somewhere this will fail, and we fall back to the allocating
4872
// path.
4973
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
5074
Cow::from(s)
5175
} else {
76+
// This is the slow path where we actually allocate a `String` on
77+
// the heap and expand into that. If you suspect that there is a
78+
// bug in the fast path above, you can easily check if always taking
79+
// the slow path fixes the issue.
5280
let mut output = String::new();
5381
self.write_to_string(&mut output);
5482
Cow::from(output)
5583
}
5684
}
5785

5886
pub fn write_to_string(&self, output: &mut String) {
59-
let addr = self.table.index[&self.id];
87+
88+
let addr = match self.get_addr() {
89+
Ok(addr) => addr,
90+
Err(_) => {
91+
output.push_str(UNKNOWN_STRING);
92+
return
93+
}
94+
};
95+
6096
let mut pos = addr.as_usize();
6197

6298
loop {
6399
let byte = self.table.string_data[pos];
64100

65101
if byte == TERMINATOR {
66102
return;
67-
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
68-
// This is a string-id
69-
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);
70-
71-
// Mask off the `0b10` prefix
72-
let id = id & STRING_ID_MASK;
73-
103+
} else if is_utf8_continuation_byte(byte) {
74104
let string_ref = StringRef {
75-
id: StringId::reserved(id),
105+
id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
76106
table: self.table,
77107
};
78108

@@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
87117
}
88118
}
89119
}
120+
121+
fn get_addr(&self) -> Result<Addr, ()> {
122+
if self.id.is_virtual() {
123+
match self.table.index.get(&self.id) {
124+
Some(&addr) => Ok(addr),
125+
None => Err(()),
126+
}
127+
} else {
128+
Ok(self.id.to_addr())
129+
}
130+
}
131+
}
132+
133+
fn is_utf8_continuation_byte(byte: u8) -> bool {
134+
// See module-level documentation for more information on the encoding.
135+
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
136+
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
137+
(byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
138+
}
139+
140+
// String IDs in the table data are encoded in big endian format, while string
141+
// IDs in the index are encoded in little endian format. Don't mix the two up.
142+
fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
143+
let id = BigEndian::read_u32(&bytes[0..4]);
144+
// Mask off the `0b10` prefix
145+
StringId::new(id & STRING_ID_MASK)
90146
}
91147

92148
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
@@ -181,7 +237,7 @@ impl StringTable {
181237
}
182238

183239
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
184-
let id = StringId::reserved(METADATA_STRING_ID);
240+
let id = StringId::new(METADATA_STRING_ID);
185241
self.get(id)
186242
}
187243
}

analyzeme/src/testing_common.rs

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,14 +26,14 @@ fn generate_profiling_data<S: SerializationSink>(
2626
) -> Vec<Event<'static>> {
2727
let profiler = Arc::new(Profiler::<S>::new(Path::new(filestem)).unwrap());
2828

29-
let event_id_reserved = StringId::reserved(42);
29+
let event_id_virtual = StringId::new_virtual(42);
3030

3131
let event_ids = vec![
3232
(
3333
profiler.alloc_string("Generic"),
3434
profiler.alloc_string("SomeGenericActivity"),
3535
),
36-
(profiler.alloc_string("Query"), event_id_reserved),
36+
(profiler.alloc_string("Query"), event_id_virtual),
3737
];
3838

3939
// This and event_ids have to match!
@@ -73,7 +73,10 @@ fn generate_profiling_data<S: SerializationSink>(
7373

7474
// An example of allocating the string contents of an event id that has
7575
// already been used
76-
profiler.alloc_string_with_reserved_id(event_id_reserved, "SomeQuery");
76+
profiler.map_virtual_to_concrete_string(
77+
event_id_virtual,
78+
profiler.alloc_string("SomeQuery")
79+
);
7780

7881
expected_events
7982
}

measureme/src/file_header.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
66
use byteorder::{ByteOrder, LittleEndian};
77
use std::error::Error;
88

9-
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
9+
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 4;
1010
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
1111
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
1212
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";

measureme/src/profiler.rs

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -68,12 +68,21 @@ impl<S: SerializationSink> Profiler<S> {
6868
}
6969

7070
#[inline(always)]
71-
pub fn alloc_string_with_reserved_id<STR: SerializableString + ?Sized>(
71+
pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
72+
self.string_table
73+
.map_virtual_to_concrete_string(virtual_id, concrete_id);
74+
}
75+
76+
#[inline(always)]
77+
pub fn bulk_map_virtual_to_single_concrete_string<I>(
7278
&self,
73-
id: StringId,
74-
s: &STR,
75-
) -> StringId {
76-
self.string_table.alloc_with_reserved_id(id, s)
79+
virtual_ids: I,
80+
concrete_id: StringId,
81+
) where
82+
I: Iterator<Item = StringId> + ExactSizeIterator,
83+
{
84+
self.string_table
85+
.bulk_map_virtual_to_single_concrete_string(virtual_ids, concrete_id);
7786
}
7887

7988
#[inline(always)]
@@ -92,6 +101,7 @@ impl<S: SerializationSink> Profiler<S> {
92101

93102
/// Creates a "start" event and returns a `TimingGuard` that will create
94103
/// the corresponding "end" event when it is dropped.
104+
#[inline]
95105
pub fn start_recording_interval_event<'a>(
96106
&'a self,
97107
event_kind: StringId,

0 commit comments

Comments
 (0)