Skip to content

Commit 771697a

Browse files
authored
Merge pull request #90 from michaelwoerister/utf8-based-stringtable-encoding
Use more compact encoding for the string table
2 parents 665b384 + 3bc5f30 commit 771697a

File tree

7 files changed

+448
-222
lines changed

7 files changed

+448
-222
lines changed

analyzeme/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ edition = "2018"
66
license = "MIT OR Apache-2.0"
77

88
[dependencies]
9+
byteorder = "1.2.7"
10+
memchr = "2"
911
measureme = { path = "../measureme" }
1012
rustc-hash = "1.0.1"
1113
serde = { version = "1.0", features = [ "derive" ] }

analyzeme/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ mod event;
1717
mod lightweight_event;
1818
mod profiling_data;
1919
mod stack_collapse;
20-
mod timestamp;
20+
mod stringtable;
2121
pub mod testing_common;
22+
mod timestamp;
2223

2324
pub use crate::event::Event;
2425
pub use crate::lightweight_event::LightweightEvent;
2526
pub use crate::profiling_data::{ProfilingData, ProfilingDataBuilder};
2627
pub use crate::stack_collapse::collapse_stacks;
28+
pub use crate::stringtable::{StringRef, StringTable};
2729
pub use crate::timestamp::Timestamp;

analyzeme/src/profiling_data.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
use crate::event::Event;
22
use crate::lightweight_event::LightweightEvent;
3+
use crate::StringTable;
34
use crate::timestamp::Timestamp;
45
use measureme::file_header::{
56
read_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_HEADER_SIZE,
67
FILE_MAGIC_EVENT_STREAM,
78
};
89
use measureme::ByteVecSink;
9-
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTable, StringTableBuilder};
10+
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTableBuilder};
1011
use serde::{Deserialize, Deserializer};
1112
use std::error::Error;
1213
use std::fs;

analyzeme/src/stringtable.rs

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
//! See module-level documentation `measureme::stringtable`.
2+
3+
use byteorder::{BigEndian, ByteOrder, LittleEndian};
4+
use measureme::file_header::{
5+
read_file_header, strip_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_MAGIC_STRINGTABLE_DATA,
6+
FILE_MAGIC_STRINGTABLE_INDEX,
7+
};
8+
use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
9+
use measureme::{Addr, StringId};
10+
use rustc_hash::FxHashMap;
11+
use std::borrow::Cow;
12+
use std::error::Error;
13+
use memchr::memchr;
14+
15+
// See module-level documentation for more information on the encoding.
16+
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
17+
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
18+
19+
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
20+
(
21+
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
22+
Addr(LittleEndian::read_u32(&bytes[4..8])),
23+
)
24+
}
25+
26+
#[derive(Copy, Clone)]
27+
pub struct StringRef<'st> {
28+
id: StringId,
29+
table: &'st StringTable,
30+
}
31+
32+
impl<'st> StringRef<'st> {
33+
pub fn to_string(&self) -> Cow<'st, str> {
34+
35+
// Try to avoid the allocation, which we can do if this is a
36+
// [value, 0xFF] entry.
37+
let addr = self.table.index[&self.id];
38+
let pos = addr.as_usize();
39+
let slice_to_search = &self.table.string_data[pos..];
40+
41+
// Find the first 0xFF byte which which is either the sequence
42+
// terminator or a byte in the middle of string id. Use `memchr` which
43+
// is super fast.
44+
let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();
45+
46+
// Decode the bytes until the terminator. If there is a string id in
47+
// between somewhere this will fail, and we fall back to the allocating
48+
// path.
49+
if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
50+
Cow::from(s)
51+
} else {
52+
let mut output = String::new();
53+
self.write_to_string(&mut output);
54+
Cow::from(output)
55+
}
56+
}
57+
58+
pub fn write_to_string(&self, output: &mut String) {
59+
let addr = self.table.index[&self.id];
60+
let mut pos = addr.as_usize();
61+
62+
loop {
63+
let byte = self.table.string_data[pos];
64+
65+
if byte == TERMINATOR {
66+
return;
67+
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
68+
// This is a string-id
69+
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);
70+
71+
// Mask off the `0b10` prefix
72+
let id = id & STRING_ID_MASK;
73+
74+
let string_ref = StringRef {
75+
id: StringId::reserved(id),
76+
table: self.table,
77+
};
78+
79+
string_ref.write_to_string(output);
80+
81+
pos += 4;
82+
} else {
83+
while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
84+
output.push(c);
85+
pos += len;
86+
}
87+
}
88+
}
89+
}
90+
}
91+
92+
// Attempts to read one UTF-8 encoded codepoint from the front of `bytes`.
// On success, yields the decoded `char` and the number of bytes it occupies.
// Yields `None` when the first byte cannot start a UTF-8 sequence, i.e. it is
// a continuation byte (`0b10xx_xxxx`) or has its five highest bits set.
// Panics if `bytes` is empty, if it is shorter than the length announced by
// the first byte, or if the decoded value is not a valid `char`. Continuation
// bytes are only validated by a debug assertion that re-encodes the result.
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
// encoding.
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
    use std::convert::TryFrom;

    // Extracts the six payload bits of the continuation byte at `index`.
    let payload = |index: usize| u32::from(bytes[index] & 0b0011_1111);

    let lead = u32::from(bytes[0]);

    let (codepoint, len) = match lead {
        // `0xxx_xxxx`: a single-byte character
        0b0000_0000..=0b0111_1111 => (lead, 1),
        // `110x_xxxx`: leading byte of a two-byte character
        0b1100_0000..=0b1101_1111 => {
            let high = lead & 0b0001_1111;
            let low = payload(1);

            ((high << 6) | low, 2)
        }
        // `1110_xxxx`: leading byte of a three-byte character
        0b1110_0000..=0b1110_1111 => {
            let high = lead & 0b0000_1111;
            let middle = payload(1);
            let low = payload(2);

            ((high << 12) | (middle << 6) | low, 3)
        }
        // `1111_0xxx`: leading byte of a four-byte character
        0b1111_0000..=0b1111_0111 => {
            let part0 = lead & 0b0000_0111;
            let part1 = payload(1);
            let part2 = payload(2);
            let part3 = payload(3);

            ((part0 << 18) | (part1 << 12) | (part2 << 6) | part3, 4)
        }
        _ => return None,
    };

    let decoded = match char::try_from(codepoint) {
        Ok(c) => c,
        Err(e) => {
            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
        }
    };

    // Re-encoding the result must reproduce exactly the bytes we consumed.
    debug_assert!({
        let mut reencoded = [0u8; 8];
        decoded.encode_utf8(&mut reencoded);
        &reencoded[..len] == &bytes[..len]
    });

    Some((decoded, len))
}
143+
144+
/// Read-only version of the string table
145+
#[derive(Debug)]
146+
pub struct StringTable {
147+
// TODO: Replace with something lazy
148+
string_data: Vec<u8>,
149+
index: FxHashMap<StringId, Addr>,
150+
}
151+
152+
impl StringTable {
153+
pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> {
154+
let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?;
155+
let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?;
156+
157+
if string_data_format != index_data_format {
158+
Err("Mismatch between StringTable DATA and INDEX format version")?;
159+
}
160+
161+
if string_data_format != CURRENT_FILE_FORMAT_VERSION {
162+
Err(format!(
163+
"StringTable file format version '{}' is not supported
164+
by this version of `measureme`.",
165+
string_data_format
166+
))?;
167+
}
168+
169+
assert!(index_data.len() % 8 == 0);
170+
let index: FxHashMap<_, _> = strip_file_header(&index_data)
171+
.chunks(8)
172+
.map(deserialize_index_entry)
173+
.collect();
174+
175+
Ok(StringTable { string_data, index })
176+
}
177+
178+
#[inline]
179+
pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> {
180+
StringRef { id, table: self }
181+
}
182+
183+
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
184+
let id = StringId::reserved(METADATA_STRING_ID);
185+
self.get(id)
186+
}
187+
}
188+
189+
#[cfg(test)]
mod tests {
    use super::*;
    use measureme::{ByteVecSink, StringComponent, StringTableBuilder};
    use std::sync::Arc;

    /// Checks that every id in `ids` decodes to the string at the same
    /// position in `expected`, through both `to_string` and
    /// `write_to_string`.
    ///
    /// The length check guards against `zip` silently skipping expectations
    /// that have no corresponding id.
    fn assert_table_contents(table: &StringTable, ids: &[StringId], expected: &[&str]) {
        assert_eq!(
            ids.len(),
            expected.len(),
            "number of allocated strings does not match number of expected strings"
        );

        for (&id, &expected_string) in ids.iter().zip(expected.iter()) {
            let str_ref = table.get(id);

            assert_eq!(str_ref.to_string(), expected_string);

            let mut write_to = String::new();
            str_ref.write_to_string(&mut write_to);
            assert_eq!(str_ref.to_string(), write_to);
        }
    }

    /// Plain strings (including empty ones) survive a round trip through the
    /// builder and the read-only table.
    #[test]
    fn simple_strings() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        let expected_strings = &[
            "abc",
            "",
            "xyz",
            "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
            "",
            "",
            "g2h9284hgjv282y32983849&35g5y",
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            for &s in expected_strings {
                string_ids.push(builder.alloc(s));
            }
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        assert_table_contents(&string_table, &string_ids, expected_strings);
    }

    /// Strings composed of references to other strings, optionally mixed
    /// with literal values, are resolved to their full contents.
    #[test]
    fn composite_string() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        let expected_strings = &[
            "abc",                  // 0
            "abcabc",               // 1
            "abcabcabc",            // 2
            "abcabcabc",            // 3
            "abcabcabc",            // 4
            "abcabcabcabc",         // 5
            "xxabcabcuuuabcabcqqq", // 6
            "xxxxxx",               // 7
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            let r = |id| StringComponent::Ref(id);
            let v = |s| StringComponent::Value(s);

            string_ids.push(builder.alloc("abc")); // 0
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
            string_ids.push(builder.alloc(&[
                v("xx"),
                r(string_ids[1]),
                v("uuu"),
                r(string_ids[1]),
                v("qqq"),
            ])); // 6
            // Entry 7 was listed in `expected_strings` but never allocated, so
            // it was never actually checked.
            string_ids.push(builder.alloc("xxxxxx")); // 7
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        assert_table_contents(&string_table, &string_ids, expected_strings);
    }

    /// `decode_utf8_char` agrees with the standard library's encoder for
    /// every valid `char`.
    #[test]
    fn utf8_char_decoding() {
        use std::convert::TryFrom;

        // Let's just test all possible codepoints because there are not that
        // many actually.
        for codepoint in 0..=0x10FFFFu32 {
            if let Ok(expected_char) = char::try_from(codepoint) {
                let buffer = &mut [0; 4];
                let expected_len = expected_char.encode_utf8(buffer).len();
                let expected = Some((expected_char, expected_len));
                assert_eq!(expected, decode_utf8_char(&buffer[..]));
            }
        }
    }
}

measureme/src/file_header.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
66
use byteorder::{ByteOrder, LittleEndian};
77
use std::error::Error;
88

9-
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 2;
9+
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
1010
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
1111
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
1212
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";

measureme/src/lib.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ mod mmap_serialization_sink;
4646
mod profiler;
4747
mod raw_event;
4848
mod serialization;
49-
mod stringtable;
49+
pub mod stringtable;
5050

5151
pub mod rustc;
5252

@@ -57,6 +57,4 @@ pub use crate::mmap_serialization_sink::MmapSerializationSink;
5757
pub use crate::profiler::{Profiler, ProfilerFiles, TimingGuard};
5858
pub use crate::raw_event::{RawEvent, MAX_INSTANT_TIMESTAMP, MAX_INTERVAL_TIMESTAMP};
5959
pub use crate::serialization::{Addr, ByteVecSink, SerializationSink};
60-
pub use crate::stringtable::{
61-
SerializableString, StringId, StringRef, StringTable, StringTableBuilder,
62-
};
60+
pub use crate::stringtable::{SerializableString, StringComponent, StringId, StringTableBuilder};

0 commit comments

Comments
 (0)