Skip to content

Commit 76cf3ea

Browse files
Move reading part of StringTable to analyzeme crate.
1 parent 06419b3 commit 76cf3ea

File tree

6 files changed

+294
-283
lines changed

6 files changed

+294
-283
lines changed

analyzeme/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ edition = "2018"
66
license = "MIT OR Apache-2.0"
77

88
[dependencies]
9+
byteorder = "1.2.7"
910
measureme = { path = "../measureme" }
1011
rustc-hash = "1.0.1"
1112
serde = { version = "1.0", features = [ "derive" ] }

analyzeme/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ mod event;
1717
mod lightweight_event;
1818
mod profiling_data;
1919
mod stack_collapse;
20-
mod timestamp;
20+
mod stringtable;
2121
pub mod testing_common;
22+
mod timestamp;
2223

2324
pub use crate::event::Event;
2425
pub use crate::lightweight_event::LightweightEvent;
2526
pub use crate::profiling_data::{ProfilingData, ProfilingDataBuilder};
2627
pub use crate::stack_collapse::collapse_stacks;
28+
pub use crate::stringtable::{StringRef, StringTable};
2729
pub use crate::timestamp::Timestamp;

analyzeme/src/profiling_data.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
use crate::event::Event;
22
use crate::lightweight_event::LightweightEvent;
3+
use crate::StringTable;
34
use crate::timestamp::Timestamp;
45
use measureme::file_header::{
56
read_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_HEADER_SIZE,
67
FILE_MAGIC_EVENT_STREAM,
78
};
89
use measureme::ByteVecSink;
9-
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTable, StringTableBuilder};
10+
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTableBuilder};
1011
use serde::{Deserialize, Deserializer};
1112
use std::error::Error;
1213
use std::fs;

analyzeme/src/stringtable.rs

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
//! See the module-level documentation of `measureme::stringtable`.
2+
3+
use byteorder::{BigEndian, ByteOrder, LittleEndian};
4+
use measureme::file_header::{
5+
read_file_header, strip_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_MAGIC_STRINGTABLE_DATA,
6+
FILE_MAGIC_STRINGTABLE_INDEX,
7+
};
8+
use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
9+
use measureme::{Addr, StringId};
10+
use rustc_hash::FxHashMap;
11+
use std::borrow::Cow;
12+
use std::error::Error;
13+
14+
// A byte whose two most significant bits are `0b10` is a UTF-8 continuation
// byte and therefore can never be the first byte of a codepoint.
// See module-level documentation for more information on the encoding.

/// Selects the two most significant bits of a byte.
const UTF8_CONTINUATION_MASK: u8 = 0b11 << 6;
/// Value of the two most significant bits in a UTF-8 continuation byte.
const UTF8_CONTINUATION_BYTE: u8 = 0b10 << 6;
17+
18+
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
19+
(
20+
StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
21+
Addr(LittleEndian::read_u32(&bytes[4..8])),
22+
)
23+
}
24+
25+
#[derive(Copy, Clone)]
26+
pub struct StringRef<'st> {
27+
id: StringId,
28+
table: &'st StringTable,
29+
}
30+
31+
impl<'st> StringRef<'st> {
32+
pub fn to_string(&self) -> Cow<'st, str> {
33+
let mut output = String::new();
34+
self.write_to_string(&mut output);
35+
Cow::from(output)
36+
}
37+
38+
pub fn write_to_string(&self, output: &mut String) {
39+
let addr = self.table.index[&self.id];
40+
let mut pos = addr.as_usize();
41+
42+
loop {
43+
let byte = self.table.string_data[pos];
44+
45+
if byte == TERMINATOR {
46+
return;
47+
} else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
48+
// This is a string-id
49+
let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);
50+
51+
// Mask off the `0b10` prefix
52+
let id = id & STRING_ID_MASK;
53+
54+
let string_ref = StringRef {
55+
id: StringId::reserved(id),
56+
table: self.table,
57+
};
58+
59+
string_ref.write_to_string(output);
60+
61+
pos += 4;
62+
} else {
63+
while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
64+
output.push(c);
65+
pos += len;
66+
}
67+
}
68+
}
69+
}
70+
}
71+
72+
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
// Returns the decoded `char` and its size in bytes if it succeeds.
// Returns `None` if `bytes` is empty or if its first byte cannot be the first
// byte of a UTF-8 codepoint (a continuation byte `0b10xx_xxxx` or a byte of
// the form `0b1111_1xxx`).
// Panics if the first byte announces a multi-byte codepoint but the bytes
// that follow are missing or do not form valid UTF-8. That way corrupted
// text is rejected in release builds as well instead of being decoded into
// an arbitrary `char`.
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
// encoding.
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
    let first_byte = *bytes.first()?;

    // The high bits of the first byte determine the length of the encoding.
    let len = if (first_byte & 0b1000_0000) == 0 {
        // The highest bit is zero, so this is a single-byte char
        1
    } else if (first_byte & 0b1110_0000) == 0b1100_0000 {
        // This is a two byte character
        2
    } else if (first_byte & 0b1111_0000) == 0b1110_0000 {
        // This is a three byte character
        3
    } else if (first_byte & 0b1111_1000) == 0b1111_0000 {
        // This is a four byte character
        4
    } else {
        return None;
    };

    let encoded = match bytes.get(..len) {
        Some(encoded) => encoded,
        None => panic!(
            "StringTable: Encountered truncated UTF8 char: expected {} bytes, found {}",
            len,
            bytes.len()
        ),
    };

    // `from_utf8` checks the continuation bytes and rejects overlong
    // encodings, surrogates and codepoints beyond U+10FFFF.
    match std::str::from_utf8(encoded) {
        // `encoded` is non-empty and valid, so it holds exactly one `char`.
        Ok(s) => s.chars().next().map(|c| (c, len)),
        Err(e) => {
            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
        }
    }
}
123+
124+
/// Read-only version of the string table
125+
#[derive(Debug)]
126+
pub struct StringTable {
127+
// TODO: Replace with something lazy
128+
string_data: Vec<u8>,
129+
index: FxHashMap<StringId, Addr>,
130+
}
131+
132+
impl StringTable {
133+
pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> {
134+
let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?;
135+
let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?;
136+
137+
if string_data_format != index_data_format {
138+
Err("Mismatch between StringTable DATA and INDEX format version")?;
139+
}
140+
141+
if string_data_format != CURRENT_FILE_FORMAT_VERSION {
142+
Err(format!(
143+
"StringTable file format version '{}' is not supported
144+
by this version of `measureme`.",
145+
string_data_format
146+
))?;
147+
}
148+
149+
assert!(index_data.len() % 8 == 0);
150+
let index: FxHashMap<_, _> = strip_file_header(&index_data)
151+
.chunks(8)
152+
.map(deserialize_index_entry)
153+
.collect();
154+
155+
Ok(StringTable { string_data, index })
156+
}
157+
158+
#[inline]
159+
pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> {
160+
StringRef { id, table: self }
161+
}
162+
163+
pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
164+
let id = StringId::reserved(METADATA_STRING_ID);
165+
self.get(id)
166+
}
167+
}
168+
169+
#[cfg(test)]
mod tests {
    use super::*;
    use measureme::{ByteVecSink, StringComponent, StringTableBuilder};
    use std::sync::Arc;

    // Writes plain strings with a `StringTableBuilder` and checks that
    // reading them back through `StringTable` yields the same strings.
    #[test]
    fn simple_strings() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        let expected_strings = &[
            "abc",
            "",
            "xyz",
            "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
            "",
            "",
            "g2h9284hgjv282y32983849&35g5y",
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            for &s in expected_strings {
                string_ids.push(builder.alloc(s));
            }
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        // `zip` stops at the shorter side, so make sure nothing is skipped.
        assert_eq!(string_ids.len(), expected_strings.len());

        for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
            let str_ref = string_table.get(id);

            assert_eq!(str_ref.to_string(), expected_string);

            let mut write_to = String::new();
            str_ref.write_to_string(&mut write_to);
            assert_eq!(str_ref.to_string(), write_to);
        }
    }

    // Writes strings composed of literal values and references to earlier
    // strings and checks that reading them back expands the references.
    #[test]
    fn composite_string() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        let expected_strings = &[
            "abc",                  // 0
            "abcabc",               // 1
            "abcabcabc",            // 2
            "abcabcabc",            // 3
            "abcabcabc",            // 4
            "abcabcabcabc",         // 5
            "xxabcabcuuuabcabcqqq", // 6
            "xxxxxx",               // 7
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            let r = |id| StringComponent::Ref(id);
            let v = |s| StringComponent::Value(s);

            string_ids.push(builder.alloc("abc")); // 0
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
            string_ids.push(builder.alloc(&[
                v("xx"),
                r(string_ids[1]),
                v("uuu"),
                r(string_ids[1]),
                v("qqq"),
            ])); // 6
            // A composite made of values only. Without this allocation the
            // last expected string would never be compared.
            string_ids.push(builder.alloc(&[v("xx"), v("xx"), v("xx")])); // 7
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        // `zip` stops at the shorter side, so make sure nothing is skipped.
        assert_eq!(string_ids.len(), expected_strings.len());

        for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
            let str_ref = string_table.get(id);

            assert_eq!(str_ref.to_string(), expected_string);

            let mut write_to = String::new();
            str_ref.write_to_string(&mut write_to);
            assert_eq!(str_ref.to_string(), write_to);
        }
    }

    // Checks `decode_utf8_char` against characters whose UTF-8 encoding is
    // one, two, three and four bytes long.
    #[test]
    fn utf8_char_decoding() {
        let chars = vec![('\0', 1), ('a', 1), ('Ω', 2), ('Ꜵ', 3), ('𝔉', 4)];

        for (c, len) in chars {
            let buffer = &mut [0; 4];
            c.encode_utf8(buffer);
            assert_eq!(Some((c, len)), decode_utf8_char(&buffer[..]));
        }
    }
}

measureme/src/lib.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ mod mmap_serialization_sink;
4646
mod profiler;
4747
mod raw_event;
4848
mod serialization;
49-
mod stringtable;
49+
pub mod stringtable;
5050

5151
pub mod rustc;
5252

@@ -57,6 +57,4 @@ pub use crate::mmap_serialization_sink::MmapSerializationSink;
5757
pub use crate::profiler::{Profiler, ProfilerFiles, TimingGuard};
5858
pub use crate::raw_event::{RawEvent, MAX_INSTANT_TIMESTAMP, MAX_INTERVAL_TIMESTAMP};
5959
pub use crate::serialization::{Addr, ByteVecSink, SerializationSink};
60-
pub use crate::stringtable::{
61-
SerializableString, StringId, StringRef, StringTable, StringTableBuilder,
62-
};
60+
pub use crate::stringtable::{SerializableString, StringComponent, StringId, StringTableBuilder};

0 commit comments

Comments
 (0)