Merge pull request #90 from michaelwoerister/utf8-based-stringtable-encoding

wesleywiser · web-flow · commit 771697a77c64 · 2019-12-02T06:30:33.000-05:00
Use more compact encoding for the string table
diff --git a/analyzeme/Cargo.toml b/analyzeme/Cargo.toml
@@ -6,6 +6,8 @@ edition = "2018"
 license = "MIT OR Apache-2.0"
 
 [dependencies]
+byteorder = "1.2.7"
+memchr = "2"
 measureme = { path = "../measureme" }
 rustc-hash = "1.0.1"
 serde = { version = "1.0", features = [ "derive" ] }
diff --git a/analyzeme/src/lib.rs b/analyzeme/src/lib.rs
@@ -17,11 +17,13 @@ mod event;
 mod lightweight_event;
 mod profiling_data;
 mod stack_collapse;
-mod timestamp;
+mod stringtable;
 pub mod testing_common;
+mod timestamp;
 
 pub use crate::event::Event;
 pub use crate::lightweight_event::LightweightEvent;
 pub use crate::profiling_data::{ProfilingData, ProfilingDataBuilder};
 pub use crate::stack_collapse::collapse_stacks;
+pub use crate::stringtable::{StringRef, StringTable};
 pub use crate::timestamp::Timestamp;
diff --git a/analyzeme/src/profiling_data.rs b/analyzeme/src/profiling_data.rs
@@ -1,12 +1,13 @@
 use crate::event::Event;
 use crate::lightweight_event::LightweightEvent;
+use crate::StringTable;
 use crate::timestamp::Timestamp;
 use measureme::file_header::{
     read_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_HEADER_SIZE,
     FILE_MAGIC_EVENT_STREAM,
 };
 use measureme::ByteVecSink;
-use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTable, StringTableBuilder};
+use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTableBuilder};
 use serde::{Deserialize, Deserializer};
 use std::error::Error;
 use std::fs;
diff --git a/analyzeme/src/stringtable.rs b/analyzeme/src/stringtable.rs
@@ -0,0 +1,306 @@
+//! See the module-level documentation of `measureme::stringtable`.
+
+use byteorder::{BigEndian, ByteOrder, LittleEndian};
+use measureme::file_header::{
+    read_file_header, strip_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_MAGIC_STRINGTABLE_DATA,
+    FILE_MAGIC_STRINGTABLE_INDEX,
+};
+use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
+use measureme::{Addr, StringId};
+use rustc_hash::FxHashMap;
+use std::borrow::Cow;
+use std::error::Error;
+use memchr::memchr;
+
+// See module-level documentation for more information on the encoding.
+const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
+const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
+
+fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
+    (
+        StringId::reserved(LittleEndian::read_u32(&bytes[0..4])),
+        Addr(LittleEndian::read_u32(&bytes[4..8])),
+    )
+}
+
+#[derive(Copy, Clone)]
+pub struct StringRef<'st> {
+    id: StringId,
+    table: &'st StringTable,
+}
+
+impl<'st> StringRef<'st> {
+    pub fn to_string(&self) -> Cow<'st, str> {
+
+        // Try to avoid the allocation, which we can do if this is a
+        // [value, 0xFF] entry.
+        let addr = self.table.index[&self.id];
+        let pos = addr.as_usize();
+        let slice_to_search = &self.table.string_data[pos..];
+
+        // Find the first 0xFF byte, which is either the sequence
+        // terminator or a byte in the middle of a string id. Use `memchr`,
+        // which is super fast.
+        let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();
+
+        // Decode the bytes until the terminator. If there is a string id in
+        // between somewhere this will fail, and we fall back to the allocating
+        // path.
+        if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
+            Cow::from(s)
+        } else {
+            let mut output = String::new();
+            self.write_to_string(&mut output);
+            Cow::from(output)
+        }
+    }
+
+    pub fn write_to_string(&self, output: &mut String) {
+        let addr = self.table.index[&self.id];
+        let mut pos = addr.as_usize();
+
+        loop {
+            let byte = self.table.string_data[pos];
+
+            if byte == TERMINATOR {
+                return;
+            } else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
+                // This is a string-id
+                let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);
+
+                // Mask off the `0b10` prefix
+                let id = id & STRING_ID_MASK;
+
+                let string_ref = StringRef {
+                    id: StringId::reserved(id),
+                    table: self.table,
+                };
+
+                string_ref.write_to_string(output);
+
+                pos += 4;
+            } else {
+                while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
+                    output.push(c);
+                    pos += len;
+                }
+            }
+        }
+    }
+}
+
+// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
+// Returns the decoded `char` and its size in bytes if it succeeds.
+// Returns `None` if the first byte is not a valid UTF-8 leading byte (e.g. a
+// string-id byte or the terminator); panics if the decoded value is not a
+// valid `char`. See https://en.wikipedia.org/wiki/UTF-8 for details.
+fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
+    use std::convert::TryFrom;
+    let first_byte = bytes[0] as u32;
+    let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 {
+        // The highest bit is zero, so this is a single-byte char
+        (first_byte, 1)
+    } else if (first_byte & 0b1110_0000) == 0b1100_0000 {
+        // This is a two byte character
+        let bits0 = first_byte & 0b0001_1111;
+        let bits1 = (bytes[1] & 0b0011_1111) as u32;
+
+        (bits0 << 6 | bits1, 2)
+    } else if (first_byte & 0b1111_0000) == 0b1110_0000 {
+        // This is a three byte character
+        let bits0 = first_byte & 0b0000_1111;
+        let bits1 = (bytes[1] & 0b0011_1111) as u32;
+        let bits2 = (bytes[2] & 0b0011_1111) as u32;
+
+        ((bits0 << 12) | (bits1 << 6) | bits2, 3)
+    } else if (first_byte & 0b1111_1000) == 0b1111_0000 {
+        // This is a four byte character
+        let bits0 = first_byte & 0b0000_0111;
+        let bits1 = (bytes[1] & 0b0011_1111) as u32;
+        let bits2 = (bytes[2] & 0b0011_1111) as u32;
+        let bits3 = (bytes[3] & 0b0011_1111) as u32;
+
+        ((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4)
+    } else {
+        return None;
+    };
+
+    match char::try_from(codepoint) {
+        Ok(c) => {
+            debug_assert!({
+                let test_bytes = &mut [0u8; 8];
+                c.encode_utf8(test_bytes);
+                &test_bytes[..len] == &bytes[..len]
+            });
+
+            Some((c, len))
+        }
+        Err(e) => {
+            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
+        }
+    }
+}
+
+/// Read-only version of the string table
+#[derive(Debug)]
+pub struct StringTable {
+    // TODO: Replace with something lazy
+    string_data: Vec<u8>,
+    index: FxHashMap<StringId, Addr>,
+}
+
+impl StringTable {
+    pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> {
+        let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?;
+        let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?;
+
+        if string_data_format != index_data_format {
+            Err("Mismatch between StringTable DATA and INDEX format version")?;
+        }
+
+        if string_data_format != CURRENT_FILE_FORMAT_VERSION {
+            Err(format!(
+                "StringTable file format version '{}' is not supported
+                         by this version of `measureme`.",
+                string_data_format
+            ))?;
+        }
+
+        assert!(index_data.len() % 8 == 0);
+        let index: FxHashMap<_, _> = strip_file_header(&index_data)
+            .chunks(8)
+            .map(deserialize_index_entry)
+            .collect();
+
+        Ok(StringTable { string_data, index })
+    }
+
+    #[inline]
+    pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> {
+        StringRef { id, table: self }
+    }
+
+    pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
+        let id = StringId::reserved(METADATA_STRING_ID);
+        self.get(id)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use measureme::{ByteVecSink, StringComponent, StringTableBuilder};
+    use std::sync::Arc;
+
+    #[test]
+    fn simple_strings() {
+        let data_sink = Arc::new(ByteVecSink::new());
+        let index_sink = Arc::new(ByteVecSink::new());
+
+        let expected_strings = &[
+            "abc",
+            "",
+            "xyz",
+            "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
+            "",
+            "",
+            "g2h9284hgjv282y32983849&35g5y",
+        ];
+
+        let mut string_ids = vec![];
+
+        {
+            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());
+
+            for &s in expected_strings {
+                string_ids.push(builder.alloc(s));
+            }
+        }
+
+        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
+        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();
+
+        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();
+
+        for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
+            let str_ref = string_table.get(id);
+
+            assert_eq!(str_ref.to_string(), expected_string);
+
+            let mut write_to = String::new();
+            str_ref.write_to_string(&mut write_to);
+            assert_eq!(str_ref.to_string(), write_to);
+        }
+    }
+
+    #[test]
+    fn composite_string() {
+        let data_sink = Arc::new(ByteVecSink::new());
+        let index_sink = Arc::new(ByteVecSink::new());
+
+        let expected_strings = &[
+            "abc",                  // 0
+            "abcabc",               // 1
+            "abcabcabc",            // 2
+            "abcabcabc",            // 3
+            "abcabcabc",            // 4
+            "abcabcabcabc",         // 5
+            "xxabcabcuuuabcabcqqq", // 6
+            "xxxxxx",               // 7
+        ];
+
+        let mut string_ids = vec![];
+
+        {
+            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());
+
+            let r = |id| StringComponent::Ref(id);
+            let v = |s| StringComponent::Value(s);
+
+            string_ids.push(builder.alloc("abc")); // 0
+            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
+            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
+            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
+            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
+            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
+            string_ids.push(builder.alloc(&[
+                v("xx"),
+                r(string_ids[1]),
+                v("uuu"),
+                r(string_ids[1]),
+                v("qqq"),
+            ])); // 6
+        }
+
+        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
+        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();
+
+        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();
+
+        for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
+            let str_ref = string_table.get(id);
+
+            assert_eq!(str_ref.to_string(), expected_string);
+
+            let mut write_to = String::new();
+            str_ref.write_to_string(&mut write_to);
+            assert_eq!(str_ref.to_string(), write_to);
+        }
+    }
+
+    #[test]
+    fn utf8_char_decoding() {
+        use std::convert::TryFrom;
+
+        // Let's just test all possible codepoints because there are not that
+        // many actually.
+        for codepoint in 0..=0x10FFFFu32 {
+            if let Ok(expected_char) = char::try_from(codepoint) {
+                let buffer = &mut [0; 4];
+                let expected_len = expected_char.encode_utf8(buffer).len();
+                let expected = Some((expected_char, expected_len));
+                assert_eq!(expected, decode_utf8_char(&buffer[..]));
+            }
+        }
+    }
+}
diff --git a/measureme/src/file_header.rs b/measureme/src/file_header.rs
@@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
 use byteorder::{ByteOrder, LittleEndian};
 use std::error::Error;
 
-pub const CURRENT_FILE_FORMAT_VERSION: u32 = 2;
+pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
 pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
 pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
 pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";
diff --git a/measureme/src/lib.rs b/measureme/src/lib.rs
@@ -46,7 +46,7 @@ mod mmap_serialization_sink;
 mod profiler;
 mod raw_event;
 mod serialization;
-mod stringtable;
+pub mod stringtable;
 
 pub mod rustc;
 
@@ -57,6 +57,4 @@ pub use crate::mmap_serialization_sink::MmapSerializationSink;
 pub use crate::profiler::{Profiler, ProfilerFiles, TimingGuard};
 pub use crate::raw_event::{RawEvent, MAX_INSTANT_TIMESTAMP, MAX_INTERVAL_TIMESTAMP};
 pub use crate::serialization::{Addr, ByteVecSink, SerializationSink};
-pub use crate::stringtable::{
-    SerializableString, StringId, StringRef, StringTable, StringTableBuilder,
-};
+pub use crate::stringtable::{SerializableString, StringComponent, StringId, StringTableBuilder};
diff --git a/measureme/src/stringtable.rs b/measureme/src/stringtable.rs