Use 5-byte encoding for string-refs so we can use the full 32-bit address space.

michaelwoerister · michaelwoerister · commit ea03582cc329 · 2020-10-01T15:00:13.000+02:00
diff --git a/analyzeme/src/stringtable.rs b/analyzeme/src/stringtable.rs
@@ -1,10 +1,14 @@
 //! See module-level documentation `measureme::stringtable`.
 
-use measureme::file_header::{
-    strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
-    FILE_MAGIC_STRINGTABLE_INDEX,
+use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
+use measureme::{
+    file_header::{
+        strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
+        FILE_MAGIC_STRINGTABLE_INDEX,
+    },
+    stringtable::STRING_REF_ENCODED_SIZE,
+    stringtable::STRING_REF_TAG,
 };
-use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
 use measureme::{Addr, StringId};
 use memchr::memchr;
 use rustc_hash::FxHashMap;
@@ -55,9 +59,8 @@ impl<'st> StringRef<'st> {
 
         // Check if this is a string containing a single StringId component
         let first_byte = self.table.string_data[pos];
-        const STRING_ID_SIZE: usize = std::mem::size_of::<StringId>();
-        if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte(first_byte) {
-            let id = decode_string_id_from_data(&self.table.string_data[pos..pos + STRING_ID_SIZE]);
+        if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
+            let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
             return StringRef {
                 id,
                 table: self.table,
@@ -97,15 +100,15 @@ impl<'st> StringRef<'st> {
 
             if byte == TERMINATOR {
                 return;
-            } else if is_utf8_continuation_byte(byte) {
+            } else if byte == STRING_REF_TAG {
                 let string_ref = StringRef {
-                    id: decode_string_id_from_data(&self.table.string_data[pos..pos + 4]),
+                    id: decode_string_ref_from_data(&self.table.string_data[pos..]),
                     table: self.table,
                 };
 
                 string_ref.write_to_string(output);
 
-                pos += 4;
+                pos += STRING_REF_ENCODED_SIZE;
             } else {
                 while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
                     output.push(c);
@@ -129,19 +132,13 @@ impl<'st> StringRef<'st> {
     }
 }
 
-fn is_utf8_continuation_byte(byte: u8) -> bool {
-    // See module-level documentation for more information on the encoding.
-    const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
-    const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;
-    (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE
-}
-
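-// String IDs in the table data are encoded in big endian format, while string
-// IDs in the index are encoded in little endian format. Don't mix the two up.
+// String IDs in the table data follow a `STRING_REF_TAG` byte and are encoded
+// in little endian format, the same byte order as string IDs in the index.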
-fn decode_string_id_from_data(bytes: &[u8]) -> StringId {
-    let id = u32::from_be_bytes(bytes[0..4].try_into().unwrap());
-    // Mask off the `0b10` prefix
-    StringId::new(id & STRING_ID_MASK)
+fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
+    assert!(bytes[0] == STRING_REF_TAG);
+    assert!(STRING_REF_ENCODED_SIZE == 5);
+    let id = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
+    StringId::new(id)
 }
 
 // Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
diff --git a/measureme/src/stringtable.rs b/measureme/src/stringtable.rs
@@ -15,27 +15,24 @@
 //! The byte-level encoding of component lists uses the structure of UTF-8 in
 //! order to save space:
 //!
-//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit
-//!   prefix is reserved for bytes in the middle of a UTF-8 codepoint byte
-//!   sequence. We make use of this fact by letting all string ID components
-//!   start with this `10` prefix. Thus when we parse the contents of a value
-//!   we know to stop if the start byte of the next codepoint has this prefix.
+//! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
+//!   of this fact by letting all string ID components start with this `0xFE`
+//!   prefix. Thus when we parse the contents of a value we know to stop if
+//!   we encounter this byte.
 //!
-//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs
-//!   start with `10` as described above, they also cannot start with a `0xFF`
-//!   byte. Thus we can safely use `0xFF` as our component list terminator.
+//! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
+//!   use `0xFF` as our component list terminator.
 //!
 //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
 //! encoded as:
 //!
 //! ```ignore
-//!     ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255]
-//!                      ^^^^^^^^^^^^^                 ^^^
-//!              string ID 42 with 0b10 prefix        terminator (0xFF)
+//!     ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
+//!                      ^^^^^^^^^^^^^^^^                 ^^^
+//!                string ID 42 with 0xFE prefix    terminator (0xFF)
 //! ```
 //!
-//! As you can see string IDs are encoded in big endian format so that highest
-//! order bits show up in the first byte we encounter.
+//! As you can see, string IDs are encoded in little endian format.
 //!
 //! ----------------------------------------------------------------------------
 //!
@@ -58,10 +55,10 @@
 //! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
 //!
 //! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
-//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used
-//! internally by `measureme` to record additional metadata about the profiling session.
-//! After `METADATA_STRING_ID` are all other `StringId` values.
-//!
+//! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
+//! which is used internally by `measureme` to record additional metadata about
+//! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
+//! values.
 
 use crate::file_header::{
     write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
@@ -84,7 +81,6 @@ impl StringId {
 
     #[inline]
     pub fn new(id: u32) -> StringId {
-        assert!(id <= MAX_STRING_ID);
         StringId(id)
     }
 
@@ -106,23 +102,20 @@ impl StringId {
 
     #[inline]
     pub fn from_addr(addr: Addr) -> StringId {
-        let id = addr.0 + FIRST_REGULAR_STRING_ID;
+        let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
         StringId::new(id)
     }
 
     #[inline]
     pub fn to_addr(self) -> Addr {
-        assert!(self.0 >= FIRST_REGULAR_STRING_ID);
-        Addr(self.0 - FIRST_REGULAR_STRING_ID)
+        Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
     }
 }
 
 // See module-level documentation for more information on the encoding.
 pub const TERMINATOR: u8 = 0xFF;
-
-// All 1s except for the two highest bits.
-pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF;
-pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF;
+pub const STRING_REF_TAG: u8 = 0xFE;
+pub const STRING_REF_ENCODED_SIZE: usize = 5;
 
 /// The maximum id value a virtual string may be.
 const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
@@ -175,7 +168,7 @@ impl<'s> StringComponent<'s> {
     fn serialized_size(&self) -> usize {
         match *self {
             StringComponent::Value(s) => s.len(),
-            StringComponent::Ref(_) => 4,
+            StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
         }
     }
 
@@ -187,11 +180,10 @@ impl<'s> StringComponent<'s> {
                 &mut bytes[s.len()..]
             }
             StringComponent::Ref(string_id) => {
-                assert!(string_id.0 == string_id.0 & STRING_ID_MASK);
-                let tagged = string_id.0 | (1u32 << 31);
-
-                &mut bytes[0..4].copy_from_slice(&tagged.to_be_bytes());
-                &mut bytes[4..]
+                assert!(STRING_REF_ENCODED_SIZE == 5);
+                bytes[0] = STRING_REF_TAG;
+                &mut bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
+                &mut bytes[5..]
             }
         }
     }