@@ -12,13 +12,9 @@ use std::borrow::Cow;
12
12
use std:: error:: Error ;
13
13
use memchr:: memchr;
14
14
15
- // See module-level documentation for more information on the encoding.
16
- const UTF8_CONTINUATION_MASK : u8 = 0b1100_0000 ;
17
- const UTF8_CONTINUATION_BYTE : u8 = 0b1000_0000 ;
18
-
19
15
fn deserialize_index_entry ( bytes : & [ u8 ] ) -> ( StringId , Addr ) {
20
16
(
21
- StringId :: reserved ( LittleEndian :: read_u32 ( & bytes[ 0 ..4 ] ) ) ,
17
+ StringId :: new ( LittleEndian :: read_u32 ( & bytes[ 0 ..4 ] ) ) ,
22
18
Addr ( LittleEndian :: read_u32 ( & bytes[ 4 ..8 ] ) ) ,
23
19
)
24
20
}
@@ -29,12 +25,29 @@ pub struct StringRef<'st> {
29
25
table : & ' st StringTable ,
30
26
}
31
27
28
+ // Placeholder text emitted when a virtual string ID has no entry in the
29
+ // string table's index and therefore cannot be resolved to an address.
30
+ const UNKNOWN_STRING : & str = "<unknown>" ;
31
+
32
32
impl < ' st > StringRef < ' st > {
33
+
34
+ /// Expands the StringRef into an actual string. This method will
35
+ /// avoid allocating a `String` if it can instead return a `&str` pointing
36
+ /// into the raw string table data.
33
37
pub fn to_string ( & self ) -> Cow < ' st , str > {
34
38
35
- // Try to avoid the allocation, which we can do if this is a
36
- // [value, 0xFF] entry.
37
- let addr = self . table . index [ & self . id ] ;
39
+ let addr = match self . get_addr ( ) {
40
+ Ok ( addr) => addr,
41
+ Err ( _) => {
42
+ return Cow :: from ( UNKNOWN_STRING )
43
+ }
44
+ } ;
45
+
46
+ // Try to avoid the allocation, which we can do if this is
47
+ //
48
+ // - a string with a single value component (`[value, 0xFF]`) or
49
+ // - a string with a single reference component (`[string_id, 0xFF]`)
50
+
38
51
let pos = addr. as_usize ( ) ;
39
52
let slice_to_search = & self . table . string_data [ pos..] ;
40
53
@@ -43,36 +56,53 @@ impl<'st> StringRef<'st> {
43
56
// is super fast.
44
57
let terminator_pos = memchr ( TERMINATOR , slice_to_search) . unwrap ( ) ;
45
58
59
+ // Check if this is a string containing a single StringId component
60
+ let first_byte = self . table . string_data [ pos] ;
61
+ const STRING_ID_SIZE : usize = std:: mem:: size_of :: < StringId > ( ) ;
62
+ if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte ( first_byte) {
63
+ let id = decode_string_id_from_data ( & self . table . string_data [ pos..pos+STRING_ID_SIZE ] ) ;
64
+ return StringRef {
65
+ id,
66
+ table : self . table ,
67
+ } . to_string ( ) ;
68
+ }
69
+
46
70
// Decode the bytes until the terminator. If there is a string id in
47
71
// between somewhere this will fail, and we fall back to the allocating
48
72
// path.
49
73
if let Ok ( s) = std:: str:: from_utf8 ( & slice_to_search[ ..terminator_pos] ) {
50
74
Cow :: from ( s)
51
75
} else {
76
+ // This is the slow path where we actually allocate a `String` on
77
+ // the heap and expand into that. If you suspect that there is a
78
+ // bug in the fast path above, you can easily check if always taking
79
+ // the slow path fixes the issue.
52
80
let mut output = String :: new ( ) ;
53
81
self . write_to_string ( & mut output) ;
54
82
Cow :: from ( output)
55
83
}
56
84
}
57
85
58
86
pub fn write_to_string ( & self , output : & mut String ) {
59
- let addr = self . table . index [ & self . id ] ;
87
+
88
+ let addr = match self . get_addr ( ) {
89
+ Ok ( addr) => addr,
90
+ Err ( _) => {
91
+ output. push_str ( UNKNOWN_STRING ) ;
92
+ return
93
+ }
94
+ } ;
95
+
60
96
let mut pos = addr. as_usize ( ) ;
61
97
62
98
loop {
63
99
let byte = self . table . string_data [ pos] ;
64
100
65
101
if byte == TERMINATOR {
66
102
return ;
67
- } else if ( byte & UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION_BYTE {
68
- // This is a string-id
69
- let id = BigEndian :: read_u32 ( & self . table . string_data [ pos..pos + 4 ] ) ;
70
-
71
- // Mask off the `0b10` prefix
72
- let id = id & STRING_ID_MASK ;
73
-
103
+ } else if is_utf8_continuation_byte ( byte) {
74
104
let string_ref = StringRef {
75
- id : StringId :: reserved ( id ) ,
105
+ id : decode_string_id_from_data ( & self . table . string_data [ pos..pos + 4 ] ) ,
76
106
table : self . table ,
77
107
} ;
78
108
@@ -87,6 +117,32 @@ impl<'st> StringRef<'st> {
87
117
}
88
118
}
89
119
}
120
+
121
+ fn get_addr ( & self ) -> Result < Addr , ( ) > {
122
+ if self . id . is_virtual ( ) {
123
+ match self . table . index . get ( & self . id ) {
124
+ Some ( & addr) => Ok ( addr) ,
125
+ None => Err ( ( ) ) ,
126
+ }
127
+ } else {
128
+ Ok ( self . id . to_addr ( ) )
129
+ }
130
+ }
131
+ }
132
+
133
/// Returns `true` if the two most significant bits of `byte` are `10`, i.e.
/// if `byte` has the bit pattern of a UTF-8 continuation byte.
///
/// See module-level documentation for more information on the encoding.
fn is_utf8_continuation_byte(byte: u8) -> bool {
    // Only the top two bits matter; a continuation byte has the form
    // `0b10xx_xxxx`.
    const CONTINUATION_PREFIX: u8 = 0b10;
    (byte >> 6) == CONTINUATION_PREFIX
}
139
+
140
+ // String IDs in the table data are encoded in big endian format, while string
141
+ // IDs in the index are encoded in little endian format. Don't mix the two up.
142
+ fn decode_string_id_from_data ( bytes : & [ u8 ] ) -> StringId {
143
+ let id = BigEndian :: read_u32 ( & bytes[ 0 ..4 ] ) ;
144
+ // Mask off the `0b10` prefix
145
+ StringId :: new ( id & STRING_ID_MASK )
90
146
}
91
147
92
148
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
@@ -181,7 +237,7 @@ impl StringTable {
181
237
}
182
238
183
239
pub fn get_metadata < ' a > ( & ' a self ) -> StringRef < ' a > {
184
- let id = StringId :: reserved ( METADATA_STRING_ID ) ;
240
+ let id = StringId :: new ( METADATA_STRING_ID ) ;
185
241
self . get ( id)
186
242
}
187
243
}
0 commit comments