1
1
//! See module-level documentation `measureme::stringtable`.
2
2
3
- use measureme:: file_header:: {
4
- strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA ,
5
- FILE_MAGIC_STRINGTABLE_INDEX ,
3
+ use measureme:: stringtable:: { METADATA_STRING_ID , TERMINATOR } ;
4
+ use measureme:: {
5
+ file_header:: {
6
+ strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA ,
7
+ FILE_MAGIC_STRINGTABLE_INDEX ,
8
+ } ,
9
+ stringtable:: STRING_REF_ENCODED_SIZE ,
10
+ stringtable:: STRING_REF_TAG ,
6
11
} ;
7
- use measureme:: stringtable:: { METADATA_STRING_ID , STRING_ID_MASK , TERMINATOR } ;
8
12
use measureme:: { Addr , StringId } ;
9
- use memchr:: memchr;
13
+ use memchr:: { memchr, memchr2 } ;
10
14
use rustc_hash:: FxHashMap ;
11
15
use std:: borrow:: Cow ;
12
16
use std:: convert:: TryInto ;
@@ -30,6 +34,10 @@ pub struct StringRef<'st> {
30
34
// be resolved.
31
35
const UNKNOWN_STRING : & str = "<unknown>" ;
32
36
37
+ // This is the text we emit when we encounter string data that does not have a
38
+ // proper terminator.
39
+ const INVALID_STRING : & str = "<invalid>" ;
40
+
33
41
impl < ' st > StringRef < ' st > {
34
42
/// Expands the StringRef into an actual string. This method will
35
43
/// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -55,9 +63,8 @@ impl<'st> StringRef<'st> {
55
63
56
64
// Check if this is a string containing a single StringId component
57
65
let first_byte = self . table . string_data [ pos] ;
58
- const STRING_ID_SIZE : usize = std:: mem:: size_of :: < StringId > ( ) ;
59
- if terminator_pos == pos + STRING_ID_SIZE && is_utf8_continuation_byte ( first_byte) {
60
- let id = decode_string_id_from_data ( & self . table . string_data [ pos..pos + STRING_ID_SIZE ] ) ;
66
+ if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
67
+ let id = decode_string_ref_from_data ( & self . table . string_data [ pos..] ) ;
61
68
return StringRef {
62
69
id,
63
70
table : self . table ,
@@ -97,19 +104,28 @@ impl<'st> StringRef<'st> {
97
104
98
105
if byte == TERMINATOR {
99
106
return ;
100
- } else if is_utf8_continuation_byte ( byte) {
107
+ } else if byte == STRING_REF_TAG {
101
108
let string_ref = StringRef {
102
- id : decode_string_id_from_data ( & self . table . string_data [ pos..pos + 4 ] ) ,
109
+ id : decode_string_ref_from_data ( & self . table . string_data [ pos..] ) ,
103
110
table : self . table ,
104
111
} ;
105
112
106
113
string_ref. write_to_string ( output) ;
107
114
108
- pos += 4 ;
115
+ pos += STRING_REF_ENCODED_SIZE ;
109
116
} else {
110
- while let Some ( ( c, len) ) = decode_utf8_char ( & self . table . string_data [ pos..] ) {
111
- output. push ( c) ;
117
+ // This is a literal UTF-8 string value. Find its end by looking
118
+ // for either of the two possible terminator bytes.
119
+ let remaining_data = & self . table . string_data [ pos..] ;
120
+ if let Some ( len) = memchr2 ( 0xFF , 0xFE , remaining_data) {
121
+ let value = String :: from_utf8_lossy ( & remaining_data[ ..len] ) ;
122
+ output. push_str ( & value) ;
112
123
pos += len;
124
+ } else {
125
+ // The grammar does not allow unterminated raw strings. We
126
+ // have to stop decoding.
127
+ output. push_str ( INVALID_STRING ) ;
128
+ return ;
113
129
}
114
130
}
115
131
}
@@ -129,71 +145,17 @@ impl<'st> StringRef<'st> {
129
145
}
130
146
}
131
147
132
- fn is_utf8_continuation_byte ( byte : u8 ) -> bool {
133
- // See module-level documentation for more information on the encoding.
134
- const UTF8_CONTINUATION_MASK : u8 = 0b1100_0000 ;
135
- const UTF8_CONTINUATION_BYTE : u8 = 0b1000_0000 ;
136
- ( byte & UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION_BYTE
137
- }
138
-
139
148
// String refs in the table data are a STRING_REF_TAG byte followed by a
140
149
// little-endian u32 string ID; IDs in the index are little endian as well.
141
- fn decode_string_id_from_data ( bytes : & [ u8 ] ) -> StringId {
142
- let id = u32:: from_be_bytes ( bytes[ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ;
143
- // Mask off the `0b10` prefix
144
- StringId :: new ( id & STRING_ID_MASK )
145
- }
146
-
147
- // Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
148
- // Returns the decoded `char` and its size in bytes if it succeeds.
149
- // Returns `None` if `bytes` does not start with a valid UTF-8 codepoint.
150
- // See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
151
- // encoding.
152
- fn decode_utf8_char ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
153
- use std:: convert:: TryFrom ;
154
- let first_byte = bytes[ 0 ] as u32 ;
155
- let ( codepoint, len) = if ( first_byte & 0b1000_0000 ) == 0 {
156
- // The highest bit is zero, so this is a single-byte char
157
- ( first_byte, 1 )
158
- } else if ( first_byte & 0b1110_0000 ) == 0b1100_0000 {
159
- // This is a two byte character
160
- let bits0 = first_byte & 0b0001_1111 ;
161
- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
162
-
163
- ( bits0 << 6 | bits1, 2 )
164
- } else if ( first_byte & 0b1111_0000 ) == 0b1110_0000 {
165
- // This is a three byte character
166
- let bits0 = first_byte & 0b0000_1111 ;
167
- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
168
- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
169
-
170
- ( ( bits0 << 12 ) | ( bits1 << 6 ) | bits2, 3 )
171
- } else if ( first_byte & 0b1111_1000 ) == 0b1111_0000 {
172
- // This is a four byte character
173
- let bits0 = first_byte & 0b0000_0111 ;
174
- let bits1 = ( bytes[ 1 ] & 0b0011_1111 ) as u32 ;
175
- let bits2 = ( bytes[ 2 ] & 0b0011_1111 ) as u32 ;
176
- let bits3 = ( bytes[ 3 ] & 0b0011_1111 ) as u32 ;
177
-
178
- ( ( bits0 << 18 ) | ( bits1 << 12 ) | ( bits2 << 6 ) | bits3, 4 )
179
- } else {
180
- return None ;
181
- } ;
182
-
183
- match char:: try_from ( codepoint) {
184
- Ok ( c) => {
185
- debug_assert ! ( {
186
- let test_bytes = & mut [ 0u8 ; 8 ] ;
187
- c. encode_utf8( test_bytes) ;
188
- & test_bytes[ ..len] == & bytes[ ..len]
189
- } ) ;
190
-
191
- Some ( ( c, len) )
192
- }
193
- Err ( e) => {
194
- panic ! ( "StringTable: Encountered invalid UTF8 char: {:?}" , e) ;
195
- }
196
- }
150
+ fn decode_string_ref_from_data ( bytes : & [ u8 ] ) -> StringId {
151
+ // The code below assumes we use a 5-byte encoding for string
152
+ // refs, where the first byte is STRING_REF_TAG and the
153
+ // following 4 bytes are a little-endian u32 string ID value.
154
+ assert ! ( bytes[ 0 ] == STRING_REF_TAG ) ;
155
+ assert ! ( STRING_REF_ENCODED_SIZE == 5 ) ;
156
+
157
+ let id = u32:: from_le_bytes ( bytes[ 1 ..5 ] . try_into ( ) . unwrap ( ) ) ;
158
+ StringId :: new ( id)
197
159
}
198
160
199
161
/// Read-only version of the string table
@@ -346,20 +308,4 @@ mod tests {
346
308
assert_eq ! ( str_ref. to_string( ) , write_to) ;
347
309
}
348
310
}
349
-
350
- #[ test]
351
- fn utf8_char_decoding ( ) {
352
- use std:: convert:: TryFrom ;
353
-
354
- // Let's just test all possible codepoints because there are not that
355
- // many actually.
356
- for codepoint in 0 ..=0x10FFFFu32 {
357
- if let Ok ( expected_char) = char:: try_from ( codepoint) {
358
- let buffer = & mut [ 0 ; 4 ] ;
359
- let expected_len = expected_char. encode_utf8 ( buffer) . len ( ) ;
360
- let expected = Some ( ( expected_char, expected_len) ) ;
361
- assert_eq ! ( expected, decode_utf8_char( & buffer[ ..] ) ) ;
362
- }
363
- }
364
- }
365
311
}
0 commit comments