@@ -10,7 +10,7 @@ use measureme::{
10
10
stringtable:: STRING_REF_TAG ,
11
11
} ;
12
12
use measureme:: { Addr , StringId } ;
13
- use memchr:: memchr;
13
+ use memchr:: { memchr, memchr2 } ;
14
14
use rustc_hash:: FxHashMap ;
15
15
use std:: borrow:: Cow ;
16
16
use std:: convert:: TryInto ;
@@ -34,6 +34,10 @@ pub struct StringRef<'st> {
34
34
// be resolved.
35
35
const UNKNOWN_STRING : & str = "<unknown>" ;
36
36
37
+ // This is the text we emit when we encounter string data that does not have a
38
+ // proper terminator.
39
+ const INVALID_STRING : & str = "<invalid>" ;
40
+
37
41
impl < ' st > StringRef < ' st > {
38
42
/// Expands the StringRef into an actual string. This method will
39
43
/// avoid allocating a `String` if it can instead return a `&str` pointing
@@ -110,9 +114,18 @@ impl<'st> StringRef<'st> {
110
114
111
115
pos += STRING_REF_ENCODED_SIZE ;
112
116
} else {
113
- while let Some ( ( c, len) ) = decode_utf8_char ( & self . table . string_data [ pos..] ) {
114
- output. push ( c) ;
117
+ // This is a literal UTF-8 string value. Find its end by looking
118
+ // for either of the two possible terminator bytes.
119
+ let remaining_data = & self . table . string_data [ pos..] ;
120
+ if let Some ( len) = memchr2 ( 0xFF , 0xFE , remaining_data) {
121
+ let value = String :: from_utf8_lossy ( & remaining_data[ ..len] ) ;
122
+ output. push_str ( & value) ;
115
123
pos += len;
124
+ } else {
125
+ // The grammar does not allow unterminated raw strings. We
126
+ // have to stop decoding.
127
+ output. push_str ( INVALID_STRING ) ;
128
+ return ;
116
129
}
117
130
}
118
131
}
@@ -141,58 +154,6 @@ fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
141
154
StringId :: new ( id)
142
155
}
143
156
144
// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
// Returns the decoded `char` and its size in bytes if it succeeds.
// Returns `None` if `bytes` does not start with a valid UTF-8 codepoint. This
// includes `bytes` being empty, starting with a byte that cannot begin a
// sequence, or ending in the middle of a multi-byte sequence.
// Panics if the decoded bits do not form a valid `char` (e.g. a surrogate).
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
// encoding.
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
    use std::convert::TryFrom;

    // Mask selecting the six payload bits carried by a continuation byte.
    const CONTINUATION_PAYLOAD: u8 = 0b0011_1111;

    // An empty slice cannot start with a codepoint.
    let first_byte = u32::from(*bytes.first()?);

    // The leading byte alone determines the length of the sequence and how
    // many payload bits it contributes itself.
    let (leading_bits, len) = if (first_byte & 0b1000_0000) == 0 {
        // The highest bit is zero, so this is a single-byte char
        (first_byte, 1)
    } else if (first_byte & 0b1110_0000) == 0b1100_0000 {
        // This is a two byte character
        (first_byte & 0b0001_1111, 2)
    } else if (first_byte & 0b1111_0000) == 0b1110_0000 {
        // This is a three byte character
        (first_byte & 0b0000_1111, 3)
    } else if (first_byte & 0b1111_1000) == 0b1111_0000 {
        // This is a four byte character
        (first_byte & 0b0000_0111, 4)
    } else {
        return None;
    };

    // Checked slicing: a sequence that is cut off by the end of the data is
    // reported as `None` instead of panicking on an out-of-bounds index.
    let continuation = bytes.get(1..len)?;

    // Every continuation byte appends six more bits to the codepoint.
    let codepoint = continuation.iter().fold(leading_bits, |acc, &byte| {
        (acc << 6) | u32::from(byte & CONTINUATION_PAYLOAD)
    });

    match char::try_from(codepoint) {
        Ok(c) => {
            // In debug builds, verify that re-encoding the char yields
            // exactly the bytes we consumed.
            debug_assert!({
                let test_bytes = &mut [0u8; 8];
                c.encode_utf8(test_bytes);
                test_bytes[..len] == bytes[..len]
            });

            Some((c, len))
        }
        Err(e) => {
            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
        }
    }
}
195
-
196
157
/// Read-only version of the string table
197
158
#[ derive( Debug ) ]
198
159
pub struct StringTable {
@@ -343,20 +304,4 @@ mod tests {
343
304
assert_eq ! ( str_ref. to_string( ) , write_to) ;
344
305
}
345
306
}
346
-
347
#[test]
fn utf8_char_decoding() {
    // The set of Unicode scalar values is small enough to check every one of
    // them: encode each with the standard library and make sure our decoder
    // reports the same char and the same encoded length.
    let all_chars = (0..=0x10FFFFu32).filter_map(std::char::from_u32);

    for expected_char in all_chars {
        // A fresh, zeroed buffer per char so no bytes leak between iterations.
        let mut encoded = [0u8; 4];
        let expected_len = expected_char.encode_utf8(&mut encoded).len();

        assert_eq!(
            Some((expected_char, expected_len)),
            decode_utf8_char(&encoded[..])
        );
    }
}
362
307
}
0 commit comments