10
10
//
11
11
// Original authors: alexcrichton, bluss
12
12
13
+ use std:: ptr;
14
+
13
15
// UTF-8 ranges and tags for encoding characters.
//
// A tag is OR-ed onto the payload bits of one output byte.

/// Tag OR-ed onto every byte after the first in a multi-byte sequence.
const TAG_CONT: u8 = 0b1000_0000;
/// Tag OR-ed onto the first byte of a two-byte sequence.
const TAG_TWO_B: u8 = 0b1100_0000;
@@ -22,33 +24,75 @@ const MAX_THREE_B: u32 = 0x10000;
22
24
/// Error returned by `encode_utf8` when the destination buffer is too short
/// to hold the UTF-8 encoding of the character.
///
/// This is a unit struct: it carries no further detail about the failure.
pub struct EncodeUtf8Error;
24
26
27
/// Store `byte` at offset `index` (in bytes) from `ptr`.
///
/// # Safety
///
/// `ptr.add(index)` must stay within the allocation `ptr` points into and be
/// valid for a one-byte write.
#[inline]
unsafe fn write(ptr: *mut u8, index: usize, byte: u8) {
    // `ptr::write` names the `std::ptr` module function; the parameter `ptr`
    // lives in the value namespace, so the two do not clash.
    ptr::write(ptr.add(index), byte)
}
31
+
25
32
/// Encode a char into buf using UTF-8.
26
33
///
27
34
/// On success, return the byte length of the encoding (1, 2, 3 or 4).<br>
28
35
/// On error, return `EncodeUtf8Error` if the buffer was too short for the char.
36
+ ///
37
+ /// Safety: `ptr` must be writable for `len` bytes.
29
38
#[ inline]
30
- pub fn encode_utf8 ( ch : char , buf : & mut [ u8 ] ) -> Result < usize , EncodeUtf8Error >
39
+ pub unsafe fn encode_utf8 ( ch : char , ptr : * mut u8 , len : usize ) -> Result < usize , EncodeUtf8Error >
31
40
{
32
41
let code = ch as u32 ;
33
- if code < MAX_ONE_B && buf . len ( ) >= 1 {
34
- buf [ 0 ] = code as u8 ;
42
+ if code < MAX_ONE_B && len >= 1 {
43
+ write ( ptr , 0 , code as u8 ) ;
35
44
return Ok ( 1 ) ;
36
- } else if code < MAX_TWO_B && buf . len ( ) >= 2 {
37
- buf [ 0 ] = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
38
- buf [ 1 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
45
+ } else if code < MAX_TWO_B && len >= 2 {
46
+ write ( ptr , 0 , ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ) ;
47
+ write ( ptr , 1 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
39
48
return Ok ( 2 ) ;
40
- } else if code < MAX_THREE_B && buf . len ( ) >= 3 {
41
- buf [ 0 ] = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
42
- buf [ 1 ] = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
43
- buf [ 2 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
49
+ } else if code < MAX_THREE_B && len >= 3 {
50
+ write ( ptr , 0 , ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ) ;
51
+ write ( ptr , 1 , ( code >> 6 & 0x3F ) as u8 | TAG_CONT ) ;
52
+ write ( ptr , 2 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
44
53
return Ok ( 3 ) ;
45
- } else if buf . len ( ) >= 4 {
46
- buf [ 0 ] = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
47
- buf [ 1 ] = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
48
- buf [ 2 ] = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
49
- buf [ 3 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
54
+ } else if len >= 4 {
55
+ write ( ptr , 0 , ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ) ;
56
+ write ( ptr , 1 , ( code >> 12 & 0x3F ) as u8 | TAG_CONT ) ;
57
+ write ( ptr , 2 , ( code >> 6 & 0x3F ) as u8 | TAG_CONT ) ;
58
+ write ( ptr , 3 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
50
59
return Ok ( 4 ) ;
51
60
} ;
52
61
Err ( EncodeUtf8Error )
53
62
}
54
63
64
+
65
#[test]
fn test_encode_utf8() {
    // Test that all codepoints are encoded correctly
    let mut data = [0u8; 16];
    for codepoint in 0..=(std::char::MAX as u32) {
        // `from_u32` rejects values that are not valid `char`s; skip those.
        if let Some(ch) = std::char::from_u32(codepoint) {
            // Reset the buffer so leftovers from the previous char cannot mask a bug.
            for elt in &mut data { *elt = 0; }
            let ptr = data.as_mut_ptr();
            let len = data.len();
            // SAFETY: `ptr` and `len` both come from `data`, so `ptr` is
            // writable for `len` bytes.
            // `.ok()` drops the error value so `unwrap` does not need it to be `Debug`.
            let res = unsafe { encode_utf8(ch, ptr, len) }.ok().unwrap();
            assert_eq!(res, ch.len_utf8());

            // Exactly the reported prefix must decode back to `ch` and nothing else.
            let string = std::str::from_utf8(&data[..res]).unwrap();
            let mut decoded = string.chars();
            assert_eq!(decoded.next(), Some(ch));
            assert_eq!(decoded.next(), None);

            // Nothing past the reported length may have been written.
            assert!(data[res..].iter().all(|&b| b == 0));
        }
    }
}
83
+
84
#[test]
fn test_encode_utf8_oob() {
    // test that we report oob if the buffer is too short
    let mut data = [0u8; 16];
    // One sample char per encoded length, ordered 1 to 4 bytes.
    let chars = ['a', 'α', '�', '𐍈'];
    for (len, &ch) in (1..=4).zip(&chars) {
        assert_eq!(len, ch.len_utf8(), "Len of ch={}", ch);
        let ptr = data.as_mut_ptr();
        // SAFETY: `ptr` points at the 16-byte `data` array and `len` is at
        // most 4, so `ptr` is writable for both lengths passed below.
        unsafe {
            // One byte short must be rejected...
            assert!(encode_utf8(ch, ptr, len - 1).is_err());
            // ...and an exact fit must be accepted.
            assert!(encode_utf8(ch, ptr, len).is_ok());
        }
    }
}
98
+
0 commit comments