Skip to content

Commit 618d027

Browse files
authored
Merge pull request #138 from bluss/array-string
Fix ArrayString encode_utf8 to use raw pointers
2 parents b7b8cf8 + 090a5c5 commit 618d027

File tree

3 files changed

+71
-21
lines changed

3 files changed

+71
-21
lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ std = []
4141
array-sizes-33-128 = []
4242
array-sizes-129-255 = []
4343

44+
[profile.bench]
45+
debug = true
46+
[profile.release]
47+
debug = true
48+
4449
[package.metadata.docs.rs]
4550
features = ["serde"]
4651

src/array_string.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ impl<A> ArrayString<A>
6767
}
6868
}
6969

70+
/// Return the length of the string.
71+
#[inline]
72+
pub fn len(&self) -> usize { self.len.to_usize() }
73+
7074
/// Create a new `ArrayString` from a `str`.
7175
///
7276
/// Capacity is inferred from the type parameter.
@@ -167,7 +171,9 @@ impl<A> ArrayString<A>
167171
pub fn try_push(&mut self, c: char) -> Result<(), CapacityError<char>> {
168172
let len = self.len();
169173
unsafe {
170-
match encode_utf8(c, &mut self.raw_mut_bytes()[len..]) {
174+
let ptr = self.xs.ptr_mut().add(len);
175+
let remaining_cap = self.capacity() - len;
176+
match encode_utf8(c, ptr, remaining_cap) {
171177
Ok(n) => {
172178
self.set_len(len + n);
173179
Ok(())
@@ -342,11 +348,6 @@ impl<A> ArrayString<A>
342348
pub fn as_str(&self) -> &str {
343349
self
344350
}
345-
346-
/// Return a mutable slice of the whole string’s buffer
347-
unsafe fn raw_mut_bytes(&mut self) -> &mut [u8] {
348-
slice::from_raw_parts_mut(self.xs.ptr_mut(), self.capacity())
349-
}
350351
}
351352

352353
impl<A> Deref for ArrayString<A>

src/char.rs

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
//
1111
// Original authors: alexcrichton, bluss
1212

13+
use std::ptr;
14+
1315
// UTF-8 ranges and tags for encoding characters
1416
const TAG_CONT: u8 = 0b1000_0000;
1517
const TAG_TWO_B: u8 = 0b1100_0000;
@@ -22,33 +24,75 @@ const MAX_THREE_B: u32 = 0x10000;
2224
/// Placeholder
2325
pub struct EncodeUtf8Error;
2426

27+
#[inline]
28+
unsafe fn write(ptr: *mut u8, index: usize, byte: u8) {
29+
ptr::write(ptr.add(index), byte)
30+
}
31+
2532
/// Encode a char into buf using UTF-8.
2633
///
2734
/// On success, return the byte length of the encoding (1, 2, 3 or 4).<br>
2835
/// On error, return `EncodeUtf8Error` if the buffer was too short for the char.
36+
///
37+
/// Safety: `ptr` must be writable for `len` bytes.
2938
#[inline]
30-
pub fn encode_utf8(ch: char, buf: &mut [u8]) -> Result<usize, EncodeUtf8Error>
39+
pub unsafe fn encode_utf8(ch: char, ptr: *mut u8, len: usize) -> Result<usize, EncodeUtf8Error>
3140
{
3241
let code = ch as u32;
33-
if code < MAX_ONE_B && buf.len() >= 1 {
34-
buf[0] = code as u8;
42+
if code < MAX_ONE_B && len >= 1 {
43+
write(ptr, 0, code as u8);
3544
return Ok(1);
36-
} else if code < MAX_TWO_B && buf.len() >= 2 {
37-
buf[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
38-
buf[1] = (code & 0x3F) as u8 | TAG_CONT;
45+
} else if code < MAX_TWO_B && len >= 2 {
46+
write(ptr, 0, (code >> 6 & 0x1F) as u8 | TAG_TWO_B);
47+
write(ptr, 1, (code & 0x3F) as u8 | TAG_CONT);
3948
return Ok(2);
40-
} else if code < MAX_THREE_B && buf.len() >= 3 {
41-
buf[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
42-
buf[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
43-
buf[2] = (code & 0x3F) as u8 | TAG_CONT;
49+
} else if code < MAX_THREE_B && len >= 3 {
50+
write(ptr, 0, (code >> 12 & 0x0F) as u8 | TAG_THREE_B);
51+
write(ptr, 1, (code >> 6 & 0x3F) as u8 | TAG_CONT);
52+
write(ptr, 2, (code & 0x3F) as u8 | TAG_CONT);
4453
return Ok(3);
45-
} else if buf.len() >= 4 {
46-
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
47-
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
48-
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
49-
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
54+
} else if len >= 4 {
55+
write(ptr, 0, (code >> 18 & 0x07) as u8 | TAG_FOUR_B);
56+
write(ptr, 1, (code >> 12 & 0x3F) as u8 | TAG_CONT);
57+
write(ptr, 2, (code >> 6 & 0x3F) as u8 | TAG_CONT);
58+
write(ptr, 3, (code & 0x3F) as u8 | TAG_CONT);
5059
return Ok(4);
5160
};
5261
Err(EncodeUtf8Error)
5362
}
5463

64+
65+
#[test]
66+
fn test_encode_utf8() {
67+
// Test that all codepoints are encoded correctly
68+
let mut data = [0u8; 16];
69+
for codepoint in 0..=(std::char::MAX as u32) {
70+
if let Some(ch) = std::char::from_u32(codepoint) {
71+
for elt in &mut data { *elt = 0; }
72+
let ptr = data.as_mut_ptr();
73+
let len = data.len();
74+
unsafe {
75+
let res = encode_utf8(ch, ptr, len).ok().unwrap();
76+
assert_eq!(res, ch.len_utf8());
77+
}
78+
let string = std::str::from_utf8(&data).unwrap();
79+
assert_eq!(string.chars().next(), Some(ch));
80+
}
81+
}
82+
}
83+
84+
#[test]
85+
fn test_encode_utf8_oob() {
86+
// test that we report oob if the buffer is too short
87+
let mut data = [0u8; 16];
88+
let chars = ['a', 'α', '�', '𐍈'];
89+
for (len, &ch) in (1..=4).zip(&chars) {
90+
assert_eq!(len, ch.len_utf8(), "Len of ch={}", ch);
91+
let ptr = data.as_mut_ptr();
92+
unsafe {
93+
assert!(matches::matches!(encode_utf8(ch, ptr, len - 1), Err(_)));
94+
assert!(matches::matches!(encode_utf8(ch, ptr, len), Ok(_)));
95+
}
96+
}
97+
}
98+

0 commit comments

Comments
 (0)