Skip to content

Commit c22125e

Browse files
committed
Use NonZero in MixedUnit for C strings
1 parent 8f9eee9 commit c22125e

File tree

2 files changed

+100
-72
lines changed

2 files changed

+100
-72
lines changed

benches/benches.rs

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
extern crate test;
44

55
use rustc_literal_escaper::*;
6+
7+
use std::num::NonZero;
68
use std::ops::Range;
79
use std::{array, iter};
810

@@ -58,7 +60,7 @@ macro_rules! fn_bench_check_raw {
5860

5961
fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str);
6062
fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str);
61-
fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str);
63+
fn_bench_check_raw!(bench_check_raw_c_str, NonZero<char>, check_raw_c_str);
6264

6365
// raw str
6466

@@ -98,25 +100,28 @@ fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) {
98100

99101
#[bench]
100102
fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) {
101-
bench_check_raw_c_str(b, "a", &['a'; LEN]);
103+
bench_check_raw_c_str(b, "a", &[NonZero::new('a').unwrap(); LEN]);
102104
}
103105

104106
#[bench]
105107
fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) {
106-
bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]);
108+
bench_check_raw_c_str(b, "🦀", &[NonZero::new('🦀').unwrap(); LEN]);
107109
}
108110

109111
#[bench]
110112
fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) {
111113
bench_check_raw_c_str(
112114
b,
113115
"a🦀🚀z",
114-
&array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
115-
0 => 'a',
116-
1 => '🦀',
117-
2 => '🚀',
118-
3 => 'z',
119-
_ => unreachable!(),
116+
&array::from_fn::<_, { 4 * LEN }, _>(|i| {
117+
NonZero::new(match i % 4 {
118+
0 => 'a',
119+
1 => '🦀',
120+
2 => '🚀',
121+
3 => 'z',
122+
_ => unreachable!(),
123+
})
124+
.unwrap()
120125
}),
121126
);
122127
}
@@ -318,7 +323,7 @@ fn bench_unescape_c_str_ascii(b: &mut test::Bencher) {
318323
bench_unescape_c_str(
319324
b,
320325
r"a",
321-
&array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))),
326+
&array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, 'a'.try_into())),
322327
);
323328
}
324329

@@ -327,7 +332,7 @@ fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) {
327332
bench_unescape_c_str(
328333
b,
329334
r"🦀",
330-
&array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))),
335+
&array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), '🦀'.try_into())),
331336
);
332337
}
333338

@@ -339,10 +344,10 @@ fn bench_unescape_c_str_unicode(b: &mut test::Bencher) {
339344
b,
340345
input,
341346
&array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
342-
0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))),
343-
1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))),
344-
2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))),
345-
3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))),
347+
0 => (i / 4 * l..i / 4 * l + 1, 'a'.try_into()),
348+
1 => (i / 4 * l + 1..i / 4 * l + 5, '🦀'.try_into()),
349+
2 => (i / 4 * l + 5..i / 4 * l + 9, '🚀'.try_into()),
350+
3 => (i / 4 * l + 9..i / 4 * l + 10, 'z'.try_into()),
346351
_ => unreachable!(),
347352
}),
348353
);
@@ -353,7 +358,7 @@ fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) {
353358
bench_unescape_c_str(
354359
b,
355360
r"\n",
356-
&array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))),
361+
&array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), '\n'.try_into())),
357362
);
358363
}
359364

@@ -362,7 +367,7 @@ fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) {
362367
bench_unescape_c_str(
363368
b,
364369
r"\x22",
365-
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))),
370+
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), '"'.try_into())),
366371
);
367372
}
368373

@@ -371,9 +376,7 @@ fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) {
371376
bench_unescape_c_str(
372377
b,
373378
r"\xff",
374-
&array::from_fn::<_, { LEN }, _>(|i| {
375-
(4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff')))
376-
}),
379+
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), b'\xff'.try_into())),
377380
);
378381
}
379382

@@ -382,7 +385,7 @@ fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) {
382385
bench_unescape_c_str(
383386
b,
384387
r"\u{1f980}",
385-
&array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))),
388+
&array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), '🦀'.try_into())),
386389
);
387390
}
388391

@@ -399,14 +402,11 @@ fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) {
399402
let mut i = 0;
400403
move || {
401404
let res = Some(match i % n {
402-
0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))),
403-
1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))),
404-
2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))),
405-
3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))),
406-
4 => (
407-
i / n * l + 24..i / n * l + 28,
408-
Ok(MixedUnit::HighByte(b'\xff')),
409-
),
405+
0 => (i / n * l..i / n * l + 2, '\n'.try_into()),
406+
1 => (i / n * l + 2..i / n * l + 6, '"'.try_into()),
407+
2 => (i / n * l + 6..i / n * l + 15, '🦀'.try_into()),
408+
3 => (i / n * l + 15..i / n * l + 24, '🚀'.try_into()),
409+
4 => (i / n * l + 24..i / n * l + 28, b'\xff'.try_into()),
410410
r if r >= n => unreachable!(),
411411
_ => unimplemented!(),
412412
});

src/lib.rs

Lines changed: 71 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
//! turning escape sequences into the values they represent.
33
44
use std::ffi::CStr;
5+
use std::num::NonZero;
56
use std::ops::Range;
67
use std::str::Chars;
78

@@ -105,7 +106,10 @@ pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u
105106
/// and produces a sequence of characters or errors,
106107
/// which are returned by invoking `callback`.
107108
/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r') and for nul ('\0'), which a C string cannot contain.
108-
pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
109+
pub fn check_raw_c_str(
110+
src: &str,
111+
callback: impl FnMut(Range<usize>, Result<NonZero<char>, EscapeError>),
112+
) {
109113
CStr::check_raw(src, callback);
110114
}
111115

@@ -178,14 +182,10 @@ fn char2byte(c: char) -> Result<u8, EscapeError> {
178182
}
179183

180184
impl CheckRaw for CStr {
181-
type RawUnit = char;
185+
type RawUnit = NonZero<char>;
182186

183187
fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
184-
if c == '\0' {
185-
Err(EscapeError::NulInCStr)
186-
} else {
187-
Ok(c)
188-
}
188+
NonZero::new(c).ok_or(EscapeError::NulInCStr)
189189
}
190190
}
191191

@@ -247,40 +247,63 @@ pub enum MixedUnit {
247247
/// For example, if '¥' appears in a string it is represented here as
248248
/// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
249249
/// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
250-
Char(char),
250+
Char(NonZero<char>),
251251

252252
/// Used for high bytes (`\x80`..`\xff`).
253253
///
254254
/// For example, if `\xa5` appears in a string it is represented here as
255255
/// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
256256
/// byte string as the single byte `0xa5`.
257-
HighByte(u8),
257+
HighByte(NonZero<u8>),
258258
}
259259

260-
impl From<char> for MixedUnit {
261-
fn from(c: char) -> Self {
260+
impl From<NonZero<char>> for MixedUnit {
261+
fn from(c: NonZero<char>) -> Self {
262262
MixedUnit::Char(c)
263263
}
264264
}
265265

266-
impl From<u8> for MixedUnit {
267-
fn from(n: u8) -> Self {
268-
if n.is_ascii() {
269-
MixedUnit::Char(n as char)
266+
impl From<NonZero<u8>> for MixedUnit {
267+
fn from(byte: NonZero<u8>) -> Self {
268+
if byte.get().is_ascii() {
269+
MixedUnit::Char(NonZero::new(byte.get() as char).unwrap())
270270
} else {
271-
MixedUnit::HighByte(n)
271+
MixedUnit::HighByte(byte)
272272
}
273273
}
274274
}
275275

276+
impl TryFrom<char> for MixedUnit {
277+
type Error = EscapeError;
278+
279+
fn try_from(c: char) -> Result<Self, EscapeError> {
280+
NonZero::new(c)
281+
.map(MixedUnit::Char)
282+
.ok_or(EscapeError::NulInCStr)
283+
}
284+
}
285+
286+
impl TryFrom<u8> for MixedUnit {
287+
type Error = EscapeError;
288+
289+
fn try_from(byte: u8) -> Result<Self, EscapeError> {
290+
NonZero::new(byte)
291+
.map(From::from)
292+
.ok_or(EscapeError::NulInCStr)
293+
}
294+
}
295+
276296
/// Trait for unescaping escape sequences in strings
277297
trait Unescape {
278298
/// Unit type of the implementing string type (`char` for string, `u8` for byte string, `MixedUnit` for C string)
279-
type Unit: From<u8>;
299+
type Unit;
280300

281301
/// Result of unescaping the zero char ('\0')
282302
const ZERO_RESULT: Result<Self::Unit, EscapeError>;
283303

304+
/// Converts non-zero bytes to the unit type
305+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit;
306+
284307
/// Converts chars to the unit type
285308
fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
286309

@@ -311,18 +334,20 @@ trait Unescape {
311334
if c == '0' {
312335
Self::ZERO_RESULT
313336
} else {
314-
simple_escape(c).map(|b| b.into()).or_else(|c| match c {
315-
'x' => Self::hex2unit(hex_escape(chars)?),
316-
'u' => Self::unicode2unit({
317-
let value = unicode_escape(chars)?;
318-
if value > char::MAX as u32 {
319-
Err(EscapeError::OutOfRangeUnicodeEscape)
320-
} else {
321-
char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
322-
}
323-
}),
324-
_ => Err(EscapeError::InvalidEscape),
325-
})
337+
simple_escape(c)
338+
.map(|b| Self::nonzero_byte2unit(b))
339+
.or_else(|c| match c {
340+
'x' => Self::hex2unit(hex_escape(chars)?),
341+
'u' => Self::unicode2unit({
342+
let value = unicode_escape(chars)?;
343+
if value > char::MAX as u32 {
344+
Err(EscapeError::OutOfRangeUnicodeEscape)
345+
} else {
346+
char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
347+
}
348+
}),
349+
_ => Err(EscapeError::InvalidEscape),
350+
})
326351
}
327352
}
328353

@@ -364,9 +389,9 @@ trait Unescape {
364389
/// Interpret a non-nul ASCII escape
365390
///
366391
/// Parses the character of an ASCII escape (except nul) without the leading backslash.
367-
fn simple_escape(c: char) -> Result<u8, char> {
392+
fn simple_escape(c: char) -> Result<NonZero<u8>, char> {
368393
// Previous character was '\\', unescape what follows.
369-
Ok(match c {
394+
Ok(NonZero::new(match c {
370395
'"' => b'"',
371396
'n' => b'\n',
372397
'r' => b'\r',
@@ -375,6 +400,7 @@ fn simple_escape(c: char) -> Result<u8, char> {
375400
'\'' => b'\'',
376401
_ => Err(c)?,
377402
})
403+
.unwrap())
378404
}
379405

380406
/// Interpret a hexadecimal escape
@@ -476,6 +502,10 @@ impl Unescape for str {
476502

477503
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
478504

505+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
506+
b.get().into()
507+
}
508+
479509
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
480510
Ok(c)
481511
}
@@ -499,6 +529,10 @@ impl Unescape for [u8] {
499529

500530
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
501531

532+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
533+
b.get()
534+
}
535+
502536
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
503537
char2byte(c)
504538
}
@@ -518,22 +552,16 @@ impl Unescape for CStr {
518552

519553
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
520554

555+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
556+
b.into()
557+
}
558+
521559
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
522-
if c == '\0' {
523-
Err(EscapeError::NulInCStr)
524-
} else {
525-
Ok(MixedUnit::Char(c))
526-
}
560+
c.try_into()
527561
}
528562

529563
fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
530-
if byte == b'\0' {
531-
Err(EscapeError::NulInCStr)
532-
} else if byte.is_ascii() {
533-
Ok(MixedUnit::Char(byte as char))
534-
} else {
535-
Ok(MixedUnit::HighByte(byte))
536-
}
564+
byte.try_into()
537565
}
538566

539567
/// Converts the result of a unicode escape to the unit type

0 commit comments

Comments
 (0)