Skip to content

Commit 425ca35

Browse files
hkBstUrgau
authored and committed
Use NonZero in MixedUnit for C strings
1 parent 5841236 commit 425ca35

File tree

3 files changed

+106
-72
lines changed

3 files changed

+106
-72
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ edition = "2021"
55
description = "Provides code to unescape string literals"
66
license = "Apache-2.0 OR MIT"
77
repository = "https://github.com/rust-lang/literal-escaper"
8+
rust-version = "1.89" # for NonZero<char>
89

910
[dependencies]
1011
std = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-std' }

benches/benches.rs

Lines changed: 29 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
extern crate test;
44

55
use rustc_literal_escaper::*;
6+
7+
use std::num::NonZero;
68
use std::ops::Range;
79
use std::{array, iter};
810

@@ -58,7 +60,7 @@ macro_rules! fn_bench_check_raw {
5860

5961
fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str);
6062
fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str);
61-
fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str);
63+
fn_bench_check_raw!(bench_check_raw_c_str, NonZero<char>, check_raw_c_str);
6264

6365
// raw str
6466

@@ -98,25 +100,28 @@ fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) {
98100

99101
#[bench]
100102
fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) {
101-
bench_check_raw_c_str(b, "a", &['a'; LEN]);
103+
bench_check_raw_c_str(b, "a", &[NonZero::new('a').unwrap(); LEN]);
102104
}
103105

104106
#[bench]
105107
fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) {
106-
bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]);
108+
bench_check_raw_c_str(b, "🦀", &[NonZero::new('🦀').unwrap(); LEN]);
107109
}
108110

109111
#[bench]
110112
fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) {
111113
bench_check_raw_c_str(
112114
b,
113115
"a🦀🚀z",
114-
&array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
115-
0 => 'a',
116-
1 => '🦀',
117-
2 => '🚀',
118-
3 => 'z',
119-
_ => unreachable!(),
116+
&array::from_fn::<_, { 4 * LEN }, _>(|i| {
117+
NonZero::new(match i % 4 {
118+
0 => 'a',
119+
1 => '🦀',
120+
2 => '🚀',
121+
3 => 'z',
122+
_ => unreachable!(),
123+
})
124+
.unwrap()
120125
}),
121126
);
122127
}
@@ -318,7 +323,7 @@ fn bench_unescape_c_str_ascii(b: &mut test::Bencher) {
318323
bench_unescape_c_str(
319324
b,
320325
r"a",
321-
&array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))),
326+
&array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, 'a'.try_into())),
322327
);
323328
}
324329

@@ -327,7 +332,7 @@ fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) {
327332
bench_unescape_c_str(
328333
b,
329334
r"🦀",
330-
&array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))),
335+
&array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), '🦀'.try_into())),
331336
);
332337
}
333338

@@ -339,10 +344,10 @@ fn bench_unescape_c_str_unicode(b: &mut test::Bencher) {
339344
b,
340345
input,
341346
&array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
342-
0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))),
343-
1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))),
344-
2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))),
345-
3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))),
347+
0 => (i / 4 * l..i / 4 * l + 1, 'a'.try_into()),
348+
1 => (i / 4 * l + 1..i / 4 * l + 5, '🦀'.try_into()),
349+
2 => (i / 4 * l + 5..i / 4 * l + 9, '🚀'.try_into()),
350+
3 => (i / 4 * l + 9..i / 4 * l + 10, 'z'.try_into()),
346351
_ => unreachable!(),
347352
}),
348353
);
@@ -353,7 +358,7 @@ fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) {
353358
bench_unescape_c_str(
354359
b,
355360
r"\n",
356-
&array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))),
361+
&array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), '\n'.try_into())),
357362
);
358363
}
359364

@@ -362,7 +367,7 @@ fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) {
362367
bench_unescape_c_str(
363368
b,
364369
r"\x22",
365-
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))),
370+
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), '"'.try_into())),
366371
);
367372
}
368373

@@ -371,9 +376,7 @@ fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) {
371376
bench_unescape_c_str(
372377
b,
373378
r"\xff",
374-
&array::from_fn::<_, { LEN }, _>(|i| {
375-
(4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff')))
376-
}),
379+
&array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), b'\xff'.try_into())),
377380
);
378381
}
379382

@@ -382,7 +385,7 @@ fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) {
382385
bench_unescape_c_str(
383386
b,
384387
r"\u{1f980}",
385-
&array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))),
388+
&array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), '🦀'.try_into())),
386389
);
387390
}
388391

@@ -399,14 +402,11 @@ fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) {
399402
let mut i = 0;
400403
move || {
401404
let res = Some(match i % n {
402-
0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))),
403-
1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))),
404-
2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))),
405-
3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))),
406-
4 => (
407-
i / n * l + 24..i / n * l + 28,
408-
Ok(MixedUnit::HighByte(b'\xff')),
409-
),
405+
0 => (i / n * l..i / n * l + 2, '\n'.try_into()),
406+
1 => (i / n * l + 2..i / n * l + 6, '"'.try_into()),
407+
2 => (i / n * l + 6..i / n * l + 15, '🦀'.try_into()),
408+
3 => (i / n * l + 15..i / n * l + 24, '🚀'.try_into()),
409+
4 => (i / n * l + 24..i / n * l + 28, b'\xff'.try_into()),
410410
r if r >= n => unreachable!(),
411411
_ => unimplemented!(),
412412
});

src/lib.rs

Lines changed: 76 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
//! turning escape sequences into the values they represent.
33
44
use std::ffi::CStr;
5+
use std::num::NonZero;
56
use std::ops::Range;
67
use std::str::Chars;
78

@@ -105,7 +106,10 @@ pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u
105106
/// and produces a sequence of characters or errors,
106107
/// which are returned by invoking `callback`.
107108
/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
108-
pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
109+
pub fn check_raw_c_str(
110+
src: &str,
111+
callback: impl FnMut(Range<usize>, Result<NonZero<char>, EscapeError>),
112+
) {
109113
CStr::check_raw(src, callback);
110114
}
111115

@@ -181,15 +185,11 @@ fn char2byte(c: char) -> Result<u8, EscapeError> {
181185
}
182186

183187
impl CheckRaw for CStr {
184-
type RawUnit = char;
188+
type RawUnit = NonZero<char>;
185189

186190
#[inline]
187191
fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
188-
if c == '\0' {
189-
Err(EscapeError::NulInCStr)
190-
} else {
191-
Ok(c)
192-
}
192+
NonZero::new(c).ok_or(EscapeError::NulInCStr)
193193
}
194194
}
195195

@@ -253,42 +253,67 @@ pub enum MixedUnit {
253253
/// For example, if '¥' appears in a string it is represented here as
254254
/// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
255255
/// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
256-
Char(char),
256+
Char(NonZero<char>),
257257

258258
/// Used for high bytes (`\x80`..`\xff`).
259259
///
260260
/// For example, if `\xa5` appears in a string it is represented here as
261261
/// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
262262
/// byte string as the single byte `0xa5`.
263-
HighByte(u8),
263+
HighByte(NonZero<u8>),
264264
}
265265

266-
impl From<char> for MixedUnit {
266+
impl From<NonZero<char>> for MixedUnit {
267267
#[inline]
268-
fn from(c: char) -> Self {
268+
fn from(c: NonZero<char>) -> Self {
269269
MixedUnit::Char(c)
270270
}
271271
}
272272

273-
impl From<u8> for MixedUnit {
273+
impl From<NonZero<u8>> for MixedUnit {
274274
#[inline]
275-
fn from(n: u8) -> Self {
276-
if n.is_ascii() {
277-
MixedUnit::Char(n as char)
275+
fn from(byte: NonZero<u8>) -> Self {
276+
if byte.get().is_ascii() {
277+
MixedUnit::Char(NonZero::new(byte.get() as char).unwrap())
278278
} else {
279-
MixedUnit::HighByte(n)
279+
MixedUnit::HighByte(byte)
280280
}
281281
}
282282
}
283283

284+
impl TryFrom<char> for MixedUnit {
285+
type Error = EscapeError;
286+
287+
#[inline]
288+
fn try_from(c: char) -> Result<Self, EscapeError> {
289+
NonZero::new(c)
290+
.map(MixedUnit::Char)
291+
.ok_or(EscapeError::NulInCStr)
292+
}
293+
}
294+
295+
impl TryFrom<u8> for MixedUnit {
296+
type Error = EscapeError;
297+
298+
#[inline]
299+
fn try_from(byte: u8) -> Result<Self, EscapeError> {
300+
NonZero::new(byte)
301+
.map(From::from)
302+
.ok_or(EscapeError::NulInCStr)
303+
}
304+
}
305+
284306
/// Trait for unescaping escape sequences in strings
285307
trait Unescape {
286308
/// Unit type of the implementing string type (`char` for string, `u8` for byte string)
287-
type Unit: From<u8>;
309+
type Unit;
288310

289311
/// Result of unescaping the zero char ('\0')
290312
const ZERO_RESULT: Result<Self::Unit, EscapeError>;
291313

314+
/// Converts non-zero bytes to the unit type
315+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit;
316+
292317
/// Converts chars to the unit type
293318
fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
294319

@@ -319,18 +344,20 @@ trait Unescape {
319344
if c == '0' {
320345
Self::ZERO_RESULT
321346
} else {
322-
simple_escape(c).map(|b| b.into()).or_else(|c| match c {
323-
'x' => Self::hex2unit(hex_escape(chars)?),
324-
'u' => Self::unicode2unit({
325-
let value = unicode_escape(chars)?;
326-
if value > char::MAX as u32 {
327-
Err(EscapeError::OutOfRangeUnicodeEscape)
328-
} else {
329-
char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
330-
}
331-
}),
332-
_ => Err(EscapeError::InvalidEscape),
333-
})
347+
simple_escape(c)
348+
.map(|b| Self::nonzero_byte2unit(b))
349+
.or_else(|c| match c {
350+
'x' => Self::hex2unit(hex_escape(chars)?),
351+
'u' => Self::unicode2unit({
352+
let value = unicode_escape(chars)?;
353+
if value > char::MAX as u32 {
354+
Err(EscapeError::OutOfRangeUnicodeEscape)
355+
} else {
356+
char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
357+
}
358+
}),
359+
_ => Err(EscapeError::InvalidEscape),
360+
})
334361
}
335362
}
336363

@@ -373,9 +400,9 @@ trait Unescape {
373400
///
374401
/// Parses the character of an ASCII escape (except nul) without the leading backslash.
375402
#[inline] // single use in Unescape::unescape_1
376-
fn simple_escape(c: char) -> Result<u8, char> {
403+
fn simple_escape(c: char) -> Result<NonZero<u8>, char> {
377404
// Previous character was '\\', unescape what follows.
378-
Ok(match c {
405+
Ok(NonZero::new(match c {
379406
'"' => b'"',
380407
'n' => b'\n',
381408
'r' => b'\r',
@@ -384,6 +411,7 @@ fn simple_escape(c: char) -> Result<u8, char> {
384411
'\'' => b'\'',
385412
_ => Err(c)?,
386413
})
414+
.unwrap())
387415
}
388416

389417
/// Interpret a hexadecimal escape
@@ -489,6 +517,11 @@ impl Unescape for str {
489517

490518
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
491519

520+
#[inline]
521+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
522+
b.get().into()
523+
}
524+
492525
#[inline]
493526
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
494527
Ok(c)
@@ -514,6 +547,11 @@ impl Unescape for [u8] {
514547

515548
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
516549

550+
#[inline]
551+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
552+
b.get()
553+
}
554+
517555
#[inline]
518556
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
519557
char2byte(c)
@@ -535,24 +573,19 @@ impl Unescape for CStr {
535573

536574
const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
537575

576+
#[inline]
577+
fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
578+
b.into()
579+
}
580+
538581
#[inline]
539582
fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
540-
if c == '\0' {
541-
Err(EscapeError::NulInCStr)
542-
} else {
543-
Ok(MixedUnit::Char(c))
544-
}
583+
c.try_into()
545584
}
546585

547586
#[inline]
548587
fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
549-
if byte == b'\0' {
550-
Err(EscapeError::NulInCStr)
551-
} else if byte.is_ascii() {
552-
Ok(MixedUnit::Char(byte as char))
553-
} else {
554-
Ok(MixedUnit::HighByte(byte))
555-
}
588+
byte.try_into()
556589
}
557590

558591
#[inline]

0 commit comments

Comments
 (0)