Use NonZero in MixedUnit for C strings

hkBst · hkBst · commit c22125e6ce68 · 2025-06-22T06:23:31.000Z
diff --git a/benches/benches.rs b/benches/benches.rs
@@ -3,6 +3,8 @@
 extern crate test;
 
 use rustc_literal_escaper::*;
+
+use std::num::NonZero;
 use std::ops::Range;
 use std::{array, iter};
 
@@ -58,7 +60,7 @@ macro_rules! fn_bench_check_raw {
 
 fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str);
 fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str);
-fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str);
+fn_bench_check_raw!(bench_check_raw_c_str, NonZero<char>, check_raw_c_str);
 
 // raw str
 
@@ -98,25 +100,28 @@ fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) {
 
 #[bench]
 fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) {
-    bench_check_raw_c_str(b, "a", &['a'; LEN]);
+    bench_check_raw_c_str(b, "a", &[NonZero::new('a').unwrap(); LEN]);
 }
 
 #[bench]
 fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) {
-    bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]);
+    bench_check_raw_c_str(b, "🦀", &[NonZero::new('🦀').unwrap(); LEN]);
 }
 
 #[bench]
 fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) {
     bench_check_raw_c_str(
         b,
         "a🦀🚀z",
-        &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
-            0 => 'a',
-            1 => '🦀',
-            2 => '🚀',
-            3 => 'z',
-            _ => unreachable!(),
+        &array::from_fn::<_, { 4 * LEN }, _>(|i| {
+            NonZero::new(match i % 4 {
+                0 => 'a',
+                1 => '🦀',
+                2 => '🚀',
+                3 => 'z',
+                _ => unreachable!(),
+            })
+            .unwrap()
         }),
     );
 }
@@ -318,7 +323,7 @@ fn bench_unescape_c_str_ascii(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"a",
-        &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))),
+        &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, 'a'.try_into())),
     );
 }
 
@@ -327,7 +332,7 @@ fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"🦀",
-        &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))),
+        &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), '🦀'.try_into())),
     );
 }
 
@@ -339,10 +344,10 @@ fn bench_unescape_c_str_unicode(b: &mut test::Bencher) {
         b,
         input,
         &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
-            0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))),
-            1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))),
-            2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))),
-            3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))),
+            0 => (i / 4 * l..i / 4 * l + 1, 'a'.try_into()),
+            1 => (i / 4 * l + 1..i / 4 * l + 5, '🦀'.try_into()),
+            2 => (i / 4 * l + 5..i / 4 * l + 9, '🚀'.try_into()),
+            3 => (i / 4 * l + 9..i / 4 * l + 10, 'z'.try_into()),
             _ => unreachable!(),
         }),
     );
@@ -353,7 +358,7 @@ fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"\n",
-        &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))),
+        &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), '\n'.try_into())),
     );
 }
 
@@ -362,7 +367,7 @@ fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"\x22",
-        &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))),
+        &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), '"'.try_into())),
     );
 }
 
@@ -371,9 +376,7 @@ fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"\xff",
-        &array::from_fn::<_, { LEN }, _>(|i| {
-            (4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff')))
-        }),
+        &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), b'\xff'.try_into())),
     );
 }
 
@@ -382,7 +385,7 @@ fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) {
     bench_unescape_c_str(
         b,
         r"\u{1f980}",
-        &array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))),
+        &array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), '🦀'.try_into())),
     );
 }
 
@@ -399,14 +402,11 @@ fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) {
             let mut i = 0;
             move || {
                 let res = Some(match i % n {
-                    0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))),
-                    1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))),
-                    2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))),
-                    3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))),
-                    4 => (
-                        i / n * l + 24..i / n * l + 28,
-                        Ok(MixedUnit::HighByte(b'\xff')),
-                    ),
+                    0 => (i / n * l..i / n * l + 2, '\n'.try_into()),
+                    1 => (i / n * l + 2..i / n * l + 6, '"'.try_into()),
+                    2 => (i / n * l + 6..i / n * l + 15, '🦀'.try_into()),
+                    3 => (i / n * l + 15..i / n * l + 24, '🚀'.try_into()),
+                    4 => (i / n * l + 24..i / n * l + 28, b'\xff'.try_into()),
                     r if r >= n => unreachable!(),
                     _ => unimplemented!(),
                 });
diff --git a/src/lib.rs b/src/lib.rs
@@ -2,6 +2,7 @@
 //! turning escape sequences into the values they represent.
 
 use std::ffi::CStr;
+use std::num::NonZero;
 use std::ops::Range;
 use std::str::Chars;
 
@@ -105,7 +106,10 @@ pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
 /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
-pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+pub fn check_raw_c_str(
+    src: &str,
+    callback: impl FnMut(Range<usize>, Result<NonZero<char>, EscapeError>),
+) {
     CStr::check_raw(src, callback);
 }
 
@@ -178,14 +182,10 @@ fn char2byte(c: char) -> Result<u8, EscapeError> {
 }
 
 impl CheckRaw for CStr {
-    type RawUnit = char;
+    type RawUnit = NonZero<char>;
 
     fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
-        if c == '\0' {
-            Err(EscapeError::NulInCStr)
-        } else {
-            Ok(c)
-        }
+        NonZero::new(c).ok_or(EscapeError::NulInCStr)
     }
 }
 
@@ -247,40 +247,63 @@ pub enum MixedUnit {
     /// For example, if '¥' appears in a string it is represented here as
     /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
     /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
-    Char(char),
+    Char(NonZero<char>),
 
     /// Used for high bytes (`\x80`..`\xff`).
     ///
     /// For example, if `\xa5` appears in a string it is represented here as
     /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
     /// byte string as the single byte `0xa5`.
-    HighByte(u8),
+    HighByte(NonZero<u8>),
 }
 
-impl From<char> for MixedUnit {
-    fn from(c: char) -> Self {
+impl From<NonZero<char>> for MixedUnit {
+    fn from(c: NonZero<char>) -> Self {
         MixedUnit::Char(c)
     }
 }
 
-impl From<u8> for MixedUnit {
-    fn from(n: u8) -> Self {
-        if n.is_ascii() {
-            MixedUnit::Char(n as char)
+impl From<NonZero<u8>> for MixedUnit {
+    fn from(byte: NonZero<u8>) -> Self {
+        if byte.get().is_ascii() {
+            MixedUnit::Char(NonZero::new(byte.get() as char).unwrap())
         } else {
-            MixedUnit::HighByte(n)
+            MixedUnit::HighByte(byte)
         }
     }
 }
 
+impl TryFrom<char> for MixedUnit {
+    type Error = EscapeError;
+
+    fn try_from(c: char) -> Result<Self, EscapeError> {
+        NonZero::new(c)
+            .map(MixedUnit::Char)
+            .ok_or(EscapeError::NulInCStr)
+    }
+}
+
+impl TryFrom<u8> for MixedUnit {
+    type Error = EscapeError;
+
+    fn try_from(byte: u8) -> Result<Self, EscapeError> {
+        NonZero::new(byte)
+            .map(From::from)
+            .ok_or(EscapeError::NulInCStr)
+    }
+}
+
 /// Trait for unescaping escape sequences in strings
 trait Unescape {
     /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
-    type Unit: From<u8>;
+    type Unit;
 
     /// Result of unescaping the zero char ('\0')
     const ZERO_RESULT: Result<Self::Unit, EscapeError>;
 
+    /// Converts a non-zero byte (as produced by a simple escape such as `\n`) to the unit type
+    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit;
+
     /// Converts chars to the unit type
     fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
 
@@ -311,18 +334,20 @@ trait Unescape {
         if c == '0' {
             Self::ZERO_RESULT
         } else {
-            simple_escape(c).map(|b| b.into()).or_else(|c| match c {
-                'x' => Self::hex2unit(hex_escape(chars)?),
-                'u' => Self::unicode2unit({
-                    let value = unicode_escape(chars)?;
-                    if value > char::MAX as u32 {
-                        Err(EscapeError::OutOfRangeUnicodeEscape)
-                    } else {
-                        char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
-                    }
-                }),
-                _ => Err(EscapeError::InvalidEscape),
-            })
+            simple_escape(c)
+                .map(|b| Self::nonzero_byte2unit(b))
+                .or_else(|c| match c {
+                    'x' => Self::hex2unit(hex_escape(chars)?),
+                    'u' => Self::unicode2unit({
+                        let value = unicode_escape(chars)?;
+                        if value > char::MAX as u32 {
+                            Err(EscapeError::OutOfRangeUnicodeEscape)
+                        } else {
+                            char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
+                        }
+                    }),
+                    _ => Err(EscapeError::InvalidEscape),
+                })
         }
     }
 
@@ -364,9 +389,9 @@ trait Unescape {
 /// Interpret a non-nul ASCII escape
 ///
 /// Parses the character of an ASCII escape (except nul) without the leading backslash.
-fn simple_escape(c: char) -> Result<u8, char> {
+fn simple_escape(c: char) -> Result<NonZero<u8>, char> {
     // Previous character was '\\', unescape what follows.
-    Ok(match c {
+    Ok(NonZero::new(match c {
         '"' => b'"',
         'n' => b'\n',
         'r' => b'\r',
@@ -375,6 +400,7 @@ fn simple_escape(c: char) -> Result<u8, char> {
         '\'' => b'\'',
         _ => Err(c)?,
     })
+    .unwrap())
 }
 
 /// Interpret a hexadecimal escape
@@ -476,6 +502,10 @@ impl Unescape for str {
 
     const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
 
+    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
+        b.get().into()
+    }
+
     fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
         Ok(c)
     }
@@ -499,6 +529,10 @@ impl Unescape for [u8] {
 
     const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
 
+    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
+        b.get()
+    }
+
     fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
         char2byte(c)
     }
@@ -518,22 +552,16 @@ impl Unescape for CStr {
 
     const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
 
+    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
+        b.into()
+    }
+
     fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
-        if c == '\0' {
-            Err(EscapeError::NulInCStr)
-        } else {
-            Ok(MixedUnit::Char(c))
-        }
+        c.try_into()
     }
 
     fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
-        if byte == b'\0' {
-            Err(EscapeError::NulInCStr)
-        } else if byte.is_ascii() {
-            Ok(MixedUnit::Char(byte as char))
-        } else {
-            Ok(MixedUnit::HighByte(byte))
-        }
+        byte.try_into()
     }
 
     /// Converts the result of a unicode escape to the unit type