From a49b8d8bc9462d00e4878b6f4bcfcf729610b93f Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Tue, 13 May 2025 13:37:27 +0000
Subject: [PATCH 1/7] New API which does not expose `unreachable`

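The old API exposes `unreachable!()` in both `unescape_unicode` and `unescape_mixed`:
each accepts any `Mode`, but panics when called with a mode it does not handle.
The two are conceptually one function, but because their callback result types
(`char` vs. `MixedUnit`) are incompatible, they could not be unified.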

The new API takes this insight further and splits `unescape_unicode` into one function
per literal kind, so that the byte-string functions can return bytes (`u8`) instead of `char`s.
---
 CHANGELOG.md       |   7 +
 Cargo.lock         |   2 +-
 Cargo.toml         |   2 +-
 benches/benches.rs | 372 +++++++++++++++++++++++++++++++++++++--------
 src/lib.rs         | 100 +++++++++++-
 src/tests.rs       |  32 ++--
 6 files changed, 428 insertions(+), 87 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5778083..d5bbc69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# 0.0.4
+
+- Add `check_raw_str`, `check_raw_byte_str`, `check_raw_c_str`,
+- Add `unescape_str`, `unescape_byte_str`, `unescape_c_str`,
+- Add `unescape_for_errors`,
+- Remove: `unescape_unicode` and `unescape_mixed`
+
 # 0.0.3
 
 - Extend `rustc-dep-of-std` feature to include `libcore`
diff --git a/Cargo.lock b/Cargo.lock
index 628ead5..f8473de 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,7 +4,7 @@ version = 4
 
 [[package]]
 name = "rustc-literal-escaper"
-version = "0.0.3"
+version = "0.0.4"
 dependencies = [
  "rustc-std-workspace-core",
  "rustc-std-workspace-std",
diff --git a/Cargo.toml b/Cargo.toml
index facc7a4..6201905 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rustc-literal-escaper"
-version = "0.0.3"
+version = "0.0.4"
 edition = "2021"
 description = "Provides code to unescape string literals"
 license = "Apache-2.0 OR MIT"
diff --git a/benches/benches.rs b/benches/benches.rs
index a028dfd..ecaef3e 100644
--- a/benches/benches.rs
+++ b/benches/benches.rs
@@ -3,7 +3,8 @@
 extern crate test;
 
 use rustc_literal_escaper::*;
-use std::iter::repeat_n;
+use std::ops::Range;
+use std::{array, iter};
 
 const LEN: usize = 10_000;
 
@@ -23,9 +24,7 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) {
         // skip_ascii_whitespace(&mut input.chars(), 0, &mut |range, res| {
         //     output.push((range, res))
         // });
-        unescape_unicode(&input, Mode::Str, &mut |range, res| {
-            output.push((range, res))
-        });
+        unescape_str(&input, |range, res| output.push((range, res)));
         assert_eq!(
             output,
             [((0..LEN + 2), Err(EscapeError::MultipleSkippedLinesWarning))]
@@ -37,138 +36,385 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) {
 // Check raw
 //
 
-fn bench_check_raw(b: &mut test::Bencher, c: char, mode: Mode) {
-    let input: String = test::black_box(repeat_n(c, LEN).collect());
-    assert_eq!(input.len(), LEN * c.len_utf8());
-    b.iter(|| {
-        let mut output = vec![];
-        unescape_unicode(&input, mode, &mut |range, res| output.push((range, res)));
-        assert_eq!(output.len(), LEN);
-        assert_eq!(output[0], ((0..c.len_utf8()), Ok(c)));
-    });
+macro_rules! fn_bench_check_raw {
+    ($name:ident, $unit:ty, $check_raw:ident) => {
+        fn $name(b: &mut test::Bencher, s: &str, expected: &[$unit]) {
+            let input: String = test::black_box([s; LEN].join(""));
+            assert_eq!(input.len(), LEN * s.len());
+            b.iter(|| {
+                let mut output = Vec::with_capacity(expected.len());
+
+                $check_raw(&input, |range, res| output.push((range, res)));
+                assert_eq!(output.len(), LEN * s.chars().count());
+
+                // check that the output is what is expected and comes from the right input bytes
+                for ((i, &e), (p, c)) in expected.iter().enumerate().zip(s.char_indices()) {
+                    assert_eq!(output[i], ((p..p + c.len_utf8()), Ok(e)));
+                }
+            });
+        }
+    };
 }
 
+fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str);
+fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str);
+fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str);
+
 // raw str
 
 #[bench]
 fn bench_check_raw_str_ascii(b: &mut test::Bencher) {
-    bench_check_raw(b, 'a', Mode::RawStr);
+    bench_check_raw_str(b, "a", &['a'; LEN]);
+}
+
+#[bench]
+fn bench_check_raw_str_non_ascii(b: &mut test::Bencher) {
+    bench_check_raw_str(b, "🦀", &['🦀'; LEN]);
 }
 
 #[bench]
 fn bench_check_raw_str_unicode(b: &mut test::Bencher) {
-    bench_check_raw(b, '🦀', Mode::RawStr);
+    bench_check_raw_str(
+        b,
+        "a🦀🚀z",
+        &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
+            0 => 'a',
+            1 => '🦀',
+            2 => '🚀',
+            3 => 'z',
+            _ => unreachable!(),
+        }),
+    );
 }
 
 // raw byte str
 
 #[bench]
-fn bench_check_raw_byte_str(b: &mut test::Bencher) {
-    bench_check_raw(b, 'a', Mode::RawByteStr);
+fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) {
+    bench_check_raw_byte_str(b, "a", &[b'a'; LEN]);
 }
 
 // raw C str
 
 #[bench]
 fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) {
-    bench_check_raw(b, 'a', Mode::RawCStr);
+    bench_check_raw_c_str(b, "a", &['a'; LEN]);
+}
+
+#[bench]
+fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) {
+    bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]);
 }
 
 #[bench]
 fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) {
-    bench_check_raw(b, '🦀', Mode::RawCStr);
+    bench_check_raw_c_str(
+        b,
+        "a🦀🚀z",
+        &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
+            0 => 'a',
+            1 => '🦀',
+            2 => '🚀',
+            3 => 'z',
+            _ => unreachable!(),
+        }),
+    );
 }
 
 //
 // Unescape
 //
 
-fn bench_unescape(b: &mut test::Bencher, s: &str, mode: Mode, expected: char) {
-    let input: String = test::black_box(repeat_n(s, LEN).collect());
-    assert_eq!(input.len(), LEN * s.len());
-    b.iter(|| {
-        let mut output = vec![];
-        unescape_unicode(&input, mode, &mut |range, res| output.push((range, res)));
-        assert_eq!(output.len(), LEN);
-        assert_eq!(output[0], ((0..s.len()), Ok(expected)));
-    });
+macro_rules! fn_bench_unescape {
+    ($name:ident, $unit:ty, $unescape:ident) => {
+        fn $name(
+            b: &mut test::Bencher,
+            s: &str,
+            expected: &[(Range<usize>, Result<$unit, EscapeError>)],
+        ) {
+            let input: String = test::black_box([s; LEN].join(""));
+            b.iter(|| {
+                let mut output = Vec::with_capacity(expected.len());
+
+                $unescape(&input, |range, res| output.push((range, res)));
+                //assert_eq!(output.len(), LEN * s.chars().count());
+
+                // check that the output is what is expected and comes from the right input bytes
+                for (i, e) in expected.iter().enumerate() {
+                    assert_eq!(output[i], *e);
+                }
+            });
+        }
+    };
 }
 
+fn_bench_unescape!(bench_unescape_str, char, unescape_str);
+fn_bench_unescape!(bench_unescape_byte_str, u8, unescape_byte_str);
+fn_bench_unescape!(bench_unescape_c_str, MixedUnit, unescape_c_str);
+
 // str
 
 #[bench]
-fn bench_unescape_str_trivial(b: &mut test::Bencher) {
-    bench_unescape(b, r"a", Mode::Str, 'a');
+fn bench_unescape_str_ascii(b: &mut test::Bencher) {
+    bench_unescape_str(
+        b,
+        r"a",
+        &array::from_fn::<_, LEN, _>(|i| (i..i + 1, Ok('a'))),
+    );
 }
 
 #[bench]
-fn bench_unescape_str_ascii(b: &mut test::Bencher) {
-    bench_unescape(b, r"\n", Mode::Str, '\n');
+fn bench_unescape_str_non_ascii(b: &mut test::Bencher) {
+    bench_unescape_str(
+        b,
+        r"🦀",
+        &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('🦀'))),
+    );
 }
 
 #[bench]
-fn bench_unescape_str_hex(b: &mut test::Bencher) {
-    bench_unescape(b, r"\x22", Mode::Str, '"');
+fn bench_unescape_str_unicode(b: &mut test::Bencher) {
+    let input = "a🦀🚀z";
+    let l = input.len();
+    bench_unescape_str(
+        b,
+        input,
+        &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
+            0 => (i / 4 * l..i / 4 * l + 1, Ok('a')),
+            1 => (i / 4 * l + 1..i / 4 * l + 5, Ok('🦀')),
+            2 => (i / 4 * l + 5..i / 4 * l + 9, Ok('🚀')),
+            3 => (i / 4 * l + 9..i / 4 * l + 10, Ok('z')),
+            _ => unreachable!(),
+        }),
+    );
 }
 
 #[bench]
-fn bench_unescape_str_unicode(b: &mut test::Bencher) {
-    bench_unescape(b, r"\u{1f980}", Mode::Str, '🦀');
+fn bench_unescape_str_ascii_escape(b: &mut test::Bencher) {
+    bench_unescape_str(
+        b,
+        r"\n",
+        &array::from_fn::<_, LEN, _>(|i| (2 * i..2 * (i + 1), Ok('\n'))),
+    );
 }
 
-// byte str
+#[bench]
+fn bench_unescape_str_hex_escape(b: &mut test::Bencher) {
+    bench_unescape_str(
+        b,
+        r"\x22",
+        &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('"'))),
+    );
+}
+
+#[bench]
+fn bench_unescape_str_unicode_escape(b: &mut test::Bencher) {
+    let input = r"\u{1f980}\u{1f680}";
+    let l = input.len();
+    bench_unescape_str(
+        b,
+        input,
+        &array::from_fn::<_, LEN, _>(|i| {
+            if i % 2 == 0 {
+                (i / 2 * l..i / 2 * l + 9, Ok('🦀'))
+            } else {
+                (i / 2 * l + 9..i / 2 * l + 18, Ok('🚀'))
+            }
+        }),
+    );
+}
 
 #[bench]
-fn bench_unescape_byte_str_trivial(b: &mut test::Bencher) {
-    bench_unescape(b, r"a", Mode::ByteStr, 'a');
+fn bench_unescape_str_mixed_escape(b: &mut test::Bencher) {
+    let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}"];
+    let n = inputs.len();
+    let input = inputs.join("");
+    let l = input.len();
+    bench_unescape_str(
+        b,
+        &input,
+        &iter::from_fn({
+            let mut i = 0;
+            move || {
+                let res = Some(match i % n {
+                    0 => (i / n * l..i / n * l + 2, Ok('\n')),
+                    1 => (i / n * l + 2..i / n * l + 6, Ok('"')),
+                    2 => (i / n * l + 6..i / n * l + 15, Ok('🦀')),
+                    3 => (i / n * l + 15..i / n * l + 24, Ok('🚀')),
+                    r if r >= n => unreachable!(),
+                    _ => unimplemented!(),
+                });
+                i += 1;
+                res
+            }
+        })
+        .take(n * LEN)
+        .collect::<Vec<_>>(),
+    );
 }
 
+// byte str
+
 #[bench]
 fn bench_unescape_byte_str_ascii(b: &mut test::Bencher) {
-    bench_unescape(b, r"\n", Mode::ByteStr, b'\n' as char);
+    bench_unescape_byte_str(
+        b,
+        r"a",
+        &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(b'a'))),
+    );
+}
+
+#[bench]
+fn bench_unescape_byte_str_ascii_escape(b: &mut test::Bencher) {
+    bench_unescape_byte_str(
+        b,
+        r"\n",
+        &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(b'\n'))),
+    );
 }
 
 #[bench]
-fn bench_unescape_byte_str_hex(b: &mut test::Bencher) {
-    bench_unescape(b, r"\xff", Mode::ByteStr, b'\xff' as char);
+fn bench_unescape_byte_str_hex_escape(b: &mut test::Bencher) {
+    bench_unescape_byte_str(
+        b,
+        r"\xff",
+        &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(b'\xff'))),
+    );
+}
+
+#[bench]
+fn bench_unescape_byte_str_mixed_escape(b: &mut test::Bencher) {
+    let inputs = [r"a", r"\n", r"\xff", r"z"];
+    let input = inputs.join("");
+    let n = inputs.len();
+    let l = input.len();
+    bench_unescape_byte_str(
+        b,
+        &input,
+        &iter::from_fn({
+            let mut i = 0;
+            move || {
+                let res = Some(match i % n {
+                    0 => (i / n * l..i / n * l + 1, Ok(b'a')),
+                    1 => (i / n * l + 1..i / n * l + 3, Ok(b'\n')),
+                    2 => (i / n * l + 3..i / n * l + 7, Ok(b'\xff')),
+                    3 => (i / n * l + 7..i / n * l + 8, Ok(b'z')),
+                    r if r >= n => unreachable!(),
+                    _ => unimplemented!(),
+                });
+                i += 1;
+                res
+            }
+        })
+        .take(n * LEN)
+        .collect::<Vec<_>>(),
+    );
 }
 
 // C str
 
-fn bench_unescape_c_str(b: &mut test::Bencher, s: &str, expected: MixedUnit) {
-    let input: String = test::black_box(repeat_n(s, LEN).collect());
-    assert_eq!(input.len(), LEN * s.len());
-    b.iter(|| {
-        let mut output = vec![];
-        unescape_mixed(&input, Mode::CStr, &mut |range, res| {
-            output.push((range, res))
-        });
-        assert_eq!(output.len(), LEN);
-        assert_eq!(output[0], ((0..s.len()), Ok(expected)));
-    });
+#[bench]
+fn bench_unescape_c_str_ascii(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"a",
+        &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))),
+    );
 }
 
 #[bench]
-fn bench_unescape_c_str_trivial(b: &mut test::Bencher) {
-    bench_unescape_c_str(b, r"a", MixedUnit::Char('a'));
+fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"🦀",
+        &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))),
+    );
 }
 
 #[bench]
-fn bench_unescape_c_str_ascii(b: &mut test::Bencher) {
-    bench_unescape_c_str(b, r"\n", MixedUnit::Char('\n'));
+fn bench_unescape_c_str_unicode(b: &mut test::Bencher) {
+    let input = "a🦀🚀z";
+    let l = input.len();
+    bench_unescape_c_str(
+        b,
+        input,
+        &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 {
+            0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))),
+            1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))),
+            2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))),
+            3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))),
+            _ => unreachable!(),
+        }),
+    );
 }
 
 #[bench]
-fn bench_unescape_c_str_hex_ascii(b: &mut test::Bencher) {
-    bench_unescape_c_str(b, r"\x22", MixedUnit::Char('"'));
+fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"\n",
+        &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))),
+    );
 }
 
 #[bench]
-fn bench_unescape_c_str_hex_byte(b: &mut test::Bencher) {
-    bench_unescape_c_str(b, r"\xff", MixedUnit::HighByte(b'\xff'));
+fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"\x22",
+        &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))),
+    );
 }
 
 #[bench]
-fn bench_unescape_c_str_unicode(b: &mut test::Bencher) {
-    bench_unescape_c_str(b, r"\u{1f980}", MixedUnit::Char('🦀'));
+fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"\xff",
+        &array::from_fn::<_, { LEN }, _>(|i| {
+            (4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff')))
+        }),
+    );
+}
+
+#[bench]
+fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) {
+    bench_unescape_c_str(
+        b,
+        r"\u{1f980}",
+        &array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))),
+    );
+}
+
+#[bench]
+fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) {
+    let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}", r"\xff"];
+    let n = inputs.len();
+    let input = inputs.join("");
+    let l = input.len();
+    bench_unescape_c_str(
+        b,
+        &input,
+        &iter::from_fn({
+            let mut i = 0;
+            move || {
+                let res = Some(match i % n {
+                    0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))),
+                    1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))),
+                    2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))),
+                    3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))),
+                    4 => (
+                        i / n * l + 24..i / n * l + 28,
+                        Ok(MixedUnit::HighByte(b'\xff')),
+                    ),
+                    r if r >= n => unreachable!(),
+                    _ => unimplemented!(),
+                });
+                i += 1;
+                res
+            }
+        })
+        .take(n * LEN)
+        .collect::<Vec<_>>(),
+    );
 }
diff --git a/src/lib.rs b/src/lib.rs
index d315ed2..25584eb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -80,12 +80,106 @@ impl EscapeError {
     }
 }
 
+/// Takes the contents of a literal (without quotes)
+/// and produces a sequence of errors,
+/// which are returned by invoking `error_callback`.
+pub fn unescape_for_errors(
+    src: &str,
+    mode: Mode,
+    mut error_callback: impl FnMut(Range<usize>, EscapeError),
+) {
+    match mode {
+        Char => {
+            let mut chars = src.chars();
+            if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Char) {
+                error_callback(0..(src.len() - chars.as_str().len()), e);
+            }
+        }
+        Byte => {
+            let mut chars = src.chars();
+            if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Byte) {
+                error_callback(0..(src.len() - chars.as_str().len()), e);
+            }
+        }
+        Str => unescape_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        ByteStr => unescape_byte_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        CStr => unescape_c_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        RawStr => check_raw_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        RawByteStr => check_raw_byte_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        RawCStr => check_raw_c_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+    }
+}
+
+pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+    unescape_unicode(src, Mode::RawStr, &mut callback)
+}
+
+pub fn check_raw_byte_str(
+    src: &str,
+    mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
+) {
+    unescape_unicode(src, Mode::RawByteStr, &mut |r, res| {
+        callback(r, res.map(byte_from_char))
+    })
+}
+
+pub fn check_raw_c_str(
+    src: &str,
+    mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>),
+) {
+    unescape_unicode(src, Mode::RawCStr, &mut callback)
+}
+
+pub fn unescape_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+    unescape_unicode(src, Mode::Str, &mut callback)
+}
+
+pub fn unescape_byte_str(
+    src: &str,
+    mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
+) {
+    unescape_unicode(src, Mode::ByteStr, &mut |r, res| {
+        callback(r, res.map(byte_from_char))
+    })
+}
+
+pub fn unescape_c_str(
+    src: &str,
+    mut callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
+) {
+    unescape_mixed(src, Mode::CStr, &mut callback)
+}
+
 /// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
 /// quotes) and produces a sequence of escaped characters or errors.
 ///
 /// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
 /// the callback will be called exactly once.
-pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
+fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
@@ -147,7 +241,7 @@ impl From<u8> for MixedUnit {
 /// a sequence of escaped characters or errors.
 ///
 /// Values are returned by invoking `callback`.
-pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
+fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 {
@@ -444,7 +538,7 @@ where
 }
 
 #[inline]
-pub fn byte_from_char(c: char) -> u8 {
+fn byte_from_char(c: char) -> u8 {
     let res = c as u32;
     debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
     res as u8
diff --git a/src/tests.rs b/src/tests.rs
index a4bbdc0..a13d8a5 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -100,9 +100,7 @@ fn test_unescape_char_good() {
 fn test_unescape_str_warn() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_unicode(literal, Mode::Str, &mut |range, res| {
-            unescaped.push((range, res))
-        });
+        unescape_str(literal, |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -132,7 +130,7 @@ fn test_unescape_str_warn() {
 fn test_unescape_str_good() {
     fn check(literal_text: &str, expected: &str) {
         let mut buf = Ok(String::with_capacity(literal_text.len()));
-        unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
+        unescape_str(literal_text, |range, c| {
             if let Ok(b) = &mut buf {
                 match c {
                     Ok(c) => b.push(c),
@@ -248,16 +246,16 @@ fn test_unescape_byte_good() {
 #[test]
 fn test_unescape_byte_str_good() {
     fn check(literal_text: &str, expected: &[u8]) {
-        let mut buf = Ok(Vec::with_capacity(literal_text.len()));
-        unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
-            if let Ok(b) = &mut buf {
-                match c {
-                    Ok(c) => b.push(byte_from_char(c)),
-                    Err(e) => buf = Err((range, e)),
+        let mut result = Ok(Vec::with_capacity(literal_text.len()));
+        unescape_byte_str(literal_text, |range, res| {
+            if let Ok(buf) = &mut result {
+                match res {
+                    Ok(b) => buf.push(b),
+                    Err(e) => result = Err((range, e)),
                 }
             }
         });
-        assert_eq!(buf.as_deref(), Ok(expected))
+        assert_eq!(result.as_deref(), Ok(expected))
     }
 
     check("foo", b"foo");
@@ -272,9 +270,7 @@ fn test_unescape_byte_str_good() {
 fn test_unescape_raw_str() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_unicode(literal, Mode::RawStr, &mut |range, res| {
-            unescaped.push((range, res))
-        });
+        check_raw_str(literal, |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -293,11 +289,9 @@ fn test_unescape_raw_str() {
 
 #[test]
 fn test_unescape_raw_byte_str() {
-    fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
+    fn check(literal: &str, expected: &[(Range<usize>, Result<u8, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| {
-            unescaped.push((range, res))
-        });
+        check_raw_byte_str(literal, |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -310,7 +304,7 @@ fn test_unescape_raw_byte_str() {
         "🦀a",
         &[
             (0..4, Err(EscapeError::NonAsciiCharInByte)),
-            (4..5, Ok('a')),
+            (4..5, Ok(b'a')),
         ],
     );
 }

From 617071840377f2f7fb94b0cdc5f0f78a9ac70358 Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Tue, 13 May 2025 15:00:50 +0000
Subject: [PATCH 2/7] inline unescape_{unicode,mixed} and move docs

---
 src/lib.rs | 87 ++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 51 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 25584eb..f0e011e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -134,71 +134,75 @@ pub fn unescape_for_errors(
     }
 }
 
+/// Takes the contents of a raw string literal (without quotes)
+/// and produces a sequence of characters or errors,
+/// which are returned by invoking `callback`.
+/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
 pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
-    unescape_unicode(src, Mode::RawStr, &mut callback)
+    check_raw_common(src, Mode::RawStr, &mut callback)
 }
 
+/// Takes the contents of a raw byte string literal (without quotes)
+/// and produces a sequence of bytes or errors,
+/// which are returned by invoking `callback`.
+/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
 pub fn check_raw_byte_str(
     src: &str,
     mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
 ) {
-    unescape_unicode(src, Mode::RawByteStr, &mut |r, res| {
+    check_raw_common(src, Mode::RawByteStr, &mut |r, res| {
         callback(r, res.map(byte_from_char))
     })
 }
 
+/// Takes the contents of a raw C string literal (without quotes)
+/// and produces a sequence of characters or errors,
+/// which are returned by invoking `callback`.
+/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
 pub fn check_raw_c_str(
     src: &str,
     mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>),
 ) {
-    unescape_unicode(src, Mode::RawCStr, &mut callback)
+    check_raw_common(src, Mode::RawCStr, &mut |r, mut result| {
+        if let Ok('\0') = result {
+            result = Err(EscapeError::NulInCStr);
+        }
+        callback(r, result)
+    })
 }
 
+/// Takes the contents of a string literal (without quotes)
+/// and produces a sequence of escaped characters or errors,
+/// which are returned by invoking `callback`.
 pub fn unescape_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
-    unescape_unicode(src, Mode::Str, &mut callback)
+    unescape_non_raw_common(src, Mode::Str, &mut callback)
 }
 
+/// Takes the contents of a byte string literal (without quotes)
+/// and produces a sequence of escaped bytes or errors,
+/// which are returned by invoking `callback`.
 pub fn unescape_byte_str(
     src: &str,
     mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
 ) {
-    unescape_unicode(src, Mode::ByteStr, &mut |r, res| {
+    unescape_non_raw_common(src, Mode::ByteStr, &mut |r, res| {
         callback(r, res.map(byte_from_char))
     })
 }
 
+/// Takes the contents of a C string literal (without quotes)
+/// and produces a sequence of escaped MixedUnits or errors,
+/// which are returned by invoking `callback`.
 pub fn unescape_c_str(
     src: &str,
     mut callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 ) {
-    unescape_mixed(src, Mode::CStr, &mut callback)
-}
-
-/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
-/// quotes) and produces a sequence of escaped characters or errors.
-///
-/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
-/// the callback will be called exactly once.
-fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<char, EscapeError>),
-{
-    match mode {
-        Char | Byte => {
-            let mut chars = src.chars();
-            let res = unescape_char_or_byte(&mut chars, mode);
-            callback(0..(src.len() - chars.as_str().len()), res);
+    unescape_non_raw_common(src, Mode::CStr, &mut |r, mut result| {
+        if let Ok(MixedUnit::Char('\0')) = result {
+            result = Err(EscapeError::NulInCStr);
         }
-        Str | ByteStr => unescape_non_raw_common(src, mode, callback),
-        RawStr | RawByteStr => check_raw_common(src, mode, callback),
-        RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
-            if let Ok('\0') = result {
-                result = Err(EscapeError::NulInCStr);
-            }
-            callback(r, result)
-        }),
-        CStr => unreachable!(),
-    }
+        callback(r, result)
+    })
 }
 
 /// Used for mixed utf8 string literals, i.e. those that allow both unicode
@@ -237,25 +241,6 @@ impl From<u8> for MixedUnit {
     }
 }
 
-/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
-/// a sequence of escaped characters or errors.
-///
-/// Values are returned by invoking `callback`.
-fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
-{
-    match mode {
-        CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
-            if let Ok(MixedUnit::Char('\0')) = result {
-                result = Err(EscapeError::NulInCStr);
-            }
-            callback(r, result)
-        }),
-        Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
-    }
-}
-
 /// Takes a contents of a char literal (without quotes), and returns an
 /// unescaped char or an error.
 pub fn unescape_char(src: &str) -> Result<char, EscapeError> {

From 115ae12e6d80befdb2ea17311495f95c8e85e1dc Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Wed, 14 May 2025 05:35:49 +0000
Subject: [PATCH 3/7] replace check_raw_common with trait

---
 src/lib.rs | 128 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 85 insertions(+), 43 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index f0e011e..acbd591 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 //! Utilities for validating string and char literals and turning them into
 //! values they represent.
 
+use std::ffi::CStr;
 use std::ops::Range;
 use std::str::Chars;
 
@@ -138,37 +139,94 @@ pub fn unescape_for_errors(
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
 /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
-pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
-    check_raw_common(src, Mode::RawStr, &mut callback)
+pub fn check_raw_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+    str::check_raw(src, callback);
 }
 
 /// Takes the contents of a raw byte string literal (without quotes)
 /// and produces a sequence of bytes or errors,
 /// which are returned by invoking `callback`.
 /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
-pub fn check_raw_byte_str(
-    src: &str,
-    mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
-) {
-    check_raw_common(src, Mode::RawByteStr, &mut |r, res| {
-        callback(r, res.map(byte_from_char))
-    })
+pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
+    <[u8]>::check_raw(src, callback);
 }
 
 /// Takes the contents of a raw C string literal (without quotes)
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
 /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
-pub fn check_raw_c_str(
-    src: &str,
-    mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>),
-) {
-    check_raw_common(src, Mode::RawCStr, &mut |r, mut result| {
-        if let Ok('\0') = result {
-            result = Err(EscapeError::NulInCStr);
+pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+    CStr::check_raw(src, callback);
+}
+
+/// trait for checking raw strings
+trait CheckRaw {
+    /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
+    type RawUnit;
+
+    /// Converts chars to the unit type of the literal type
+    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError>;
+
+    /// Takes the contents of a raw literal (without quotes)
+    /// and produces a sequence of `Result<Self::RawUnit, EscapeError>`
+    /// which are returned via `callback`.
+    ///
+    /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
+    fn check_raw(
+        src: &str,
+        mut callback: impl FnMut(Range<usize>, Result<Self::RawUnit, EscapeError>),
+    ) {
+        let mut chars = src.chars();
+        while let Some(c) = chars.next() {
+            let start = src.len() - chars.as_str().len() - c.len_utf8();
+            let res = match c {
+                '\r' => Err(EscapeError::BareCarriageReturnInRawString),
+                _ => Self::char2raw_unit(c),
+            };
+            let end = src.len() - chars.as_str().len();
+            callback(start..end, res);
         }
-        callback(r, result)
-    })
+
+        // Unfortunately, it is a bit unclear whether the following equivalent code is slower or faster: bug 141855
+        // src.char_indices().for_each(|(pos, c)| {
+        //     callback(
+        //         pos..pos + c.len_utf8(),
+        //         if c == '\r' {
+        //             Err(EscapeError::BareCarriageReturnInRawString)
+        //         } else {
+        //             Self::char2raw_unit(c)
+        //         },
+        //     );
+        // });
+    }
+}
+
+impl CheckRaw for str {
+    type RawUnit = char;
+
+    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
+        Ok(c)
+    }
+}
+
+impl CheckRaw for [u8] {
+    type RawUnit = u8;
+
+    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
+        char2byte(c)
+    }
+}
+
+impl CheckRaw for CStr {
+    type RawUnit = char;
+
+    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
+        if c == '\0' {
+            Err(EscapeError::NulInCStr)
+        } else {
+            Ok(c)
+        }
+    }
 }
 
 /// Takes the contents of a string literal (without quotes)
@@ -497,34 +555,18 @@ where
     *chars = tail.chars();
 }
 
-/// Takes a contents of a string literal (without quotes) and produces a
-/// sequence of characters or errors.
-/// NOTE: Raw strings do not perform any explicit character escaping, here we
-/// only produce errors on bare CR.
-fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<char, EscapeError>),
-{
-    let mut chars = src.chars();
-    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
-
-    // The `start` and `end` computation here matches the one in
-    // `unescape_non_raw_common` for consistency, even though this function
-    // doesn't have to worry about skipping any chars.
-    while let Some(c) = chars.next() {
-        let start = src.len() - chars.as_str().len() - c.len_utf8();
-        let res = match c {
-            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            _ => ascii_check(c, allow_unicode_chars),
-        };
-        let end = src.len() - chars.as_str().len();
-        callback(start..end, res);
-    }
-}
-
 #[inline]
 fn byte_from_char(c: char) -> u8 {
     let res = c as u32;
     debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
     res as u8
 }
+
+fn char2byte(c: char) -> Result<u8, EscapeError> {
+    // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte)
+    if c.is_ascii() {
+        Ok(c as u8)
+    } else {
+        Err(EscapeError::NonAsciiCharInByte)
+    }
+}

From 38012186467885992d9168dc6388d62f91936b9b Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Wed, 14 May 2025 09:49:43 +0000
Subject: [PATCH 4/7] replace unescape_{char,byte} and check_non_raw_common
 with trait and remove unused Mode methods

---
 src/lib.rs | 414 +++++++++++++++++++++++++++++------------------------
 1 file changed, 223 insertions(+), 191 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index acbd591..ce88d2b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -92,13 +92,13 @@ pub fn unescape_for_errors(
     match mode {
         Char => {
             let mut chars = src.chars();
-            if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Char) {
+            if let Err(e) = str::unescape_single(&mut chars) {
                 error_callback(0..(src.len() - chars.as_str().len()), e);
             }
         }
         Byte => {
             let mut chars = src.chars();
-            if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Byte) {
+            if let Err(e) = <[u8]>::unescape_single(&mut chars) {
                 error_callback(0..(src.len() - chars.as_str().len()), e);
             }
         }
@@ -229,23 +229,30 @@ impl CheckRaw for CStr {
     }
 }
 
+/// Takes the contents of a char literal (without quotes),
+/// and returns an unescaped char or an error.
+pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
+    str::unescape_single(&mut src.chars())
+}
+
+/// Takes the contents of a byte literal (without quotes),
+/// and returns an unescaped byte or an error.
+pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
+    <[u8]>::unescape_single(&mut src.chars())
+}
+
 /// Takes the contents of a string literal (without quotes)
 /// and produces a sequence of escaped characters or errors,
 /// which are returned by invoking `callback`.
-pub fn unescape_str(src: &str, mut callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
-    unescape_non_raw_common(src, Mode::Str, &mut callback)
+pub fn unescape_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
+    str::unescape(src, callback)
 }
 
 /// Takes the contents of a byte string literal (without quotes)
 /// and produces a sequence of escaped bytes or errors,
 /// which are returned by invoking `callback`.
-pub fn unescape_byte_str(
-    src: &str,
-    mut callback: impl FnMut(Range<usize>, Result<u8, EscapeError>),
-) {
-    unescape_non_raw_common(src, Mode::ByteStr, &mut |r, res| {
-        callback(r, res.map(byte_from_char))
-    })
+pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
+    <[u8]>::unescape(src, callback)
 }
 
 /// Takes the contents of a C string literal (without quotes)
@@ -253,14 +260,166 @@ pub fn unescape_byte_str(
 /// which are returned by invoking `callback`.
 pub fn unescape_c_str(
     src: &str,
-    mut callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
+    callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 ) {
-    unescape_non_raw_common(src, Mode::CStr, &mut |r, mut result| {
-        if let Ok(MixedUnit::Char('\0')) = result {
-            result = Err(EscapeError::NulInCStr);
+    CStr::unescape(src, callback)
+}
+
+/// trait for unescaping escape sequences in strings
+trait Unescape {
+    /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
+    type Unit: From<u8>;
+
+    /// Result of unescaping the zero char ('\0')
+    const ZERO_RESULT: Result<Self::Unit, EscapeError>;
+
+    /// Converts chars to the unit type
+    fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
+
+    /// Converts the byte of a hex escape to the unit type
+    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError>;
+
+    /// Converts the result of a unicode escape to the unit type
+    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError>;
+
+    /// Unescape a single unit (single quote syntax)
+    fn unescape_single(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
+        let res = match chars.next().ok_or(EscapeError::ZeroChars)? {
+            '\\' => Self::unescape_1(chars),
+            '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
+            '\r' => Err(EscapeError::BareCarriageReturn),
+            c => Self::char2unit(c),
+        }?;
+        if chars.next().is_some() {
+            return Err(EscapeError::MoreThanOneChar);
         }
-        callback(r, result)
-    })
+        Ok(res)
+    }
+
+    /// Unescape the first unit of a string (double quoted syntax)
+    fn unescape_1(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
+        // Previous character was '\\', unescape what follows.
+        let c = chars.next().ok_or(EscapeError::LoneSlash)?;
+        if c == '0' {
+            Self::ZERO_RESULT
+        } else {
+            simple_escape(c).map(|b| b.into()).or_else(|c| match c {
+                'x' => Self::hex2unit(hex_escape(chars)?),
+                'u' => Self::unicode2unit({
+                    let value = unicode_escape(chars)?;
+                    if value > char::MAX as u32 {
+                        Err(EscapeError::OutOfRangeUnicodeEscape)
+                    } else {
+                        char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
+                    }
+                }),
+                _ => Err(EscapeError::InvalidEscape),
+            })
+        }
+    }
+
+    /// Takes the contents of a non-raw literal (without quotes)
+    /// and produces a sequence of `Result<Self::Unit, EscapeError>`
+    /// which are returned via `callback`.
+    fn unescape(
+        src: &str,
+        mut callback: impl FnMut(Range<usize>, Result<Self::Unit, EscapeError>),
+    ) {
+        let mut chars = src.chars();
+        while let Some(c) = chars.next() {
+            let start = src.len() - chars.as_str().len() - c.len_utf8();
+            let res = match c {
+                '\\' => {
+                    if let Some(b'\n') = chars.as_str().as_bytes().first() {
+                        let _ = chars.next();
+                        // skip whitespace for backslash newline, see [Rust language reference]
+                        // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
+                        let mut callback_err = |range, err| callback(range, Err(err));
+                        skip_ascii_whitespace(&mut chars, start, &mut callback_err);
+                        continue;
+                    } else {
+                        Self::unescape_1(&mut chars)
+                    }
+                }
+                '"' => Err(EscapeError::EscapeOnlyChar),
+                '\r' => Err(EscapeError::BareCarriageReturn),
+                c => Self::char2unit(c),
+            };
+            let end = src.len() - chars.as_str().len();
+            callback(start..end, res);
+        }
+    }
+}
+
+impl Unescape for str {
+    type Unit = char;
+
+    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
+
+    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
+        Ok(c)
+    }
+
+    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
+        if b.is_ascii() {
+            Ok(b as char)
+        } else {
+            Err(EscapeError::OutOfRangeHexEscape)
+        }
+    }
+
+    /// Converts the result of a unicode escape to the unit type
+    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
+        r
+    }
+}
+
+impl Unescape for [u8] {
+    type Unit = u8;
+
+    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
+
+    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
+        char2byte(c)
+    }
+
+    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
+        Ok(b)
+    }
+
+    /// Converts the result of a unicode escape to the unit type
+    fn unicode2unit(_r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
+        Err(EscapeError::UnicodeEscapeInByte)
+    }
+}
+
+impl Unescape for CStr {
+    type Unit = MixedUnit;
+
+    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
+
+    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
+        if c == '\0' {
+            Err(EscapeError::NulInCStr)
+        } else {
+            Ok(MixedUnit::Char(c))
+        }
+    }
+
+    fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
+        if byte == b'\0' {
+            Err(EscapeError::NulInCStr)
+        } else if byte.is_ascii() {
+            Ok(MixedUnit::Char(byte as char))
+        } else {
+            Ok(MixedUnit::HighByte(byte))
+        }
+    }
+
+    /// Converts the result of a unicode escape to the unit type
+    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
+        Self::char2unit(r?)
+    }
 }
 
 /// Used for mixed utf8 string literals, i.e. those that allow both unicode
@@ -299,18 +458,6 @@ impl From<u8> for MixedUnit {
     }
 }
 
-/// Takes a contents of a char literal (without quotes), and returns an
-/// unescaped char or an error.
-pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Char)
-}
-
-/// Takes a contents of a byte literal (without quotes), and returns an
-/// unescaped byte or an error.
-pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
-}
-
 /// What kind of literal do we parse.
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
@@ -336,33 +483,6 @@ impl Mode {
         }
     }
 
-    /// Are `\x80`..`\xff` allowed?
-    fn allow_high_bytes(self) -> bool {
-        match self {
-            Char | Str => false,
-            Byte | ByteStr | CStr => true,
-            RawStr | RawByteStr | RawCStr => unreachable!(),
-        }
-    }
-
-    /// Are unicode (non-ASCII) chars allowed?
-    #[inline]
-    fn allow_unicode_chars(self) -> bool {
-        match self {
-            Byte | ByteStr | RawByteStr => false,
-            Char | Str | RawStr | CStr | RawCStr => true,
-        }
-    }
-
-    /// Are unicode escapes (`\u`) allowed?
-    fn allow_unicode_escapes(self) -> bool {
-        match self {
-            Byte | ByteStr => false,
-            Char | Str | CStr => true,
-            RawByteStr | RawStr | RawCStr => unreachable!(),
-        }
-    }
-
     pub fn prefix_noraw(self) -> &'static str {
         match self {
             Char | Str | RawStr => "",
@@ -372,53 +492,39 @@ impl Mode {
     }
 }
 
-fn scan_escape<T: From<char> + From<u8>>(
-    chars: &mut Chars<'_>,
-    mode: Mode,
-) -> Result<T, EscapeError> {
+/// Parse the character of an ASCII escape (except nul) without the leading backslash.
+fn simple_escape(c: char) -> Result<u8, char> {
     // Previous character was '\\', unescape what follows.
-    let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
-        '"' => '"',
-        'n' => '\n',
-        'r' => '\r',
-        't' => '\t',
-        '\\' => '\\',
-        '\'' => '\'',
-        '0' => '\0',
-        'x' => {
-            // Parse hexadecimal character code.
-
-            let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-            let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
-
-            let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-            let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
-
-            let value = (hi * 16 + lo) as u8;
-
-            return if !mode.allow_high_bytes() && !value.is_ascii() {
-                Err(EscapeError::OutOfRangeHexEscape)
-            } else {
-                // This may be a high byte, but that will only happen if `T` is
-                // `MixedUnit`, because of the `allow_high_bytes` check above.
-                Ok(T::from(value))
-            };
-        }
-        'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
-        _ => return Err(EscapeError::InvalidEscape),
-    };
-    Ok(T::from(res))
+    Ok(match c {
+        '"' => b'"',
+        'n' => b'\n',
+        'r' => b'\r',
+        't' => b'\t',
+        '\\' => b'\\',
+        '\'' => b'\'',
+        _ => Err(c)?,
+    })
 }
 
-fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
-    // We've parsed '\u', now we have to parse '{..}'.
+/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
+fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
+    let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 
+    let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+
+    Ok((hi * 16 + lo) as u8)
+}
+
+/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
+/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
+fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
     if chars.next() != Some('{') {
         return Err(EscapeError::NoBraceInUnicodeEscape);
     }
 
     // First character must be a hexadecimal digit.
-    let mut n_digits = 1;
     let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
         '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
         '}' => return Err(EscapeError::EmptyUnicodeEscape),
@@ -429,28 +535,19 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
 
     // First character is valid, now parse the rest of the number
     // and closing brace.
+    let mut n_digits = 1;
     loop {
         match chars.next() {
             None => return Err(EscapeError::UnclosedUnicodeEscape),
             Some('_') => continue,
             Some('}') => {
-                if n_digits > 6 {
-                    return Err(EscapeError::OverlongUnicodeEscape);
-                }
-
                 // Incorrect syntax has higher priority for error reporting
                 // than unallowed value for a literal.
-                if !allow_unicode_escapes {
-                    return Err(EscapeError::UnicodeEscapeInByte);
-                }
-
-                break std::char::from_u32(value).ok_or({
-                    if value > 0x10FFFF {
-                        EscapeError::OutOfRangeUnicodeEscape
-                    } else {
-                        EscapeError::LoneSurrogateUnicodeEscape
-                    }
-                });
+                return if n_digits > 6 {
+                    Err(EscapeError::OverlongUnicodeEscape)
+                } else {
+                    Ok(value)
+                };
             }
             Some(c) => {
                 let digit: u32 = c
@@ -467,99 +564,34 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
     }
 }
 
-#[inline]
-fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
-    if allow_unicode_chars || c.is_ascii() {
-        Ok(c)
-    } else {
-        Err(EscapeError::NonAsciiCharInByte)
-    }
-}
-
-fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
-    let c = chars.next().ok_or(EscapeError::ZeroChars)?;
-    let res = match c {
-        '\\' => scan_escape(chars, mode),
-        '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
-        '\r' => Err(EscapeError::BareCarriageReturn),
-        _ => ascii_check(c, mode.allow_unicode_chars()),
-    }?;
-    if chars.next().is_some() {
-        return Err(EscapeError::MoreThanOneChar);
-    }
-    Ok(res)
-}
-
-/// Takes a contents of a string literal (without quotes) and produces a
-/// sequence of escaped characters or errors.
-fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<T, EscapeError>),
-{
-    let mut chars = src.chars();
-    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
-
-    // The `start` and `end` computation here is complicated because
-    // `skip_ascii_whitespace` makes us to skip over chars without counting
-    // them in the range computation.
-    while let Some(c) = chars.next() {
-        let start = src.len() - chars.as_str().len() - c.len_utf8();
-        let res = match c {
-            '\\' => {
-                match chars.clone().next() {
-                    Some('\n') => {
-                        // Rust language specification requires us to skip whitespaces
-                        // if unescaped '\' character is followed by '\n'.
-                        // For details see [Rust language reference]
-                        // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
-                        skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
-                            callback(range, Err(err))
-                        });
-                        continue;
-                    }
-                    _ => scan_escape::<T>(&mut chars, mode),
-                }
-            }
-            '"' => Err(EscapeError::EscapeOnlyChar),
-            '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, allow_unicode_chars).map(T::from),
-        };
-        let end = src.len() - chars.as_str().len();
-        callback(start..end, res);
-    }
-}
-
+/// Skip ASCII whitespace, except for the formfeed character
+/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
+/// Warns on unescaped newline and following non-ASCII whitespace.
 fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
 where
     F: FnMut(Range<usize>, EscapeError),
 {
-    let tail = chars.as_str();
-    let first_non_space = tail
+    let rest = chars.as_str();
+    let first_non_space = rest
         .bytes()
         .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
-        .unwrap_or(tail.len());
-    if tail[1..first_non_space].contains('\n') {
-        // The +1 accounts for the escaping slash.
-        let end = start + first_non_space + 1;
+        .unwrap_or(rest.len());
+    let (space, rest) = rest.split_at(first_non_space);
+    // backslash newline adds 2 bytes
+    let end = start + 2 + first_non_space;
+    if space.contains('\n') {
         callback(start..end, EscapeError::MultipleSkippedLinesWarning);
     }
-    let tail = &tail[first_non_space..];
-    if let Some(c) = tail.chars().next() {
+    *chars = rest.chars();
+    if let Some(c) = chars.clone().next() {
         if c.is_whitespace() {
-            // For error reporting, we would like the span to contain the character that was not
-            // skipped. The +1 is necessary to account for the leading \ that started the escape.
-            let end = start + first_non_space + c.len_utf8() + 1;
-            callback(start..end, EscapeError::UnskippedWhitespaceWarning);
+            // for error reporting, include the character that was not skipped in the span
+            callback(
+                start..end + c.len_utf8(),
+                EscapeError::UnskippedWhitespaceWarning,
+            );
         }
     }
-    *chars = tail.chars();
-}
-
-#[inline]
-fn byte_from_char(c: char) -> u8 {
-    let res = c as u32;
-    debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
-    res as u8
 }
 
 fn char2byte(c: char) -> Result<u8, EscapeError> {

From 67eadd0eecdaf7501670b2621429bb531fb44040 Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Wed, 14 May 2025 10:07:25 +0000
Subject: [PATCH 5/7] do not use Mode::* and move stuff around for better
 organisation

---
 src/lib.rs | 407 +++++++++++++++++++++++++++--------------------------
 1 file changed, 205 insertions(+), 202 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index ce88d2b..2137242 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,8 +5,6 @@ use std::ffi::CStr;
 use std::ops::Range;
 use std::str::Chars;
 
-use Mode::*;
-
 #[cfg(test)]
 mod tests;
 
@@ -81,60 +79,6 @@ impl EscapeError {
     }
 }
 
-/// Takes the contents of a literal (without quotes)
-/// and produces a sequence of errors,
-/// which are returned by invoking `error_callback`.
-pub fn unescape_for_errors(
-    src: &str,
-    mode: Mode,
-    mut error_callback: impl FnMut(Range<usize>, EscapeError),
-) {
-    match mode {
-        Char => {
-            let mut chars = src.chars();
-            if let Err(e) = str::unescape_single(&mut chars) {
-                error_callback(0..(src.len() - chars.as_str().len()), e);
-            }
-        }
-        Byte => {
-            let mut chars = src.chars();
-            if let Err(e) = <[u8]>::unescape_single(&mut chars) {
-                error_callback(0..(src.len() - chars.as_str().len()), e);
-            }
-        }
-        Str => unescape_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-        ByteStr => unescape_byte_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-        CStr => unescape_c_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-        RawStr => check_raw_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-        RawByteStr => check_raw_byte_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-        RawCStr => check_raw_c_str(src, |range, res| {
-            if let Err(e) = res {
-                error_callback(range, e);
-            }
-        }),
-    }
-}
-
 /// Takes the contents of a raw string literal (without quotes)
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
@@ -217,6 +161,15 @@ impl CheckRaw for [u8] {
     }
 }
 
+fn char2byte(c: char) -> Result<u8, EscapeError> {
+    // do NOT use `c.try_into()`: `u8::try_from(char)` also succeeds for non-ASCII U+0080..=U+00FF
+    if c.is_ascii() {
+        Ok(c as u8)
+    } else {
+        Err(EscapeError::NonAsciiCharInByte)
+    }
+}
+
 impl CheckRaw for CStr {
     type RawUnit = char;
 
@@ -265,6 +218,42 @@ pub fn unescape_c_str(
     CStr::unescape(src, callback)
 }
 
+/// Used for mixed utf8 string literals, i.e. those that allow both unicode
+/// chars and high bytes.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum MixedUnit {
+    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
+    /// and Unicode chars (written directly or via `\u` escapes).
+    ///
+    /// For example, if '¥' appears in a string it is represented here as
+    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
+    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
+    Char(char),
+
+    /// Used for high bytes (`\x80`..`\xff`).
+    ///
+    /// For example, if `\xa5` appears in a string it is represented here as
+    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
+    /// byte string as the single byte `0xa5`.
+    HighByte(u8),
+}
+
+impl From<char> for MixedUnit {
+    fn from(c: char) -> Self {
+        MixedUnit::Char(c)
+    }
+}
+
+impl From<u8> for MixedUnit {
+    fn from(n: u8) -> Self {
+        if n.is_ascii() {
+            MixedUnit::Char(n as char)
+        } else {
+            MixedUnit::HighByte(n)
+        }
+    }
+}
+
 /// trait for unescaping escape sequences in strings
 trait Unescape {
     /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
@@ -351,6 +340,108 @@ trait Unescape {
     }
 }
 
+/// Parse the character of an ASCII escape (except nul) without the leading backslash.
+fn simple_escape(c: char) -> Result<u8, char> {
+    // Previous character was '\\', unescape what follows.
+    Ok(match c {
+        '"' => b'"',
+        'n' => b'\n',
+        'r' => b'\r',
+        't' => b'\t',
+        '\\' => b'\\',
+        '\'' => b'\'',
+        _ => Err(c)?,
+    })
+}
+
+/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
+fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
+    let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+
+    let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+
+    Ok((hi * 16 + lo) as u8)
+}
+
+/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
+/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
+fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
+    if chars.next() != Some('{') {
+        return Err(EscapeError::NoBraceInUnicodeEscape);
+    }
+
+    // First character must be a hexadecimal digit.
+    let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
+        '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
+        '}' => return Err(EscapeError::EmptyUnicodeEscape),
+        c => c
+            .to_digit(16)
+            .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
+    };
+
+    // First character is valid, now parse the rest of the number
+    // and closing brace.
+    let mut n_digits = 1;
+    loop {
+        match chars.next() {
+            None => return Err(EscapeError::UnclosedUnicodeEscape),
+            Some('_') => continue,
+            Some('}') => {
+                // Incorrect syntax has higher priority for error reporting
+                // than unallowed value for a literal.
+                return if n_digits > 6 {
+                    Err(EscapeError::OverlongUnicodeEscape)
+                } else {
+                    Ok(value)
+                };
+            }
+            Some(c) => {
+                let digit: u32 = c
+                    .to_digit(16)
+                    .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
+                n_digits += 1;
+                if n_digits > 6 {
+                    // Stop updating value since we're sure that it's incorrect already.
+                    continue;
+                }
+                value = value * 16 + digit;
+            }
+        };
+    }
+}
+
+/// Skip ASCII whitespace, except for the formfeed character
+/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
+/// Warns on additional skipped newlines and on a following whitespace character that is not skipped.
+fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
+where
+    F: FnMut(Range<usize>, EscapeError),
+{
+    let rest = chars.as_str();
+    let first_non_space = rest
+        .bytes()
+        .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
+        .unwrap_or(rest.len());
+    let (space, rest) = rest.split_at(first_non_space);
+    // backslash newline adds 2 bytes
+    let end = start + 2 + first_non_space;
+    if space.contains('\n') {
+        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
+    }
+    *chars = rest.chars();
+    if let Some(c) = chars.clone().next() {
+        if c.is_whitespace() {
+            // for error reporting, include the character that was not skipped in the span
+            callback(
+                start..end + c.len_utf8(),
+                EscapeError::UnskippedWhitespaceWarning,
+            );
+        }
+    }
+}
+
 impl Unescape for str {
     type Unit = char;
 
@@ -422,42 +513,6 @@ impl Unescape for CStr {
     }
 }
 
-/// Used for mixed utf8 string literals, i.e. those that allow both unicode
-/// chars and high bytes.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum MixedUnit {
-    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
-    /// and Unicode chars (written directly or via `\u` escapes).
-    ///
-    /// For example, if '¥' appears in a string it is represented here as
-    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
-    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
-    Char(char),
-
-    /// Used for high bytes (`\x80`..`\xff`).
-    ///
-    /// For example, if `\xa5` appears in a string it is represented here as
-    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
-    /// byte string as the single byte `0xa5`.
-    HighByte(u8),
-}
-
-impl From<char> for MixedUnit {
-    fn from(c: char) -> Self {
-        MixedUnit::Char(c)
-    }
-}
-
-impl From<u8> for MixedUnit {
-    fn from(n: u8) -> Self {
-        if n.is_ascii() {
-            MixedUnit::Char(n as char)
-        } else {
-            MixedUnit::HighByte(n)
-        }
-    }
-}
-
 /// What kind of literal do we parse.
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
@@ -478,127 +533,75 @@ pub enum Mode {
 impl Mode {
     pub fn in_double_quotes(self) -> bool {
         match self {
-            Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
-            Char | Byte => false,
+            Mode::Str
+            | Mode::RawStr
+            | Mode::ByteStr
+            | Mode::RawByteStr
+            | Mode::CStr
+            | Mode::RawCStr => true,
+            Mode::Char | Mode::Byte => false,
         }
     }
 
     pub fn prefix_noraw(self) -> &'static str {
         match self {
-            Char | Str | RawStr => "",
-            Byte | ByteStr | RawByteStr => "b",
-            CStr | RawCStr => "c",
+            Mode::Char | Mode::Str | Mode::RawStr => "",
+            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
+            Mode::CStr | Mode::RawCStr => "c",
         }
     }
 }
 
-/// Parse the character of an ASCII escape (except nul) without the leading backslash.
-fn simple_escape(c: char) -> Result<u8, char> {
-    // Previous character was '\\', unescape what follows.
-    Ok(match c {
-        '"' => b'"',
-        'n' => b'\n',
-        'r' => b'\r',
-        't' => b'\t',
-        '\\' => b'\\',
-        '\'' => b'\'',
-        _ => Err(c)?,
-    })
-}
-
-/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
-fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
-    let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-    let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
-
-    let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-    let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
-
-    Ok((hi * 16 + lo) as u8)
-}
-
-/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
-/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
-fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
-    if chars.next() != Some('{') {
-        return Err(EscapeError::NoBraceInUnicodeEscape);
-    }
-
-    // First character must be a hexadecimal digit.
-    let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
-        '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
-        '}' => return Err(EscapeError::EmptyUnicodeEscape),
-        c => c
-            .to_digit(16)
-            .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
-    };
-
-    // First character is valid, now parse the rest of the number
-    // and closing brace.
-    let mut n_digits = 1;
-    loop {
-        match chars.next() {
-            None => return Err(EscapeError::UnclosedUnicodeEscape),
-            Some('_') => continue,
-            Some('}') => {
-                // Incorrect syntax has higher priority for error reporting
-                // than unallowed value for a literal.
-                return if n_digits > 6 {
-                    Err(EscapeError::OverlongUnicodeEscape)
-                } else {
-                    Ok(value)
-                };
+/// Takes the contents of a literal (without quotes)
+/// and produces a sequence of errors,
+/// which are returned by invoking `error_callback`.
+pub fn unescape_for_errors(
+    src: &str,
+    mode: Mode,
+    mut error_callback: impl FnMut(Range<usize>, EscapeError),
+) {
+    match mode {
+        Mode::Char => {
+            let mut chars = src.chars();
+            if let Err(e) = str::unescape_single(&mut chars) {
+                error_callback(0..(src.len() - chars.as_str().len()), e);
             }
-            Some(c) => {
-                let digit: u32 = c
-                    .to_digit(16)
-                    .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
-                n_digits += 1;
-                if n_digits > 6 {
-                    // Stop updating value since we're sure that it's incorrect already.
-                    continue;
-                }
-                value = value * 16 + digit;
+        }
+        Mode::Byte => {
+            let mut chars = src.chars();
+            if let Err(e) = <[u8]>::unescape_single(&mut chars) {
+                error_callback(0..(src.len() - chars.as_str().len()), e);
             }
-        };
-    }
-}
-
-/// Skip ASCII whitespace, except for the formfeed character
-/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
-/// Warns on unescaped newline and following non-ASCII whitespace.
-fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
-where
-    F: FnMut(Range<usize>, EscapeError),
-{
-    let rest = chars.as_str();
-    let first_non_space = rest
-        .bytes()
-        .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
-        .unwrap_or(rest.len());
-    let (space, rest) = rest.split_at(first_non_space);
-    // backslash newline adds 2 bytes
-    let end = start + 2 + first_non_space;
-    if space.contains('\n') {
-        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
-    }
-    *chars = rest.chars();
-    if let Some(c) = chars.clone().next() {
-        if c.is_whitespace() {
-            // for error reporting, include the character that was not skipped in the span
-            callback(
-                start..end + c.len_utf8(),
-                EscapeError::UnskippedWhitespaceWarning,
-            );
         }
-    }
-}
-
-fn char2byte(c: char) -> Result<u8, EscapeError> {
-    // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte)
-    if c.is_ascii() {
-        Ok(c as u8)
-    } else {
-        Err(EscapeError::NonAsciiCharInByte)
+        Mode::Str => unescape_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        Mode::ByteStr => unescape_byte_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        Mode::CStr => unescape_c_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        Mode::RawStr => check_raw_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        Mode::RawByteStr => check_raw_byte_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
+        Mode::RawCStr => check_raw_c_str(src, |range, res| {
+            if let Err(e) = res {
+                error_callback(range, e);
+            }
+        }),
     }
 }

From 702b0dc063873ca889eaaf2b7e32bad674cd743c Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Thu, 29 May 2025 09:10:41 +0000
Subject: [PATCH 6/7] rename unescape_for_errors -> check_for_errors, and
 improve docs

---
 CHANGELOG.md |  2 +-
 src/lib.rs   | 61 ++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d5bbc69..cddf56a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 - Add `check_raw_str`, `check_raw_byte_str`, `check_raw_c_str`,
 - Add `unescape_str`, `unescape_byte_str`, `unescape_c_str`,
-- Add `unescape_for_errors`,
+- Add `check_for_errors`,
 - Remove: `unescape_unicode` and `unescape_mixed`
 
 # 0.0.3
diff --git a/src/lib.rs b/src/lib.rs
index 2137242..f09f613 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
-//! Utilities for validating string and char literals and turning them into
-//! values they represent.
+//! Utilities for validating (raw) string, char, and byte literals and
+//! turning escape sequences into the values they represent.
 
 use std::ffi::CStr;
 use std::ops::Range;
@@ -8,9 +8,9 @@ use std::str::Chars;
 #[cfg(test)]
 mod tests;
 
-/// Errors and warnings that can occur during string unescaping. They mostly
-/// relate to malformed escape sequences, but there are a few that are about
-/// other problems.
+/// Errors and warnings that can occur during string, char, and byte unescaping.
+///
+/// These mostly relate to malformed escape sequences, but a few concern other problems.
 #[derive(Debug, PartialEq, Eq)]
 pub enum EscapeError {
     /// Expected 1 char, but 0 were found.
@@ -58,7 +58,7 @@ pub enum EscapeError {
     /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
     NonAsciiCharInByte,
 
-    // `\0` in a C string literal.
+    /// `\0` in a C string literal.
     NulInCStr,
 
     /// After a line ending with '\', the next line contains whitespace
@@ -79,6 +79,8 @@ impl EscapeError {
     }
 }
 
+/// Check a raw string literal for validity
+///
 /// Takes the contents of a raw string literal (without quotes)
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
@@ -87,6 +89,8 @@ pub fn check_raw_str(src: &str, callback: impl FnMut(Range<usize>, Result<char,
     str::check_raw(src, callback);
 }
 
+/// Check a raw byte string literal for validity
+///
 /// Takes the contents of a raw byte string literal (without quotes)
 /// and produces a sequence of bytes or errors,
 /// which are returned by invoking `callback`.
@@ -95,6 +99,8 @@ pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u
     <[u8]>::check_raw(src, callback);
 }
 
+/// Check a raw C string literal for validity
+///
 /// Takes the contents of a raw C string literal (without quotes)
 /// and produces a sequence of characters or errors,
 /// which are returned by invoking `callback`.
@@ -103,7 +109,7 @@ pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range<usize>, Result<char
     CStr::check_raw(src, callback);
 }
 
-/// trait for checking raw strings
+/// Trait for checking raw string literals for validity
 trait CheckRaw {
     /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
     type RawUnit;
@@ -161,6 +167,7 @@ impl CheckRaw for [u8] {
     }
 }
 
+/// Turn an ASCII char into a byte; a non-ASCII char yields `EscapeError::NonAsciiCharInByte`
 fn char2byte(c: char) -> Result<u8, EscapeError> {
     // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte)
     if c.is_ascii() {
@@ -182,18 +189,24 @@ impl CheckRaw for CStr {
     }
 }
 
+/// Unescape a char literal
+///
 /// Takes the contents of a char literal (without quotes),
 /// and returns an unescaped char or an error.
 pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
     str::unescape_single(&mut src.chars())
 }
 
+/// Unescape a byte literal
+///
 /// Takes the contents of a byte literal (without quotes),
 /// and returns an unescaped byte or an error.
 pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
     <[u8]>::unescape_single(&mut src.chars())
 }
 
+/// Unescape a string literal
+///
 /// Takes the contents of a string literal (without quotes)
 /// and produces a sequence of escaped characters or errors,
 /// which are returned by invoking `callback`.
@@ -201,6 +214,8 @@ pub fn unescape_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, E
     str::unescape(src, callback)
 }
 
+/// Unescape a byte string literal
+///
 /// Takes the contents of a byte string literal (without quotes)
 /// and produces a sequence of escaped bytes or errors,
 /// which are returned by invoking `callback`.
@@ -208,6 +223,8 @@ pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8
     <[u8]>::unescape(src, callback)
 }
 
+/// Unescape a C string literal
+///
 /// Takes the contents of a C string literal (without quotes)
 /// and produces a sequence of escaped MixedUnits or errors,
 /// which are returned by invoking `callback`.
@@ -218,6 +235,8 @@ pub fn unescape_c_str(
     CStr::unescape(src, callback)
 }
 
+/// Enum representing either a char or a byte
+///
 /// Used for mixed utf8 string literals, i.e. those that allow both unicode
 /// chars and high bytes.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -254,7 +273,7 @@ impl From<u8> for MixedUnit {
     }
 }
 
-/// trait for unescaping escape sequences in strings
+/// Trait for unescaping escape sequences in strings
 trait Unescape {
     /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
     type Unit: From<u8>;
@@ -307,7 +326,9 @@ trait Unescape {
         }
     }
 
-    /// Takes the contents of a raw literal (without quotes)
+    /// Unescape a string literal
+    ///
+    /// Takes the contents of a string literal (without quotes)
     /// and produces a sequence of `Result<Self::Unit, EscapeError>`
     /// which are returned via `callback`.
     fn unescape(
@@ -340,7 +361,9 @@ trait Unescape {
     }
 }
 
-/// Parse the character of an ASCII escape (except nul) without the leading backslash.
+/// Interpret a non-nul ASCII escape
+///
+/// Parses the character of an ASCII escape (except nul) without the leading backslash.
 fn simple_escape(c: char) -> Result<u8, char> {
     // Previous character was '\\', unescape what follows.
     Ok(match c {
@@ -354,7 +377,9 @@ fn simple_escape(c: char) -> Result<u8, char> {
     })
 }
 
-/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
+/// Interpret a hexadecimal escape
+///
+/// Parses the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
 fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
     let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
     let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
@@ -365,6 +390,8 @@ fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError>
     Ok((hi * 16 + lo) as u8)
 }
 
+/// Interpret a unicode escape
+///
 /// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
 /// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
 fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
@@ -412,6 +439,8 @@ fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeE
     }
 }
 
+/// Interpret a [string continuation escape](https://doc.rust-lang.org/reference/expressions/literal-expr.html#string-continuation-escapes)
+///
 /// Skip ASCII whitespace, except for the formfeed character
 /// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
 /// Warns on unescaped newline and following non-ASCII whitespace.
@@ -513,7 +542,7 @@ impl Unescape for CStr {
     }
 }
 
-/// What kind of literal do we parse.
+/// Enum of the different kinds of literal
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
     Char,
@@ -552,10 +581,14 @@ impl Mode {
     }
 }
 
+/// Check a literal only for errors
+///
 /// Takes the contents of a literal (without quotes)
-/// and produces a sequence of errors,
+/// and produces a sequence of only errors,
 /// which are returned by invoking `error_callback`.
-pub fn unescape_for_errors(
+///
+/// NB Does not produce any output other than errors
+pub fn check_for_errors(
     src: &str,
     mode: Mode,
     mut error_callback: impl FnMut(Range<usize>, EscapeError),

From c9ae54eb631ad5ab381c7fa3514cf2893b256e2e Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Fri, 13 Jun 2025 10:17:31 +0000
Subject: [PATCH 7/7] example literals for Mode

---
 src/lib.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index f09f613..55299d1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -545,17 +545,25 @@ impl Unescape for CStr {
 /// Enum of the different kinds of literal
 #[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Mode {
+    /// `'a'`
     Char,
 
+    /// `b'a'`
     Byte,
 
+    /// `"hello"`
     Str,
+    /// `r"hello"`
     RawStr,
 
+    /// `b"hello"`
     ByteStr,
+    /// `br"hello"`
     RawByteStr,
 
+    /// `c"hello"`
     CStr,
+    /// `cr"hello"`
     RawCStr,
 }