From a49b8d8bc9462d00e4878b6f4bcfcf729610b93f Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Tue, 13 May 2025 13:37:27 +0000 Subject: [PATCH 1/7] New API which does not expose `unreachable` The old API exposes `unreachable` in both unescape_unicode and unescape_mixed. These are conceptually one function, but because their return types are incompatible, they could not be unified. The new API takes this insight further to separate unescape_unicode into separate functions, such that byte functions can return bytes instead of chars. --- CHANGELOG.md | 7 + Cargo.lock | 2 +- Cargo.toml | 2 +- benches/benches.rs | 372 +++++++++++++++++++++++++++++++++++++-------- src/lib.rs | 100 +++++++++++- src/tests.rs | 32 ++-- 6 files changed, 428 insertions(+), 87 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5778083..d5bbc69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# 0.0.4 + +- Add `check_raw_str`, `check_raw_byte_str`, `check_raw_c_str`, +- Add `unescape_str`, `unescape_byte_str`, `unescape_c_str`, +- Add `unescape_for_errors`, +- Remove: `unescape_unicode` and `unescape_mixed` + # 0.0.3 - Extend `rustc-dep-of-std` feature to include `libcore` diff --git a/Cargo.lock b/Cargo.lock index 628ead5..f8473de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 4 [[package]] name = "rustc-literal-escaper" -version = "0.0.3" +version = "0.0.4" dependencies = [ "rustc-std-workspace-core", "rustc-std-workspace-std", diff --git a/Cargo.toml b/Cargo.toml index facc7a4..6201905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rustc-literal-escaper" -version = "0.0.3" +version = "0.0.4" edition = "2021" description = "Provides code to unescape string literals" license = "Apache-2.0 OR MIT" diff --git a/benches/benches.rs b/benches/benches.rs index a028dfd..ecaef3e 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,7 +3,8 @@ extern crate test; use rustc_literal_escaper::*; -use std::iter::repeat_n; +use std::ops::Range; +use std::{array, iter}; const LEN: usize = 10_000; @@ -23,9 +24,7 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) { // skip_ascii_whitespace(&mut input.chars(), 0, &mut |range, res| { // output.push((range, res)) // }); - unescape_unicode(&input, Mode::Str, &mut |range, res| { - output.push((range, res)) - }); + unescape_str(&input, |range, res| output.push((range, res))); assert_eq!( output, [((0..LEN + 2), Err(EscapeError::MultipleSkippedLinesWarning))] @@ -37,138 +36,385 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) { // Check raw // -fn bench_check_raw(b: &mut test::Bencher, c: char, mode: Mode) { - let input: String = test::black_box(repeat_n(c, LEN).collect()); - assert_eq!(input.len(), LEN * c.len_utf8()); - b.iter(|| { - let mut output = vec![]; - unescape_unicode(&input, mode, &mut |range, res| output.push((range, res))); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..c.len_utf8()), Ok(c))); - }); +macro_rules! fn_bench_check_raw { + ($name:ident, $unit:ty, $check_raw:ident) => { + fn $name(b: &mut test::Bencher, s: &str, expected: &[$unit]) { + let input: String = test::black_box([s; LEN].join("")); + assert_eq!(input.len(), LEN * s.len()); + b.iter(|| { + let mut output = Vec::with_capacity(expected.len()); + + $check_raw(&input, |range, res| output.push((range, res))); + assert_eq!(output.len(), LEN * s.chars().count()); + + // check that the output is what is expected and comes from the right input bytes + for ((i, &e), (p, c)) in expected.iter().enumerate().zip(s.char_indices()) { + assert_eq!(output[i], ((p..p + c.len_utf8()), Ok(e))); + } + }); + } + }; } +fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str); +fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str); +fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str); + // raw str #[bench] fn bench_check_raw_str_ascii(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawStr); + bench_check_raw_str(b, "a", &['a'; LEN]); +} + +#[bench] +fn bench_check_raw_str_non_ascii(b: &mut test::Bencher) { + bench_check_raw_str(b, "🦀", &['🦀'; LEN]); } #[bench] fn bench_check_raw_str_unicode(b: &mut test::Bencher) { - bench_check_raw(b, '🦀', Mode::RawStr); + bench_check_raw_str( + b, + "a🦀🚀z", + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => 'a', + 1 => '🦀', + 2 => '🚀', + 3 => 'z', + _ => unreachable!(), + }), + ); } // raw byte str #[bench] -fn bench_check_raw_byte_str(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawByteStr); +fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) { + bench_check_raw_byte_str(b, "a", &[b'a'; LEN]); } // raw C str #[bench] fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawCStr); + bench_check_raw_c_str(b, "a", &['a'; LEN]); +} + +#[bench] +fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) { + bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]); } #[bench] fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) { - bench_check_raw(b, '🦀', Mode::RawCStr); + bench_check_raw_c_str( + b, + "a🦀🚀z", + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => 'a', + 1 => '🦀', + 2 => '🚀', + 3 => 'z', + _ => unreachable!(), + }), + ); } // // Unescape // -fn bench_unescape(b: &mut test::Bencher, s: &str, mode: Mode, expected: char) { - let input: String = test::black_box(repeat_n(s, LEN).collect()); - assert_eq!(input.len(), LEN * s.len()); - b.iter(|| { - let mut output = vec![]; - unescape_unicode(&input, mode, &mut |range, res| output.push((range, res))); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..s.len()), Ok(expected))); - }); +macro_rules! fn_bench_unescape { + ($name:ident, $unit:ty, $unescape:ident) => { + fn $name( + b: &mut test::Bencher, + s: &str, + expected: &[(Range, Result<$unit, EscapeError>)], + ) { + let input: String = test::black_box([s; LEN].join("")); + b.iter(|| { + let mut output = Vec::with_capacity(expected.len()); + + $unescape(&input, |range, res| output.push((range, res))); + //assert_eq!(output.len(), LEN * s.chars().count()); + + // check that the output is what is expected and comes from the right input bytes + for (i, e) in expected.iter().enumerate() { + assert_eq!(output[i], *e); + } + }); + } + }; } +fn_bench_unescape!(bench_unescape_str, char, unescape_str); +fn_bench_unescape!(bench_unescape_byte_str, u8, unescape_byte_str); +fn_bench_unescape!(bench_unescape_c_str, MixedUnit, unescape_c_str); + // str #[bench] -fn bench_unescape_str_trivial(b: &mut test::Bencher) { - bench_unescape(b, r"a", Mode::Str, 'a'); +fn bench_unescape_str_ascii(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"a", + &array::from_fn::<_, LEN, _>(|i| (i..i + 1, Ok('a'))), + ); } #[bench] -fn bench_unescape_str_ascii(b: &mut test::Bencher) { - bench_unescape(b, r"\n", Mode::Str, '\n'); +fn bench_unescape_str_non_ascii(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"🦀", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('🦀'))), + ); } #[bench] -fn bench_unescape_str_hex(b: &mut test::Bencher) { - bench_unescape(b, r"\x22", Mode::Str, '"'); +fn bench_unescape_str_unicode(b: &mut test::Bencher) { + let input = "a🦀🚀z"; + let l = input.len(); + bench_unescape_str( + b, + input, + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => (i / 4 * l..i / 4 * l + 1, Ok('a')), + 1 => (i / 4 * l + 1..i / 4 * l + 5, Ok('🦀')), + 2 => (i / 4 * l + 5..i / 4 * l + 9, Ok('🚀')), + 3 => (i / 4 * l + 9..i / 4 * l + 10, Ok('z')), + _ => unreachable!(), + }), + ); } #[bench] -fn bench_unescape_str_unicode(b: &mut test::Bencher) { - bench_unescape(b, r"\u{1f980}", Mode::Str, '🦀'); +fn bench_unescape_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"\n", + &array::from_fn::<_, LEN, _>(|i| (2 * i..2 * (i + 1), Ok('\n'))), + ); } -// byte str +#[bench] +fn bench_unescape_str_hex_escape(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"\x22", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('"'))), + ); +} + +#[bench] +fn bench_unescape_str_unicode_escape(b: &mut test::Bencher) { + let input = r"\u{1f980}\u{1f680}"; + let l = input.len(); + bench_unescape_str( + b, + input, + &array::from_fn::<_, LEN, _>(|i| { + if i % 2 == 0 { + (i / 2 * l..i / 2 * l + 9, Ok('🦀')) + } else { + (i / 2 * l + 9..i / 2 * l + 18, Ok('🚀')) + } + }), + ); +} #[bench] -fn bench_unescape_byte_str_trivial(b: &mut test::Bencher) { - bench_unescape(b, r"a", Mode::ByteStr, 'a'); +fn bench_unescape_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}"]; + let n = inputs.len(); + let input = inputs.join(""); + let l = input.len(); + bench_unescape_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 2, Ok('\n')), + 1 => (i / n * l + 2..i / n * l + 6, Ok('"')), + 2 => (i / n * l + 6..i / n * l + 15, Ok('🦀')), + 3 => (i / n * l + 15..i / n * l + 24, Ok('🚀')), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } +// byte str + #[bench] fn bench_unescape_byte_str_ascii(b: &mut test::Bencher) { - bench_unescape(b, r"\n", Mode::ByteStr, b'\n' as char); + bench_unescape_byte_str( + b, + r"a", + &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(b'a'))), + ); +} + +#[bench] +fn bench_unescape_byte_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_byte_str( + b, + r"\n", + &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(b'\n'))), + ); } #[bench] -fn bench_unescape_byte_str_hex(b: &mut test::Bencher) { - bench_unescape(b, r"\xff", Mode::ByteStr, b'\xff' as char); +fn bench_unescape_byte_str_hex_escape(b: &mut test::Bencher) { + bench_unescape_byte_str( + b, + r"\xff", + &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(b'\xff'))), + ); +} + +#[bench] +fn bench_unescape_byte_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"a", r"\n", r"\xff", r"z"]; + let input = inputs.join(""); + let n = inputs.len(); + let l = input.len(); + bench_unescape_byte_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 1, Ok(b'a')), + 1 => (i / n * l + 1..i / n * l + 3, Ok(b'\n')), + 2 => (i / n * l + 3..i / n * l + 7, Ok(b'\xff')), + 3 => (i / n * l + 7..i / n * l + 8, Ok(b'z')), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } // C str -fn bench_unescape_c_str(b: &mut test::Bencher, s: &str, expected: MixedUnit) { - let input: String = test::black_box(repeat_n(s, LEN).collect()); - assert_eq!(input.len(), LEN * s.len()); - b.iter(|| { - let mut output = vec![]; - unescape_mixed(&input, Mode::CStr, &mut |range, res| { - output.push((range, res)) - }); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..s.len()), Ok(expected))); - }); +#[bench] +fn bench_unescape_c_str_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"a", + &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))), + ); } #[bench] -fn bench_unescape_c_str_trivial(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"a", MixedUnit::Char('a')); +fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"🦀", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))), + ); } #[bench] -fn bench_unescape_c_str_ascii(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\n", MixedUnit::Char('\n')); +fn bench_unescape_c_str_unicode(b: &mut test::Bencher) { + let input = "a🦀🚀z"; + let l = input.len(); + bench_unescape_c_str( + b, + input, + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))), + 1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))), + 2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))), + 3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))), + _ => unreachable!(), + }), + ); } #[bench] -fn bench_unescape_c_str_hex_ascii(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\x22", MixedUnit::Char('"')); +fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\n", + &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))), + ); } #[bench] -fn bench_unescape_c_str_hex_byte(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\xff", MixedUnit::HighByte(b'\xff')); +fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\x22", + &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))), + ); } #[bench] -fn bench_unescape_c_str_unicode(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\u{1f980}", MixedUnit::Char('🦀')); +fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\xff", + &array::from_fn::<_, { LEN }, _>(|i| { + (4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff'))) + }), + ); +} + +#[bench] +fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\u{1f980}", + &array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))), + ); +} + +#[bench] +fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}", r"\xff"]; + let n = inputs.len(); + let input = inputs.join(""); + let l = input.len(); + bench_unescape_c_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))), + 1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))), + 2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))), + 3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))), + 4 => ( + i / n * l + 24..i / n * l + 28, + Ok(MixedUnit::HighByte(b'\xff')), + ), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } diff --git a/src/lib.rs b/src/lib.rs index d315ed2..25584eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -80,12 +80,106 @@ impl EscapeError { } } +/// Takes the contents of a literal (without quotes) +/// and produces a sequence of errors, +/// which are returned by invoking `error_callback`. +pub fn unescape_for_errors( + src: &str, + mode: Mode, + mut error_callback: impl FnMut(Range, EscapeError), +) { + match mode { + Char => { + let mut chars = src.chars(); + if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Char) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } + } + Byte => { + let mut chars = src.chars(); + if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Byte) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } + } + Str => unescape_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + ByteStr => unescape_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + CStr => unescape_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawStr => check_raw_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawByteStr => check_raw_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + RawCStr => check_raw_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + } +} + +pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range, Result)) { + unescape_unicode(src, Mode::RawStr, &mut callback) +} + +pub fn check_raw_byte_str( + src: &str, + mut callback: impl FnMut(Range, Result), +) { + unescape_unicode(src, Mode::RawByteStr, &mut |r, res| { + callback(r, res.map(byte_from_char)) + }) +} + +pub fn check_raw_c_str( + src: &str, + mut callback: impl FnMut(Range, Result), +) { + unescape_unicode(src, Mode::RawCStr, &mut callback) +} + +pub fn unescape_str(src: &str, mut callback: impl FnMut(Range, Result)) { + unescape_unicode(src, Mode::Str, &mut callback) +} + +pub fn unescape_byte_str( + src: &str, + mut callback: impl FnMut(Range, Result), +) { + unescape_unicode(src, Mode::ByteStr, &mut |r, res| { + callback(r, res.map(byte_from_char)) + }) +} + +pub fn unescape_c_str( + src: &str, + mut callback: impl FnMut(Range, Result), +) { + unescape_mixed(src, Mode::CStr, &mut callback) +} + /// Takes the contents of a unicode-only (non-mixed-utf8) literal (without /// quotes) and produces a sequence of escaped characters or errors. /// /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, /// the callback will be called exactly once. -pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) +fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -147,7 +241,7 @@ impl From for MixedUnit { /// a sequence of escaped characters or errors. /// /// Values are returned by invoking `callback`. -pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) +fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -444,7 +538,7 @@ where } #[inline] -pub fn byte_from_char(c: char) -> u8 { +fn byte_from_char(c: char) -> u8 { let res = c as u32; debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); res as u8 diff --git a/src/tests.rs b/src/tests.rs index a4bbdc0..a13d8a5 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -100,9 +100,7 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| { - unescaped.push((range, res)) - }); + unescape_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -132,7 +130,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + unescape_str(literal_text, |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -248,16 +246,16 @@ fn test_unescape_byte_good() { #[test] fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { - let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { - if let Ok(b) = &mut buf { - match c { - Ok(c) => b.push(byte_from_char(c)), - Err(e) => buf = Err((range, e)), + let mut result = Ok(Vec::with_capacity(literal_text.len())); + unescape_byte_str(literal_text, |range, res| { + if let Ok(buf) = &mut result { + match res { + Ok(b) => buf.push(b), + Err(e) => result = Err((range, e)), } } }); - assert_eq!(buf.as_deref(), Ok(expected)) + assert_eq!(result.as_deref(), Ok(expected)) } check("foo", b"foo"); @@ -272,9 +270,7 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| { - unescaped.push((range, res)) - }); + check_raw_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -293,11 +289,9 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| { - unescaped.push((range, res)) - }); + check_raw_byte_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -310,7 +304,7 @@ fn test_unescape_raw_byte_str() { "🦀a", &[ (0..4, Err(EscapeError::NonAsciiCharInByte)), - (4..5, Ok('a')), + (4..5, Ok(b'a')), ], ); } From 617071840377f2f7fb94b0cdc5f0f78a9ac70358 Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Tue, 13 May 2025 15:00:50 +0000 Subject: [PATCH 2/7] inline unescape_{unicode,mixed} and move docs --- src/lib.rs | 87 ++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 51 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 25584eb..f0e011e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,71 +134,75 @@ pub fn unescape_for_errors( } } +/// Takes the contents of a raw string literal (without quotes) +/// and produces a sequence of characters or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range, Result)) { - unescape_unicode(src, Mode::RawStr, &mut callback) + check_raw_common(src, Mode::RawStr, &mut callback) } +/// Takes the contents of a raw byte string literal (without quotes) +/// and produces a sequence of bytes or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). pub fn check_raw_byte_str( src: &str, mut callback: impl FnMut(Range, Result), ) { - unescape_unicode(src, Mode::RawByteStr, &mut |r, res| { + check_raw_common(src, Mode::RawByteStr, &mut |r, res| { callback(r, res.map(byte_from_char)) }) } +/// Takes the contents of a raw C string literal (without quotes) +/// and produces a sequence of characters or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). pub fn check_raw_c_str( src: &str, mut callback: impl FnMut(Range, Result), ) { - unescape_unicode(src, Mode::RawCStr, &mut callback) + check_raw_common(src, Mode::RawCStr, &mut |r, mut result| { + if let Ok('\0') = result { + result = Err(EscapeError::NulInCStr); + } + callback(r, result) + }) } +/// Takes the contents of a string literal (without quotes) +/// and produces a sequence of escaped characters or errors, +/// which are returned by invoking `callback`. pub fn unescape_str(src: &str, mut callback: impl FnMut(Range, Result)) { - unescape_unicode(src, Mode::Str, &mut callback) + unescape_non_raw_common(src, Mode::Str, &mut callback) } +/// Takes the contents of a byte string literal (without quotes) +/// and produces a sequence of escaped bytes or errors, +/// which are returned by invoking `callback`. pub fn unescape_byte_str( src: &str, mut callback: impl FnMut(Range, Result), ) { - unescape_unicode(src, Mode::ByteStr, &mut |r, res| { + unescape_non_raw_common(src, Mode::ByteStr, &mut |r, res| { callback(r, res.map(byte_from_char)) }) } +/// Takes the contents of a C string literal (without quotes) +/// and produces a sequence of escaped MixedUnits or errors, +/// which are returned by invoking `callback`. pub fn unescape_c_str( src: &str, mut callback: impl FnMut(Range, Result), ) { - unescape_mixed(src, Mode::CStr, &mut callback) -} - -/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without -/// quotes) and produces a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, -/// the callback will be called exactly once. -fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - Char | Byte => { - let mut chars = src.chars(); - let res = unescape_char_or_byte(&mut chars, mode); - callback(0..(src.len() - chars.as_str().len()), res); + unescape_non_raw_common(src, Mode::CStr, &mut |r, mut result| { + if let Ok(MixedUnit::Char('\0')) = result { + result = Err(EscapeError::NulInCStr); } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), - RawStr | RawByteStr => check_raw_common(src, mode, callback), - RawCStr => check_raw_common(src, mode, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - CStr => unreachable!(), - } + callback(r, result) + }) } /// Used for mixed utf8 string literals, i.e. those that allow both unicode @@ -237,25 +241,6 @@ impl From for MixedUnit { } } -/// Takes the contents of a mixed-utf8 literal (without quotes) and produces -/// a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. -fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), - } -} - /// Takes a contents of a char literal (without quotes), and returns an /// unescaped char or an error. pub fn unescape_char(src: &str) -> Result { From 115ae12e6d80befdb2ea17311495f95c8e85e1dc Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Wed, 14 May 2025 05:35:49 +0000 Subject: [PATCH 3/7] replace check_raw_common with trait --- src/lib.rs | 128 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 43 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f0e011e..acbd591 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ //! Utilities for validating string and char literals and turning them into //! values they represent. +use std::ffi::CStr; use std::ops::Range; use std::str::Chars; @@ -138,37 +139,94 @@ pub fn unescape_for_errors( /// and produces a sequence of characters or errors, /// which are returned by invoking `callback`. /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). -pub fn check_raw_str(src: &str, mut callback: impl FnMut(Range, Result)) { - check_raw_common(src, Mode::RawStr, &mut callback) +pub fn check_raw_str(src: &str, callback: impl FnMut(Range, Result)) { + str::check_raw(src, callback); } /// Takes the contents of a raw byte string literal (without quotes) /// and produces a sequence of bytes or errors, /// which are returned by invoking `callback`. /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). -pub fn check_raw_byte_str( - src: &str, - mut callback: impl FnMut(Range, Result), -) { - check_raw_common(src, Mode::RawByteStr, &mut |r, res| { - callback(r, res.map(byte_from_char)) - }) +pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range, Result)) { + <[u8]>::check_raw(src, callback); } /// Takes the contents of a raw C string literal (without quotes) /// and produces a sequence of characters or errors, /// which are returned by invoking `callback`. /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). -pub fn check_raw_c_str( - src: &str, - mut callback: impl FnMut(Range, Result), -) { - check_raw_common(src, Mode::RawCStr, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); +pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range, Result)) { + CStr::check_raw(src, callback); +} + +/// trait for checking raw strings +trait CheckRaw { + /// Unit type of the implementing string type (`char` for string, `u8` for byte string) + type RawUnit; + + /// Converts chars to the unit type of the literal type + fn char2raw_unit(c: char) -> Result; + + /// Takes the contents of a raw literal (without quotes) + /// and produces a sequence of `Result` + /// which are returned via `callback`. + /// + /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). + fn check_raw( + src: &str, + mut callback: impl FnMut(Range, Result), + ) { + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + _ => Self::char2raw_unit(c), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); } - callback(r, result) - }) + + // Unfortunately, it is a bit unclear whether the following equivalent code is slower or faster: bug 141855 + // src.char_indices().for_each(|(pos, c)| { + // callback( + // pos..pos + c.len_utf8(), + // if c == '\r' { + // Err(EscapeError::BareCarriageReturnInRawString) + // } else { + // Self::char2raw_unit(c) + // }, + // ); + // }); + } +} + +impl CheckRaw for str { + type RawUnit = char; + + fn char2raw_unit(c: char) -> Result { + Ok(c) + } +} + +impl CheckRaw for [u8] { + type RawUnit = u8; + + fn char2raw_unit(c: char) -> Result { + char2byte(c) + } +} + +impl CheckRaw for CStr { + type RawUnit = char; + + fn char2raw_unit(c: char) -> Result { + if c == '\0' { + Err(EscapeError::NulInCStr) + } else { + Ok(c) + } + } } /// Takes the contents of a string literal (without quotes) @@ -497,34 +555,18 @@ where *chars = tail.chars(); } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only produce errors on bare CR. -fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here matches the one in - // `unescape_non_raw_common` for consistency, even though this function - // doesn't have to worry about skipping any chars. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); - } -} - #[inline] fn byte_from_char(c: char) -> u8 { let res = c as u32; debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); res as u8 } + +fn char2byte(c: char) -> Result { + // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) + if c.is_ascii() { + Ok(c as u8) + } else { + Err(EscapeError::NonAsciiCharInByte) + } +} From 38012186467885992d9168dc6388d62f91936b9b Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Wed, 14 May 2025 09:49:43 +0000 Subject: [PATCH 4/7] replace unescape_{char,byte} and check_non_raw_common with trait and remove unused Mode methods --- src/lib.rs | 414 +++++++++++++++++++++++++++++------------------------ 1 file changed, 223 insertions(+), 191 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index acbd591..ce88d2b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -92,13 +92,13 @@ pub fn unescape_for_errors( match mode { Char => { let mut chars = src.chars(); - if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Char) { + if let Err(e) = str::unescape_single(&mut chars) { error_callback(0..(src.len() - chars.as_str().len()), e); } } Byte => { let mut chars = src.chars(); - if let Err(e) = unescape_char_or_byte(&mut chars, Mode::Byte) { + if let Err(e) = <[u8]>::unescape_single(&mut chars) { error_callback(0..(src.len() - chars.as_str().len()), e); } } @@ -229,23 +229,30 @@ impl CheckRaw for CStr { } } +/// Takes the contents of a char literal (without quotes), +/// and returns an unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + str::unescape_single(&mut src.chars()) +} + +/// Takes the contents of a byte literal (without quotes), +/// and returns an unescaped byte or an error. +pub fn unescape_byte(src: &str) -> Result { + <[u8]>::unescape_single(&mut src.chars()) +} + /// Takes the contents of a string literal (without quotes) /// and produces a sequence of escaped characters or errors, /// which are returned by invoking `callback`. -pub fn unescape_str(src: &str, mut callback: impl FnMut(Range, Result)) { - unescape_non_raw_common(src, Mode::Str, &mut callback) +pub fn unescape_str(src: &str, callback: impl FnMut(Range, Result)) { + str::unescape(src, callback) } /// Takes the contents of a byte string literal (without quotes) /// and produces a sequence of escaped bytes or errors, /// which are returned by invoking `callback`. -pub fn unescape_byte_str( - src: &str, - mut callback: impl FnMut(Range, Result), -) { - unescape_non_raw_common(src, Mode::ByteStr, &mut |r, res| { - callback(r, res.map(byte_from_char)) - }) +pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range, Result)) { + <[u8]>::unescape(src, callback) } /// Takes the contents of a C string literal (without quotes) @@ -253,14 +260,166 @@ pub fn unescape_byte_str( /// which are returned by invoking `callback`. pub fn unescape_c_str( src: &str, - mut callback: impl FnMut(Range, Result), + callback: impl FnMut(Range, Result), ) { - unescape_non_raw_common(src, Mode::CStr, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); + CStr::unescape(src, callback) +} + +/// trait for unescaping escape sequences in strings +trait Unescape { + /// Unit type of the implementing string type (`char` for string, `u8` for byte string) + type Unit: From; + + /// Result of unescaping the zero char ('\0') + const ZERO_RESULT: Result; + + /// Converts chars to the unit type + fn char2unit(c: char) -> Result; + + /// Converts the byte of a hex escape to the unit type + fn hex2unit(b: u8) -> Result; + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result; + + /// Unescape a single unit (single quote syntax) + fn unescape_single(chars: &mut Chars<'_>) -> Result { + let res = match chars.next().ok_or(EscapeError::ZeroChars)? { + '\\' => Self::unescape_1(chars), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Self::char2unit(c), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); } - callback(r, result) - }) + Ok(res) + } + + /// Unescape the first unit of a string (double quoted syntax) + fn unescape_1(chars: &mut Chars<'_>) -> Result { + // Previous character was '\\', unescape what follows. + let c = chars.next().ok_or(EscapeError::LoneSlash)?; + if c == '0' { + Self::ZERO_RESULT + } else { + simple_escape(c).map(|b| b.into()).or_else(|c| match c { + 'x' => Self::hex2unit(hex_escape(chars)?), + 'u' => Self::unicode2unit({ + let value = unicode_escape(chars)?; + if value > char::MAX as u32 { + Err(EscapeError::OutOfRangeUnicodeEscape) + } else { + char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape) + } + }), + _ => Err(EscapeError::InvalidEscape), + }) + } + } + + /// Takes the contents of a raw literal (without quotes) + /// and produces a sequence of `Result` + /// which are returned via `callback`. + fn unescape( + src: &str, + mut callback: impl FnMut(Range, Result), + ) { + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\\' => { + if let Some(b'\n') = chars.as_str().as_bytes().first() { + let _ = chars.next(); + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } else { + Self::unescape_1(&mut chars) + } + } + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Self::char2unit(c), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); + } + } +} + +impl Unescape for str { + type Unit = char; + + const ZERO_RESULT: Result = Ok('\0'); + + fn char2unit(c: char) -> Result { + Ok(c) + } + + fn hex2unit(b: u8) -> Result { + if b.is_ascii() { + Ok(b as char) + } else { + Err(EscapeError::OutOfRangeHexEscape) + } + } + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result { + r + } +} + +impl Unescape for [u8] { + type Unit = u8; + + const ZERO_RESULT: Result = Ok(b'\0'); + + fn char2unit(c: char) -> Result { + char2byte(c) + } + + fn hex2unit(b: u8) -> Result { + Ok(b) + } + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(_r: Result) -> Result { + Err(EscapeError::UnicodeEscapeInByte) + } +} + +impl Unescape for CStr { + type Unit = MixedUnit; + + const ZERO_RESULT: Result = Err(EscapeError::NulInCStr); + + fn char2unit(c: char) -> Result { + if c == '\0' { + Err(EscapeError::NulInCStr) + } else { + Ok(MixedUnit::Char(c)) + } + } + + fn hex2unit(byte: u8) -> Result { + if byte == b'\0' { + Err(EscapeError::NulInCStr) + } else if byte.is_ascii() { + Ok(MixedUnit::Char(byte as char)) + } else { + Ok(MixedUnit::HighByte(byte)) + } + } + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result { + Self::char2unit(r?) + } } /// Used for mixed utf8 string literals, i.e. those that allow both unicode @@ -299,18 +458,6 @@ impl From for MixedUnit { } } -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error. -pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Char) -} - -/// Takes a contents of a byte literal (without quotes), and returns an -/// unescaped byte or an error. -pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) -} - /// What kind of literal do we parse. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { @@ -336,33 +483,6 @@ impl Mode { } } - /// Are `\x80`..`\xff` allowed? - fn allow_high_bytes(self) -> bool { - match self { - Char | Str => false, - Byte | ByteStr | CStr => true, - RawStr | RawByteStr | RawCStr => unreachable!(), - } - } - - /// Are unicode (non-ASCII) chars allowed? - #[inline] - fn allow_unicode_chars(self) -> bool { - match self { - Byte | ByteStr | RawByteStr => false, - Char | Str | RawStr | CStr | RawCStr => true, - } - } - - /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { - match self { - Byte | ByteStr => false, - Char | Str | CStr => true, - RawByteStr | RawStr | RawCStr => unreachable!(), - } - } - pub fn prefix_noraw(self) -> &'static str { match self { Char | Str | RawStr => "", @@ -372,53 +492,39 @@ impl Mode { } } -fn scan_escape + From>( - chars: &mut Chars<'_>, - mode: Mode, -) -> Result { +/// Parse the character of an ASCII escape (except nul) without the leading backslash. +fn simple_escape(c: char) -> Result { // Previous character was '\\', unescape what follows. - let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', - 'x' => { - // Parse hexadecimal character code. - - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let value = (hi * 16 + lo) as u8; - - return if !mode.allow_high_bytes() && !value.is_ascii() { - Err(EscapeError::OutOfRangeHexEscape) - } else { - // This may be a high byte, but that will only happen if `T` is - // `MixedUnit`, because of the `allow_high_bytes` check above. - Ok(T::from(value)) - }; - } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(T::from(res)) + Ok(match c { + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + _ => Err(c)?, + }) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { - // We've parsed '\u', now we have to parse '{..}'. +/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +fn hex_escape(chars: &mut impl Iterator) -> Result { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + Ok((hi * 16 + lo) as u8) +} + +/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. +/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. +fn unicode_escape(chars: &mut impl Iterator) -> Result { if chars.next() != Some('{') { return Err(EscapeError::NoBraceInUnicodeEscape); } // First character must be a hexadecimal digit. - let mut n_digits = 1; let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), '}' => return Err(EscapeError::EmptyUnicodeEscape), @@ -429,28 +535,19 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result return Err(EscapeError::UnclosedUnicodeEscape), Some('_') => continue, Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. - if !allow_unicode_escapes { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or({ - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - }); + return if n_digits > 6 { + Err(EscapeError::OverlongUnicodeEscape) + } else { + Ok(value) + }; } Some(c) => { let digit: u32 = c @@ -467,99 +564,34 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { - Ok(c) - } else { - Err(EscapeError::NonAsciiCharInByte) - } -} - -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - let c = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match c { - '\\' => scan_escape(chars, mode), - '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), - }?; - if chars.next().is_some() { - return Err(EscapeError::MoreThanOneChar); - } - Ok(res) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here is complicated because - // `skip_ascii_whitespace` makes us to skip over chars without counting - // them in the range computation. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\\' => { - match chars.clone().next() { - Some('\n') => { - // Rust language specification requires us to skip whitespaces - // if unescaped '\' character is followed by '\n'. - // For details see [Rust language reference] - // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, &mut |range, err| { - callback(range, Err(err)) - }); - continue; - } - _ => scan_escape::(&mut chars, mode), - } - } - '"' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); - } -} - +/// Skip ASCII whitespace, except for the formfeed character +/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)). +/// Warns on unescaped newline and following non-ASCII whitespace. fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) where F: FnMut(Range, EscapeError), { - let tail = chars.as_str(); - let first_non_space = tail + let rest = chars.as_str(); + let first_non_space = rest .bytes() .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. - let end = start + first_non_space + 1; + .unwrap_or(rest.len()); + let (space, rest) = rest.split_at(first_non_space); + // backslash newline adds 2 bytes + let end = start + 2 + first_non_space; + if space.contains('\n') { callback(start..end, EscapeError::MultipleSkippedLinesWarning); } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().next() { + *chars = rest.chars(); + if let Some(c) = chars.clone().next() { if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, EscapeError::UnskippedWhitespaceWarning); + // for error reporting, include the character that was not skipped in the span + callback( + start..end + c.len_utf8(), + EscapeError::UnskippedWhitespaceWarning, + ); } } - *chars = tail.chars(); -} - -#[inline] -fn byte_from_char(c: char) -> u8 { - let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); - res as u8 } fn char2byte(c: char) -> Result { From 67eadd0eecdaf7501670b2621429bb531fb44040 Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Wed, 14 May 2025 10:07:25 +0000 Subject: [PATCH 5/7] do not use Mode::* and move stuff around for better organisation --- src/lib.rs | 407 +++++++++++++++++++++++++++-------------------------- 1 file changed, 205 insertions(+), 202 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ce88d2b..2137242 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,8 +5,6 @@ use std::ffi::CStr; use std::ops::Range; use std::str::Chars; -use Mode::*; - #[cfg(test)] mod tests; @@ -81,60 +79,6 @@ impl EscapeError { } } -/// Takes the contents of a literal (without quotes) -/// and produces a sequence of errors, -/// which are returned by invoking `error_callback`. -pub fn unescape_for_errors( - src: &str, - mode: Mode, - mut error_callback: impl FnMut(Range, EscapeError), -) { - match mode { - Char => { - let mut chars = src.chars(); - if let Err(e) = str::unescape_single(&mut chars) { - error_callback(0..(src.len() - chars.as_str().len()), e); - } - } - Byte => { - let mut chars = src.chars(); - if let Err(e) = <[u8]>::unescape_single(&mut chars) { - error_callback(0..(src.len() - chars.as_str().len()), e); - } - } - Str => unescape_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - ByteStr => unescape_byte_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - CStr => unescape_c_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - RawStr => check_raw_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - RawByteStr => check_raw_byte_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - RawCStr => check_raw_c_str(src, |range, res| { - if let Err(e) = res { - error_callback(range, e); - } - }), - } -} - /// Takes the contents of a raw string literal (without quotes) /// and produces a sequence of characters or errors, /// which are returned by invoking `callback`. @@ -217,6 +161,15 @@ impl CheckRaw for [u8] { } } +fn char2byte(c: char) -> Result { + // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) + if c.is_ascii() { + Ok(c as u8) + } else { + Err(EscapeError::NonAsciiCharInByte) + } +} + impl CheckRaw for CStr { type RawUnit = char; @@ -265,6 +218,42 @@ pub fn unescape_c_str( CStr::unescape(src, callback) } +/// Used for mixed utf8 string literals, i.e. those that allow both unicode +/// chars and high bytes. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum MixedUnit { + /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) + /// and Unicode chars (written directly or via `\u` escapes). + /// + /// For example, if '¥' appears in a string it is represented here as + /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte + /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` + Char(char), + + /// Used for high bytes (`\x80`..`\xff`). + /// + /// For example, if `\xa5` appears in a string it is represented here as + /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant + /// byte string as the single byte `0xa5`. + HighByte(u8), +} + +impl From for MixedUnit { + fn from(c: char) -> Self { + MixedUnit::Char(c) + } +} + +impl From for MixedUnit { + fn from(n: u8) -> Self { + if n.is_ascii() { + MixedUnit::Char(n as char) + } else { + MixedUnit::HighByte(n) + } + } +} + /// trait for unescaping escape sequences in strings trait Unescape { /// Unit type of the implementing string type (`char` for string, `u8` for byte string) @@ -351,6 +340,108 @@ trait Unescape { } } +/// Parse the character of an ASCII escape (except nul) without the leading backslash. +fn simple_escape(c: char) -> Result { + // Previous character was '\\', unescape what follows. + Ok(match c { + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + _ => Err(c)?, + }) +} + +/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +fn hex_escape(chars: &mut impl Iterator) -> Result { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + Ok((hi * 16 + lo) as u8) +} + +/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. +/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. +fn unicode_escape(chars: &mut impl Iterator) -> Result { + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + // First character must be a hexadecimal digit. + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c + .to_digit(16) + .ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + // First character is valid, now parse the rest of the number + // and closing brace. + let mut n_digits = 1; + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + // Incorrect syntax has higher priority for error reporting + // than unallowed value for a literal. + return if n_digits > 6 { + Err(EscapeError::OverlongUnicodeEscape) + } else { + Ok(value) + }; + } + Some(c) => { + let digit: u32 = c + .to_digit(16) + .ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + // Stop updating value since we're sure that it's incorrect already. + continue; + } + value = value * 16 + digit; + } + }; + } +} + +/// Skip ASCII whitespace, except for the formfeed character +/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)). +/// Warns on unescaped newline and following non-ASCII whitespace. +fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + let rest = chars.as_str(); + let first_non_space = rest + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(rest.len()); + let (space, rest) = rest.split_at(first_non_space); + // backslash newline adds 2 bytes + let end = start + 2 + first_non_space; + if space.contains('\n') { + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + *chars = rest.chars(); + if let Some(c) = chars.clone().next() { + if c.is_whitespace() { + // for error reporting, include the character that was not skipped in the span + callback( + start..end + c.len_utf8(), + EscapeError::UnskippedWhitespaceWarning, + ); + } + } +} + impl Unescape for str { type Unit = char; @@ -422,42 +513,6 @@ impl Unescape for CStr { } } -/// Used for mixed utf8 string literals, i.e. those that allow both unicode -/// chars and high bytes. -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum MixedUnit { - /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) - /// and Unicode chars (written directly or via `\u` escapes). - /// - /// For example, if '¥' appears in a string it is represented here as - /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte - /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` - Char(char), - - /// Used for high bytes (`\x80`..`\xff`). - /// - /// For example, if `\xa5` appears in a string it is represented here as - /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant - /// byte string as the single byte `0xa5`. - HighByte(u8), -} - -impl From for MixedUnit { - fn from(c: char) -> Self { - MixedUnit::Char(c) - } -} - -impl From for MixedUnit { - fn from(n: u8) -> Self { - if n.is_ascii() { - MixedUnit::Char(n as char) - } else { - MixedUnit::HighByte(n) - } - } -} - /// What kind of literal do we parse. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { @@ -478,127 +533,75 @@ pub enum Mode { impl Mode { pub fn in_double_quotes(self) -> bool { match self { - Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, - Char | Byte => false, + Mode::Str + | Mode::RawStr + | Mode::ByteStr + | Mode::RawByteStr + | Mode::CStr + | Mode::RawCStr => true, + Mode::Char | Mode::Byte => false, } } pub fn prefix_noraw(self) -> &'static str { match self { - Char | Str | RawStr => "", - Byte | ByteStr | RawByteStr => "b", - CStr | RawCStr => "c", + Mode::Char | Mode::Str | Mode::RawStr => "", + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b", + Mode::CStr | Mode::RawCStr => "c", } } } -/// Parse the character of an ASCII escape (except nul) without the leading backslash. -fn simple_escape(c: char) -> Result { - // Previous character was '\\', unescape what follows. - Ok(match c { - '"' => b'"', - 'n' => b'\n', - 'r' => b'\r', - 't' => b'\t', - '\\' => b'\\', - '\'' => b'\'', - _ => Err(c)?, - }) -} - -/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". -fn hex_escape(chars: &mut impl Iterator) -> Result { - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - Ok((hi * 16 + lo) as u8) -} - -/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. -/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. -fn unicode_escape(chars: &mut impl Iterator) -> Result { - if chars.next() != Some('{') { - return Err(EscapeError::NoBraceInUnicodeEscape); - } - - // First character must be a hexadecimal digit. - let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { - '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), - '}' => return Err(EscapeError::EmptyUnicodeEscape), - c => c - .to_digit(16) - .ok_or(EscapeError::InvalidCharInUnicodeEscape)?, - }; - - // First character is valid, now parse the rest of the number - // and closing brace. - let mut n_digits = 1; - loop { - match chars.next() { - None => return Err(EscapeError::UnclosedUnicodeEscape), - Some('_') => continue, - Some('}') => { - // Incorrect syntax has higher priority for error reporting - // than unallowed value for a literal. - return if n_digits > 6 { - Err(EscapeError::OverlongUnicodeEscape) - } else { - Ok(value) - }; +/// Takes the contents of a literal (without quotes) +/// and produces a sequence of errors, +/// which are returned by invoking `error_callback`. +pub fn unescape_for_errors( + src: &str, + mode: Mode, + mut error_callback: impl FnMut(Range, EscapeError), +) { + match mode { + Mode::Char => { + let mut chars = src.chars(); + if let Err(e) = str::unescape_single(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); } - Some(c) => { - let digit: u32 = c - .to_digit(16) - .ok_or(EscapeError::InvalidCharInUnicodeEscape)?; - n_digits += 1; - if n_digits > 6 { - // Stop updating value since we're sure that it's incorrect already. - continue; - } - value = value * 16 + digit; + } + Mode::Byte => { + let mut chars = src.chars(); + if let Err(e) = <[u8]>::unescape_single(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); } - }; - } -} - -/// Skip ASCII whitespace, except for the formfeed character -/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)). -/// Warns on unescaped newline and following non-ASCII whitespace. -fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) -where - F: FnMut(Range, EscapeError), -{ - let rest = chars.as_str(); - let first_non_space = rest - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(rest.len()); - let (space, rest) = rest.split_at(first_non_space); - // backslash newline adds 2 bytes - let end = start + 2 + first_non_space; - if space.contains('\n') { - callback(start..end, EscapeError::MultipleSkippedLinesWarning); - } - *chars = rest.chars(); - if let Some(c) = chars.clone().next() { - if c.is_whitespace() { - // for error reporting, include the character that was not skipped in the span - callback( - start..end + c.len_utf8(), - EscapeError::UnskippedWhitespaceWarning, - ); } - } -} - -fn char2byte(c: char) -> Result { - // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) - if c.is_ascii() { - Ok(c as u8) - } else { - Err(EscapeError::NonAsciiCharInByte) + Mode::Str => unescape_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::ByteStr => unescape_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::CStr => unescape_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawStr => check_raw_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawByteStr => check_raw_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawCStr => check_raw_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), } } From 702b0dc063873ca889eaaf2b7e32bad674cd743c Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Thu, 29 May 2025 09:10:41 +0000 Subject: [PATCH 6/7] rename unescape_for_errors -> check_for_errors, and improve docs --- CHANGELOG.md | 2 +- src/lib.rs | 61 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5bbc69..cddf56a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ - Add `check_raw_str`, `check_raw_byte_str`, `check_raw_c_str`, - Add `unescape_str`, `unescape_byte_str`, `unescape_c_str`, -- Add `unescape_for_errors`, +- Add `check_for_errors`, - Remove: `unescape_unicode` and `unescape_mixed` # 0.0.3 diff --git a/src/lib.rs b/src/lib.rs index 2137242..f09f613 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ -//! Utilities for validating string and char literals and turning them into -//! values they represent. +//! Utilities for validating (raw) string, char, and byte literals and +//! turning escape sequences into the values they represent. use std::ffi::CStr; use std::ops::Range; @@ -8,9 +8,9 @@ use std::str::Chars; #[cfg(test)] mod tests; -/// Errors and warnings that can occur during string unescaping. They mostly -/// relate to malformed escape sequences, but there are a few that are about -/// other problems. +/// Errors and warnings that can occur during string, char, and byte unescaping. +/// +/// Mostly relating to malformed escape sequences, but also a few other problems. #[derive(Debug, PartialEq, Eq)] pub enum EscapeError { /// Expected 1 char, but 0 were found. @@ -58,7 +58,7 @@ pub enum EscapeError { /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. NonAsciiCharInByte, - // `\0` in a C string literal. + /// `\0` in a C string literal. NulInCStr, /// After a line ending with '\', the next line contains whitespace @@ -79,6 +79,8 @@ impl EscapeError { } } +/// Check a raw string literal for validity +/// /// Takes the contents of a raw string literal (without quotes) /// and produces a sequence of characters or errors, /// which are returned by invoking `callback`. @@ -87,6 +89,8 @@ pub fn check_raw_str(src: &str, callback: impl FnMut(Range, Result, Result::check_raw(src, callback); } +/// Check a raw C string literal for validity +/// /// Takes the contents of a raw C string literal (without quotes) /// and produces a sequence of characters or errors, /// which are returned by invoking `callback`. @@ -103,7 +109,7 @@ pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range, Result Result { // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) if c.is_ascii() { @@ -182,18 +189,24 @@ impl CheckRaw for CStr { } } +/// Unescape a char literal +/// /// Takes the contents of a char literal (without quotes), /// and returns an unescaped char or an error. pub fn unescape_char(src: &str) -> Result { str::unescape_single(&mut src.chars()) } +/// Unescape a byte literal +/// /// Takes the contents of a byte literal (without quotes), /// and returns an unescaped byte or an error. pub fn unescape_byte(src: &str) -> Result { <[u8]>::unescape_single(&mut src.chars()) } +/// Unescape a string literal +/// /// Takes the contents of a string literal (without quotes) /// and produces a sequence of escaped characters or errors, /// which are returned by invoking `callback`. @@ -201,6 +214,8 @@ pub fn unescape_str(src: &str, callback: impl FnMut(Range, Result, Result::unescape(src, callback) } +/// Unescape a C string literal +/// /// Takes the contents of a C string literal (without quotes) /// and produces a sequence of escaped MixedUnits or errors, /// which are returned by invoking `callback`. @@ -218,6 +235,8 @@ pub fn unescape_c_str( CStr::unescape(src, callback) } +/// Enum representing either a char or a byte +/// /// Used for mixed utf8 string literals, i.e. those that allow both unicode /// chars and high bytes. #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -254,7 +273,7 @@ impl From for MixedUnit { } } -/// trait for unescaping escape sequences in strings +/// Trait for unescaping escape sequences in strings trait Unescape { /// Unit type of the implementing string type (`char` for string, `u8` for byte string) type Unit: From; @@ -307,7 +326,9 @@ trait Unescape { } } - /// Takes the contents of a raw literal (without quotes) + /// Unescape a string literal + /// + /// Takes the contents of a raw string literal (without quotes) /// and produces a sequence of `Result` /// which are returned via `callback`. fn unescape( @@ -340,7 +361,9 @@ trait Unescape { } } -/// Parse the character of an ASCII escape (except nul) without the leading backslash. +/// Interpret a non-nul ASCII escape +/// +/// Parses the character of an ASCII escape (except nul) without the leading backslash. fn simple_escape(c: char) -> Result { // Previous character was '\\', unescape what follows. Ok(match c { @@ -354,7 +377,9 @@ fn simple_escape(c: char) -> Result { }) } -/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +/// Interpret a hexadecimal escape +/// +/// Parses the two hexadecimal characters of a hexadecimal escape without the leading r"\x". fn hex_escape(chars: &mut impl Iterator) -> Result { let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; @@ -365,6 +390,8 @@ fn hex_escape(chars: &mut impl Iterator) -> Result Ok((hi * 16 + lo) as u8) } +/// Interpret a unicode escape +/// /// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. /// This r"{...}" normally comes after r"\u" and cannot start with an underscore. fn unicode_escape(chars: &mut impl Iterator) -> Result { @@ -412,6 +439,8 @@ fn unicode_escape(chars: &mut impl Iterator) -> Result, EscapeError), From c9ae54eb631ad5ab381c7fa3514cf2893b256e2e Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Fri, 13 Jun 2025 10:17:31 +0000 Subject: [PATCH 7/7] example literals for Mode --- src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index f09f613..55299d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -545,17 +545,25 @@ impl Unescape for CStr { /// Enum of the different kinds of literal #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { + /// `'a'` Char, + /// `b'a'` Byte, + /// `"hello"` Str, + /// `r"hello"` RawStr, + /// `b"hello"` ByteStr, + /// `br"hello"` RawByteStr, + /// `c"hello"` CStr, + /// `cr"hello"` RawCStr, }