diff --git a/CHANGELOG.md b/CHANGELOG.md index 5778083..cddf56a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# 0.0.4 + +- Add `check_raw_str`, `check_raw_byte_str`, `check_raw_c_str`, +- Add `unescape_str`, `unescape_byte_str`, `unescape_c_str`, +- Add `check_for_errors`, +- Remove: `unescape_unicode` and `unescape_mixed` + # 0.0.3 - Extend `rustc-dep-of-std` feature to include `libcore` diff --git a/Cargo.lock b/Cargo.lock index 628ead5..f8473de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 4 [[package]] name = "rustc-literal-escaper" -version = "0.0.3" +version = "0.0.4" dependencies = [ "rustc-std-workspace-core", "rustc-std-workspace-std", diff --git a/Cargo.toml b/Cargo.toml index facc7a4..6201905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rustc-literal-escaper" -version = "0.0.3" +version = "0.0.4" edition = "2021" description = "Provides code to unescape string literals" license = "Apache-2.0 OR MIT" diff --git a/benches/benches.rs b/benches/benches.rs index a028dfd..ecaef3e 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,7 +3,8 @@ extern crate test; use rustc_literal_escaper::*; -use std::iter::repeat_n; +use std::ops::Range; +use std::{array, iter}; const LEN: usize = 10_000; @@ -23,9 +24,7 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) { // skip_ascii_whitespace(&mut input.chars(), 0, &mut |range, res| { // output.push((range, res)) // }); - unescape_unicode(&input, Mode::Str, &mut |range, res| { - output.push((range, res)) - }); + unescape_str(&input, |range, res| output.push((range, res))); assert_eq!( output, [((0..LEN + 2), Err(EscapeError::MultipleSkippedLinesWarning))] @@ -37,138 +36,385 @@ fn bench_skip_ascii_whitespace(b: &mut test::Bencher) { // Check raw // -fn bench_check_raw(b: &mut test::Bencher, c: char, mode: Mode) { - let input: String = test::black_box(repeat_n(c, LEN).collect()); - assert_eq!(input.len(), LEN * c.len_utf8()); - b.iter(|| { - let mut output = vec![]; - unescape_unicode(&input, mode, &mut |range, res| output.push((range, res))); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..c.len_utf8()), Ok(c))); - }); +macro_rules! fn_bench_check_raw { + ($name:ident, $unit:ty, $check_raw:ident) => { + fn $name(b: &mut test::Bencher, s: &str, expected: &[$unit]) { + let input: String = test::black_box([s; LEN].join("")); + assert_eq!(input.len(), LEN * s.len()); + b.iter(|| { + let mut output = Vec::with_capacity(expected.len()); + + $check_raw(&input, |range, res| output.push((range, res))); + assert_eq!(output.len(), LEN * s.chars().count()); + + // check that the output is what is expected and comes from the right input bytes + for ((i, &e), (p, c)) in expected.iter().enumerate().zip(s.char_indices()) { + assert_eq!(output[i], ((p..p + c.len_utf8()), Ok(e))); + } + }); + } + }; } +fn_bench_check_raw!(bench_check_raw_str, char, check_raw_str); +fn_bench_check_raw!(bench_check_raw_byte_str, u8, check_raw_byte_str); +fn_bench_check_raw!(bench_check_raw_c_str, char, check_raw_c_str); + // raw str #[bench] fn bench_check_raw_str_ascii(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawStr); + bench_check_raw_str(b, "a", &['a'; LEN]); +} + +#[bench] +fn bench_check_raw_str_non_ascii(b: &mut test::Bencher) { + bench_check_raw_str(b, "🦀", &['🦀'; LEN]); } #[bench] fn bench_check_raw_str_unicode(b: &mut test::Bencher) { - bench_check_raw(b, '🦀', Mode::RawStr); + bench_check_raw_str( + b, + "a🦀🚀z", + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => 'a', + 1 => '🦀', + 2 => '🚀', + 3 => 'z', + _ => unreachable!(), + }), + ); } // raw byte str #[bench] -fn bench_check_raw_byte_str(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawByteStr); +fn bench_check_raw_byte_str_ascii(b: &mut test::Bencher) { + bench_check_raw_byte_str(b, "a", &[b'a'; LEN]); } // raw C str #[bench] fn bench_check_raw_c_str_ascii(b: &mut test::Bencher) { - bench_check_raw(b, 'a', Mode::RawCStr); + bench_check_raw_c_str(b, "a", &['a'; LEN]); +} + +#[bench] +fn bench_check_raw_c_str_non_ascii(b: &mut test::Bencher) { + bench_check_raw_c_str(b, "🦀", &['🦀'; LEN]); } #[bench] fn bench_check_raw_c_str_unicode(b: &mut test::Bencher) { - bench_check_raw(b, '🦀', Mode::RawCStr); + bench_check_raw_c_str( + b, + "a🦀🚀z", + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => 'a', + 1 => '🦀', + 2 => '🚀', + 3 => 'z', + _ => unreachable!(), + }), + ); } // // Unescape // -fn bench_unescape(b: &mut test::Bencher, s: &str, mode: Mode, expected: char) { - let input: String = test::black_box(repeat_n(s, LEN).collect()); - assert_eq!(input.len(), LEN * s.len()); - b.iter(|| { - let mut output = vec![]; - unescape_unicode(&input, mode, &mut |range, res| output.push((range, res))); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..s.len()), Ok(expected))); - }); +macro_rules! fn_bench_unescape { + ($name:ident, $unit:ty, $unescape:ident) => { + fn $name( + b: &mut test::Bencher, + s: &str, + expected: &[(Range, Result<$unit, EscapeError>)], + ) { + let input: String = test::black_box([s; LEN].join("")); + b.iter(|| { + let mut output = Vec::with_capacity(expected.len()); + + $unescape(&input, |range, res| output.push((range, res))); + //assert_eq!(output.len(), LEN * s.chars().count()); + + // check that the output is what is expected and comes from the right input bytes + for (i, e) in expected.iter().enumerate() { + assert_eq!(output[i], *e); + } + }); + } + }; } +fn_bench_unescape!(bench_unescape_str, char, unescape_str); +fn_bench_unescape!(bench_unescape_byte_str, u8, unescape_byte_str); +fn_bench_unescape!(bench_unescape_c_str, MixedUnit, unescape_c_str); + // str #[bench] -fn bench_unescape_str_trivial(b: &mut test::Bencher) { - bench_unescape(b, r"a", Mode::Str, 'a'); +fn bench_unescape_str_ascii(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"a", + &array::from_fn::<_, LEN, _>(|i| (i..i + 1, Ok('a'))), + ); } #[bench] -fn bench_unescape_str_ascii(b: &mut test::Bencher) { - bench_unescape(b, r"\n", Mode::Str, '\n'); +fn bench_unescape_str_non_ascii(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"🦀", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('🦀'))), + ); } #[bench] -fn bench_unescape_str_hex(b: &mut test::Bencher) { - bench_unescape(b, r"\x22", Mode::Str, '"'); +fn bench_unescape_str_unicode(b: &mut test::Bencher) { + let input = "a🦀🚀z"; + let l = input.len(); + bench_unescape_str( + b, + input, + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => (i / 4 * l..i / 4 * l + 1, Ok('a')), + 1 => (i / 4 * l + 1..i / 4 * l + 5, Ok('🦀')), + 2 => (i / 4 * l + 5..i / 4 * l + 9, Ok('🚀')), + 3 => (i / 4 * l + 9..i / 4 * l + 10, Ok('z')), + _ => unreachable!(), + }), + ); } #[bench] -fn bench_unescape_str_unicode(b: &mut test::Bencher) { - bench_unescape(b, r"\u{1f980}", Mode::Str, '🦀'); +fn bench_unescape_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"\n", + &array::from_fn::<_, LEN, _>(|i| (2 * i..2 * (i + 1), Ok('\n'))), + ); } -// byte str +#[bench] +fn bench_unescape_str_hex_escape(b: &mut test::Bencher) { + bench_unescape_str( + b, + r"\x22", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok('"'))), + ); +} + +#[bench] +fn bench_unescape_str_unicode_escape(b: &mut test::Bencher) { + let input = r"\u{1f980}\u{1f680}"; + let l = input.len(); + bench_unescape_str( + b, + input, + &array::from_fn::<_, LEN, _>(|i| { + if i % 2 == 0 { + (i / 2 * l..i / 2 * l + 9, Ok('🦀')) + } else { + (i / 2 * l + 9..i / 2 * l + 18, Ok('🚀')) + } + }), + ); +} #[bench] -fn bench_unescape_byte_str_trivial(b: &mut test::Bencher) { - bench_unescape(b, r"a", Mode::ByteStr, 'a'); +fn bench_unescape_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}"]; + let n = inputs.len(); + let input = inputs.join(""); + let l = input.len(); + bench_unescape_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 2, Ok('\n')), + 1 => (i / n * l + 2..i / n * l + 6, Ok('"')), + 2 => (i / n * l + 6..i / n * l + 15, Ok('🦀')), + 3 => (i / n * l + 15..i / n * l + 24, Ok('🚀')), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } +// byte str + #[bench] fn bench_unescape_byte_str_ascii(b: &mut test::Bencher) { - bench_unescape(b, r"\n", Mode::ByteStr, b'\n' as char); + bench_unescape_byte_str( + b, + r"a", + &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(b'a'))), + ); +} + +#[bench] +fn bench_unescape_byte_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_byte_str( + b, + r"\n", + &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(b'\n'))), + ); } #[bench] -fn bench_unescape_byte_str_hex(b: &mut test::Bencher) { - bench_unescape(b, r"\xff", Mode::ByteStr, b'\xff' as char); +fn bench_unescape_byte_str_hex_escape(b: &mut test::Bencher) { + bench_unescape_byte_str( + b, + r"\xff", + &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(b'\xff'))), + ); +} + +#[bench] +fn bench_unescape_byte_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"a", r"\n", r"\xff", r"z"]; + let input = inputs.join(""); + let n = inputs.len(); + let l = input.len(); + bench_unescape_byte_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 1, Ok(b'a')), + 1 => (i / n * l + 1..i / n * l + 3, Ok(b'\n')), + 2 => (i / n * l + 3..i / n * l + 7, Ok(b'\xff')), + 3 => (i / n * l + 7..i / n * l + 8, Ok(b'z')), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } // C str -fn bench_unescape_c_str(b: &mut test::Bencher, s: &str, expected: MixedUnit) { - let input: String = test::black_box(repeat_n(s, LEN).collect()); - assert_eq!(input.len(), LEN * s.len()); - b.iter(|| { - let mut output = vec![]; - unescape_mixed(&input, Mode::CStr, &mut |range, res| { - output.push((range, res)) - }); - assert_eq!(output.len(), LEN); - assert_eq!(output[0], ((0..s.len()), Ok(expected))); - }); +#[bench] +fn bench_unescape_c_str_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"a", + &array::from_fn::<_, { LEN }, _>(|i| (i..i + 1, Ok(MixedUnit::Char('a')))), + ); } #[bench] -fn bench_unescape_c_str_trivial(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"a", MixedUnit::Char('a')); +fn bench_unescape_c_str_non_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"🦀", + &array::from_fn::<_, LEN, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('🦀')))), + ); } #[bench] -fn bench_unescape_c_str_ascii(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\n", MixedUnit::Char('\n')); +fn bench_unescape_c_str_unicode(b: &mut test::Bencher) { + let input = "a🦀🚀z"; + let l = input.len(); + bench_unescape_c_str( + b, + input, + &array::from_fn::<_, { 4 * LEN }, _>(|i| match i % 4 { + 0 => (i / 4 * l..i / 4 * l + 1, Ok(MixedUnit::Char('a'))), + 1 => (i / 4 * l + 1..i / 4 * l + 5, Ok(MixedUnit::Char('🦀'))), + 2 => (i / 4 * l + 5..i / 4 * l + 9, Ok(MixedUnit::Char('🚀'))), + 3 => (i / 4 * l + 9..i / 4 * l + 10, Ok(MixedUnit::Char('z'))), + _ => unreachable!(), + }), + ); } #[bench] -fn bench_unescape_c_str_hex_ascii(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\x22", MixedUnit::Char('"')); +fn bench_unescape_c_str_ascii_escape(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\n", + &array::from_fn::<_, { LEN }, _>(|i| (2 * i..2 * (i + 1), Ok(MixedUnit::Char('\n')))), + ); } #[bench] -fn bench_unescape_c_str_hex_byte(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\xff", MixedUnit::HighByte(b'\xff')); +fn bench_unescape_c_str_hex_escape_ascii(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\x22", + &array::from_fn::<_, { LEN }, _>(|i| (4 * i..4 * (i + 1), Ok(MixedUnit::Char('"')))), + ); } #[bench] -fn bench_unescape_c_str_unicode(b: &mut test::Bencher) { - bench_unescape_c_str(b, r"\u{1f980}", MixedUnit::Char('🦀')); +fn bench_unescape_c_str_hex_escape_byte(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\xff", + &array::from_fn::<_, { LEN }, _>(|i| { + (4 * i..4 * (i + 1), Ok(MixedUnit::HighByte(b'\xff'))) + }), + ); +} + +#[bench] +fn bench_unescape_c_str_unicode_escape(b: &mut test::Bencher) { + bench_unescape_c_str( + b, + r"\u{1f980}", + &array::from_fn::<_, { LEN }, _>(|i| (9 * i..9 * (i + 1), Ok(MixedUnit::Char('🦀')))), + ); +} + +#[bench] +fn bench_unescape_c_str_mixed_escape(b: &mut test::Bencher) { + let inputs = [r"\n", r"\x22", r"\u{1f980}", r"\u{1f680}", r"\xff"]; + let n = inputs.len(); + let input = inputs.join(""); + let l = input.len(); + bench_unescape_c_str( + b, + &input, + &iter::from_fn({ + let mut i = 0; + move || { + let res = Some(match i % n { + 0 => (i / n * l..i / n * l + 2, Ok(MixedUnit::Char('\n'))), + 1 => (i / n * l + 2..i / n * l + 6, Ok(MixedUnit::Char('"'))), + 2 => (i / n * l + 6..i / n * l + 15, Ok(MixedUnit::Char('🦀'))), + 3 => (i / n * l + 15..i / n * l + 24, Ok(MixedUnit::Char('🚀'))), + 4 => ( + i / n * l + 24..i / n * l + 28, + Ok(MixedUnit::HighByte(b'\xff')), + ), + r if r >= n => unreachable!(), + _ => unimplemented!(), + }); + i += 1; + res + } + }) + .take(n * LEN) + .collect::>(), + ); } diff --git a/src/lib.rs b/src/lib.rs index d315ed2..55299d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,17 +1,16 @@ -//! Utilities for validating string and char literals and turning them into -//! values they represent. +//! Utilities for validating (raw) string, char, and byte literals and +//! turning escape sequences into the values they represent. +use std::ffi::CStr; use std::ops::Range; use std::str::Chars; -use Mode::*; - #[cfg(test)] mod tests; -/// Errors and warnings that can occur during string unescaping. They mostly -/// relate to malformed escape sequences, but there are a few that are about -/// other problems. +/// Errors and warnings that can occur during string, char, and byte unescaping. +/// +/// Mostly relating to malformed escape sequences, but also a few other problems. #[derive(Debug, PartialEq, Eq)] pub enum EscapeError { /// Expected 1 char, but 0 were found. @@ -59,7 +58,7 @@ pub enum EscapeError { /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. NonAsciiCharInByte, - // `\0` in a C string literal. + /// `\0` in a C string literal. NulInCStr, /// After a line ending with '\', the next line contains whitespace @@ -80,33 +79,164 @@ impl EscapeError { } } -/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without -/// quotes) and produces a sequence of escaped characters or errors. +/// Check a raw string literal for validity /// -/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, -/// the callback will be called exactly once. -pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - Char | Byte => { - let mut chars = src.chars(); - let res = unescape_char_or_byte(&mut chars, mode); - callback(0..(src.len() - chars.as_str().len()), res); +/// Takes the contents of a raw string literal (without quotes) +/// and produces a sequence of characters or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). +pub fn check_raw_str(src: &str, callback: impl FnMut(Range, Result)) { + str::check_raw(src, callback); +} + +/// Check a raw byte string literal for validity +/// +/// Takes the contents of a raw byte string literal (without quotes) +/// and produces a sequence of bytes or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). +pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range, Result)) { + <[u8]>::check_raw(src, callback); +} + +/// Check a raw C string literal for validity +/// +/// Takes the contents of a raw C string literal (without quotes) +/// and produces a sequence of characters or errors, +/// which are returned by invoking `callback`. +/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). +pub fn check_raw_c_str(src: &str, callback: impl FnMut(Range, Result)) { + CStr::check_raw(src, callback); +} + +/// Trait for checking raw string literals for validity +trait CheckRaw { + /// Unit type of the implementing string type (`char` for string, `u8` for byte string) + type RawUnit; + + /// Converts chars to the unit type of the literal type + fn char2raw_unit(c: char) -> Result; + + /// Takes the contents of a raw literal (without quotes) + /// and produces a sequence of `Result` + /// which are returned via `callback`. + /// + /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r'). + fn check_raw( + src: &str, + mut callback: impl FnMut(Range, Result), + ) { + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + _ => Self::char2raw_unit(c), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), - RawStr | RawByteStr => check_raw_common(src, mode, callback), - RawCStr => check_raw_common(src, mode, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - CStr => unreachable!(), + + // Unfortunately, it is a bit unclear whether the following equivalent code is slower or faster: bug 141855 + // src.char_indices().for_each(|(pos, c)| { + // callback( + // pos..pos + c.len_utf8(), + // if c == '\r' { + // Err(EscapeError::BareCarriageReturnInRawString) + // } else { + // Self::char2raw_unit(c) + // }, + // ); + // }); + } +} + +impl CheckRaw for str { + type RawUnit = char; + + fn char2raw_unit(c: char) -> Result { + Ok(c) + } +} + +impl CheckRaw for [u8] { + type RawUnit = u8; + + fn char2raw_unit(c: char) -> Result { + char2byte(c) } } +/// Turn an ascii char into a byte +fn char2byte(c: char) -> Result { + // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte) + if c.is_ascii() { + Ok(c as u8) + } else { + Err(EscapeError::NonAsciiCharInByte) + } +} + +impl CheckRaw for CStr { + type RawUnit = char; + + fn char2raw_unit(c: char) -> Result { + if c == '\0' { + Err(EscapeError::NulInCStr) + } else { + Ok(c) + } + } +} + +/// Unescape a char literal +/// +/// Takes the contents of a char literal (without quotes), +/// and returns an unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + str::unescape_single(&mut src.chars()) +} + +/// Unescape a byte literal +/// +/// Takes the contents of a byte literal (without quotes), +/// and returns an unescaped byte or an error. +pub fn unescape_byte(src: &str) -> Result { + <[u8]>::unescape_single(&mut src.chars()) +} + +/// Unescape a string literal +/// +/// Takes the contents of a string literal (without quotes) +/// and produces a sequence of escaped characters or errors, +/// which are returned by invoking `callback`. +pub fn unescape_str(src: &str, callback: impl FnMut(Range, Result)) { + str::unescape(src, callback) +} + +/// Unescape a byte string literal +/// +/// Takes the contents of a byte string literal (without quotes) +/// and produces a sequence of escaped bytes or errors, +/// which are returned by invoking `callback`. +pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range, Result)) { + <[u8]>::unescape(src, callback) +} + +/// Unescape a C string literal +/// +/// Takes the contents of a C string literal (without quotes) +/// and produces a sequence of escaped MixedUnits or errors, +/// which are returned by invoking `callback`. +pub fn unescape_c_str( + src: &str, + callback: impl FnMut(Range, Result), +) { + CStr::unescape(src, callback) +} + +/// Enum representing either a char or a byte +/// /// Used for mixed utf8 string literals, i.e. those that allow both unicode /// chars and high bytes. #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -143,145 +273,133 @@ impl From for MixedUnit { } } -/// Takes the contents of a mixed-utf8 literal (without quotes) and produces -/// a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. -pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), - } -} +/// Trait for unescaping escape sequences in strings +trait Unescape { + /// Unit type of the implementing string type (`char` for string, `u8` for byte string) + type Unit: From; -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error. -pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Char) -} + /// Result of unescaping the zero char ('\0') + const ZERO_RESULT: Result; -/// Takes a contents of a byte literal (without quotes), and returns an -/// unescaped byte or an error. -pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) -} + /// Converts chars to the unit type + fn char2unit(c: char) -> Result; -/// What kind of literal do we parse. -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum Mode { - Char, - - Byte, - - Str, - RawStr, - - ByteStr, - RawByteStr, - - CStr, - RawCStr, -} - -impl Mode { - pub fn in_double_quotes(self) -> bool { - match self { - Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, - Char | Byte => false, - } - } + /// Converts the byte of a hex escape to the unit type + fn hex2unit(b: u8) -> Result; - /// Are `\x80`..`\xff` allowed? - fn allow_high_bytes(self) -> bool { - match self { - Char | Str => false, - Byte | ByteStr | CStr => true, - RawStr | RawByteStr | RawCStr => unreachable!(), - } - } + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result; - /// Are unicode (non-ASCII) chars allowed? - #[inline] - fn allow_unicode_chars(self) -> bool { - match self { - Byte | ByteStr | RawByteStr => false, - Char | Str | RawStr | CStr | RawCStr => true, + /// Unescape a single unit (single quote syntax) + fn unescape_single(chars: &mut Chars<'_>) -> Result { + let res = match chars.next().ok_or(EscapeError::ZeroChars)? { + '\\' => Self::unescape_1(chars), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Self::char2unit(c), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); } + Ok(res) } - /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { - match self { - Byte | ByteStr => false, - Char | Str | CStr => true, - RawByteStr | RawStr | RawCStr => unreachable!(), + /// Unescape the first unit of a string (double quoted syntax) + fn unescape_1(chars: &mut Chars<'_>) -> Result { + // Previous character was '\\', unescape what follows. + let c = chars.next().ok_or(EscapeError::LoneSlash)?; + if c == '0' { + Self::ZERO_RESULT + } else { + simple_escape(c).map(|b| b.into()).or_else(|c| match c { + 'x' => Self::hex2unit(hex_escape(chars)?), + 'u' => Self::unicode2unit({ + let value = unicode_escape(chars)?; + if value > char::MAX as u32 { + Err(EscapeError::OutOfRangeUnicodeEscape) + } else { + char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape) + } + }), + _ => Err(EscapeError::InvalidEscape), + }) } } - pub fn prefix_noraw(self) -> &'static str { - match self { - Char | Str | RawStr => "", - Byte | ByteStr | RawByteStr => "b", - CStr | RawCStr => "c", + /// Unescape a string literal + /// + /// Takes the contents of a raw string literal (without quotes) + /// and produces a sequence of `Result` + /// which are returned via `callback`. + fn unescape( + src: &str, + mut callback: impl FnMut(Range, Result), + ) { + let mut chars = src.chars(); + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\\' => { + if let Some(b'\n') = chars.as_str().as_bytes().first() { + let _ = chars.next(); + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } else { + Self::unescape_1(&mut chars) + } + } + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Self::char2unit(c), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); } } } -fn scan_escape + From>( - chars: &mut Chars<'_>, - mode: Mode, -) -> Result { +/// Interpret a non-nul ASCII escape +/// +/// Parses the character of an ASCII escape (except nul) without the leading backslash. +fn simple_escape(c: char) -> Result { // Previous character was '\\', unescape what follows. - let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', - 'x' => { - // Parse hexadecimal character code. - - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let value = (hi * 16 + lo) as u8; - - return if !mode.allow_high_bytes() && !value.is_ascii() { - Err(EscapeError::OutOfRangeHexEscape) - } else { - // This may be a high byte, but that will only happen if `T` is - // `MixedUnit`, because of the `allow_high_bytes` check above. - Ok(T::from(value)) - }; - } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(T::from(res)) + Ok(match c { + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + _ => Err(c)?, + }) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { - // We've parsed '\u', now we have to parse '{..}'. +/// Interpret a hexadecimal escape +/// +/// Parses the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +fn hex_escape(chars: &mut impl Iterator) -> Result { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + Ok((hi * 16 + lo) as u8) +} + +/// Interpret a unicode escape +/// +/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. +/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. +fn unicode_escape(chars: &mut impl Iterator) -> Result { if chars.next() != Some('{') { return Err(EscapeError::NoBraceInUnicodeEscape); } // First character must be a hexadecimal digit. - let mut n_digits = 1; let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), '}' => return Err(EscapeError::EmptyUnicodeEscape), @@ -292,28 +410,19 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result return Err(EscapeError::UnclosedUnicodeEscape), Some('_') => continue, Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. - if !allow_unicode_escapes { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or({ - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - }); + return if n_digits > 6 { + Err(EscapeError::OverlongUnicodeEscape) + } else { + Ok(value) + }; } Some(c) => { let digit: u32 = c @@ -330,122 +439,210 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { - Ok(c) - } else { - Err(EscapeError::NonAsciiCharInByte) +/// Interpret a string continuation escape (https://doc.rust-lang.org/reference/expressions/literal-expr.html#string-continuation-escapes) +/// +/// Skip ASCII whitespace, except for the formfeed character +/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)). +/// Warns on unescaped newline and following non-ASCII whitespace. +fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + let rest = chars.as_str(); + let first_non_space = rest + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(rest.len()); + let (space, rest) = rest.split_at(first_non_space); + // backslash newline adds 2 bytes + let end = start + 2 + first_non_space; + if space.contains('\n') { + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + *chars = rest.chars(); + if let Some(c) = chars.clone().next() { + if c.is_whitespace() { + // for error reporting, include the character that was not skipped in the span + callback( + start..end + c.len_utf8(), + EscapeError::UnskippedWhitespaceWarning, + ); + } } } -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - let c = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match c { - '\\' => scan_escape(chars, mode), - '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), - }?; - if chars.next().is_some() { - return Err(EscapeError::MoreThanOneChar); +impl Unescape for str { + type Unit = char; + + const ZERO_RESULT: Result = Ok('\0'); + + fn char2unit(c: char) -> Result { + Ok(c) + } + + fn hex2unit(b: u8) -> Result { + if b.is_ascii() { + Ok(b as char) + } else { + Err(EscapeError::OutOfRangeHexEscape) + } + } + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result { + r } - Ok(res) } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here is complicated because - // `skip_ascii_whitespace` makes us to skip over chars without counting - // them in the range computation. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\\' => { - match chars.clone().next() { - Some('\n') => { - // Rust language specification requires us to skip whitespaces - // if unescaped '\' character is followed by '\n'. - // For details see [Rust language reference] - // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, &mut |range, err| { - callback(range, Err(err)) - }); - continue; - } - _ => scan_escape::(&mut chars, mode), - } - } - '"' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); +impl Unescape for [u8] { + type Unit = u8; + + const ZERO_RESULT: Result = Ok(b'\0'); + + fn char2unit(c: char) -> Result { + char2byte(c) + } + + fn hex2unit(b: u8) -> Result { + Ok(b) + } + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(_r: Result) -> Result { + Err(EscapeError::UnicodeEscapeInByte) } } -fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) -where - F: FnMut(Range, EscapeError), -{ - let tail = chars.as_str(); - let first_non_space = tail - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. - let end = start + first_non_space + 1; - callback(start..end, EscapeError::MultipleSkippedLinesWarning); +impl Unescape for CStr { + type Unit = MixedUnit; + + const ZERO_RESULT: Result = Err(EscapeError::NulInCStr); + + fn char2unit(c: char) -> Result { + if c == '\0' { + Err(EscapeError::NulInCStr) + } else { + Ok(MixedUnit::Char(c)) + } } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().next() { - if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, EscapeError::UnskippedWhitespaceWarning); + + fn hex2unit(byte: u8) -> Result { + if byte == b'\0' { + Err(EscapeError::NulInCStr) + } else if byte.is_ascii() { + Ok(MixedUnit::Char(byte as char)) + } else { + Ok(MixedUnit::HighByte(byte)) } } - *chars = tail.chars(); + + /// Converts the result of a unicode escape to the unit type + fn unicode2unit(r: Result) -> Result { + Self::char2unit(r?) + } } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only produce errors on bare CR. -fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here matches the one in - // `unescape_non_raw_common` for consistency, even though this function - // doesn't have to worry about skipping any chars. - while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); +/// Enum of the different kinds of literal +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Mode { + /// `'a'` + Char, + + /// `b'a'` + Byte, + + /// `"hello"` + Str, + /// `r"hello"` + RawStr, + + /// `b"hello"` + ByteStr, + /// `br"hello"` + RawByteStr, + + /// `c"hello"` + CStr, + /// `cr"hello"` + RawCStr, +} + +impl Mode { + pub fn in_double_quotes(self) -> bool { + match self { + Mode::Str + | Mode::RawStr + | Mode::ByteStr + | Mode::RawByteStr + | Mode::CStr + | Mode::RawCStr => true, + Mode::Char | Mode::Byte => false, + } + } + + pub fn prefix_noraw(self) -> &'static str { + match self { + Mode::Char | Mode::Str | Mode::RawStr => "", + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b", + Mode::CStr | Mode::RawCStr => "c", + } } } -#[inline] -pub fn byte_from_char(c: char) -> u8 { - let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); - res as u8 +/// Check a literal only for errors +/// +/// Takes the contents of a literal (without quotes) +/// and produces a sequence of only errors, +/// which are returned by invoking `error_callback`. +/// +/// NB Does not produce any output other than errors +pub fn check_for_errors( + src: &str, + mode: Mode, + mut error_callback: impl FnMut(Range, EscapeError), +) { + match mode { + Mode::Char => { + let mut chars = src.chars(); + if let Err(e) = str::unescape_single(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } + } + Mode::Byte => { + let mut chars = src.chars(); + if let Err(e) = <[u8]>::unescape_single(&mut chars) { + error_callback(0..(src.len() - chars.as_str().len()), e); + } + } + Mode::Str => unescape_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::ByteStr => unescape_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::CStr => unescape_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawStr => check_raw_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawByteStr => check_raw_byte_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + Mode::RawCStr => check_raw_c_str(src, |range, res| { + if let Err(e) = res { + error_callback(range, e); + } + }), + } } diff --git a/src/tests.rs b/src/tests.rs index a4bbdc0..a13d8a5 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -100,9 +100,7 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| { - unescaped.push((range, res)) - }); + unescape_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -132,7 +130,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + unescape_str(literal_text, |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -248,16 +246,16 @@ fn test_unescape_byte_good() { #[test] fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { - let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { - if let Ok(b) = &mut buf { - match c { - Ok(c) => b.push(byte_from_char(c)), - Err(e) => buf = Err((range, e)), + let mut result = Ok(Vec::with_capacity(literal_text.len())); + unescape_byte_str(literal_text, |range, res| { + if let Ok(buf) = &mut result { + match res { + Ok(b) => buf.push(b), + Err(e) => result = Err((range, e)), } } }); - assert_eq!(buf.as_deref(), Ok(expected)) + assert_eq!(result.as_deref(), Ok(expected)) } check("foo", b"foo"); @@ -272,9 +270,7 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| { - unescaped.push((range, res)) - }); + check_raw_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -293,11 +289,9 @@ fn test_unescape_raw_str() { #[test] fn test_unescape_raw_byte_str() { - fn check(literal: &str, expected: &[(Range, Result)]) { + fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| { - unescaped.push((range, res)) - }); + check_raw_byte_str(literal, |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -310,7 +304,7 @@ fn test_unescape_raw_byte_str() { "🦀a", &[ (0..4, Err(EscapeError::NonAsciiCharInByte)), - (4..5, Ok('a')), + (4..5, Ok(b'a')), ], ); }