Skip to content

Separate the unescape functions but avoid duplicating code #138163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ impl LitKind {
// programs with many long strings containing escapes.
unescape_str(
s,
&mut #[inline(always)]
#[inline(always)]
|_, res| match res {
Ok(c) => buf.push(c),
Err(err) => {
Expand All @@ -110,7 +110,7 @@ impl LitKind {
token::ByteStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_byte_str(s, &mut |_, res| match res {
unescape_byte_str(s, |_, res| match res {
Ok(b) => buf.push(b),
Err(err) => {
assert!(!err.is_fatal(), "failed to unescape string literal")
Expand All @@ -127,7 +127,7 @@ impl LitKind {
token::CStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_cstr(s, &mut |_span, c| match c {
unescape_cstr(s, |_span, c| match c {
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.get().encode_utf8(&mut [0; 4]).as_bytes())
}
Expand Down
67 changes: 44 additions & 23 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ macro_rules! check {
" literal (without quotes) and produce a sequence of results of ",
stringify!($unit_ty), " or error (returned via `callback`).",
"\nNB: Raw strings don't do any unescaping, but do produce errors on bare CR.")]
pub fn $check(src: &str, callback: &mut impl FnMut(Range<usize>, Result<$unit, EscapeError>))
pub fn $check(src: &str, mut callback: impl FnMut(Range<usize>, Result<$unit, EscapeError>))
{
src.char_indices().for_each(|(pos, c)| {
callback(
Expand All @@ -162,7 +162,7 @@ macro_rules! unescape {
#[doc = concat!("Take the contents of a ", stringify!($string_ty),
" literal (without quotes) and produce a sequence of results of escaped ",
stringify!($unit_ty), " or error (returned via `callback`).")]
pub fn $unescape(src: &str, callback: &mut impl FnMut(Range<usize>, Result<$unit, EscapeError>))
pub fn $unescape(src: &str, mut callback: impl FnMut(Range<usize>, Result<$unit, EscapeError>))
{
let mut chars = src.chars();
while let Some(c) = chars.next() {
Expand Down Expand Up @@ -356,36 +356,57 @@ fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeE
}
}

/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without quotes)
/// and produces a sequence of unescaped characters or errors,
/// Takes the contents of a literal (without quotes)
/// and produces a sequence of errors,
/// which are returned by invoking `error_callback`.
///
/// For `Char` and `Byte` modes, the callback will be called at most once
/// (only when the literal contains an error).
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let mut byte_callback =
|range, res: Result<u8, EscapeError>| callback(range, res.map(char::from));
pub fn unescape_for_errors(
src: &str,
mode: Mode,
mut error_callback: impl FnMut(Range<usize>, EscapeError),
) {
match mode {
Char => {
let mut chars = src.chars();
let res = unescape_char_iter(&mut chars);
callback(0..(src.len() - chars.as_str().len()), res);
if let Err(e) = unescape_char_iter(&mut chars) {
error_callback(0..(src.len() - chars.as_str().len()), e);
}
}
Byte => {
let mut chars = src.chars();
let res = unescape_byte_iter(&mut chars).map(char::from);
callback(0..(src.len() - chars.as_str().len()), res);
if let Err(e) = unescape_byte_iter(&mut chars) {
error_callback(0..(src.len() - chars.as_str().len()), e);
}
}
Str => unescape_str(src, callback),
ByteStr => unescape_byte_str(src, &mut byte_callback),
RawStr => check_raw_str(src, callback),
RawByteStr => check_raw_byte_str(src, &mut byte_callback),
RawCStr => check_raw_cstr(src, &mut |r, res: Result<NonZero<char>, EscapeError>| {
callback(r, res.map(|c| c.get()))
Str => unescape_str(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
ByteStr => unescape_byte_str(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
CStr => unescape_cstr(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
RawStr => check_raw_str(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
RawByteStr => check_raw_byte_str(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
RawCStr => check_raw_cstr(src, |range, res| {
if let Err(e) = res {
error_callback(range, e);
}
}),
CStr => unreachable!(),
}
}

Expand Down
14 changes: 7 additions & 7 deletions compiler/rustc_lexer/src/unescape/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ fn test_unescape_char_good() {
fn test_unescape_str_warn() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
unescape_str(literal, |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

Expand All @@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
fn test_unescape_str_good() {
fn check(literal_text: &str, expected: &str) {
let mut buf = Ok(String::with_capacity(literal_text.len()));
unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
unescape_str(literal_text, |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c),
Expand Down Expand Up @@ -241,7 +241,7 @@ fn test_unescape_byte_good() {
fn test_unescape_byte_str_good() {
fn check(literal_text: &str, expected: &[u8]) {
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
unescape_byte_str(literal_text, |range, c| {
if let Ok(b) = &mut buf {
match c {
Ok(c) => b.push(c as u8),
Expand All @@ -264,7 +264,7 @@ fn test_unescape_byte_str_good() {
fn test_unescape_raw_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
check_raw_str(literal, |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

Expand All @@ -274,13 +274,13 @@ fn test_unescape_raw_str() {

#[test]
fn test_unescape_raw_byte_str() {
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
fn check(literal: &str, expected: &[(Range<usize>, Result<u8, EscapeError>)]) {
let mut unescaped = Vec::with_capacity(literal.len());
unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
check_raw_byte_str(literal, |range, res| unescaped.push((range, res)));
assert_eq!(unescaped, expected);
}

check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]);
check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]);
check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok(b'a'))]);
}
90 changes: 27 additions & 63 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::Range;

use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
use rustc_ast::tokenstream::TokenStream;
Expand Down Expand Up @@ -525,7 +523,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
}
err.emit()
}
self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
}
rustc_lexer::LiteralKind::Byte { terminated } => {
if !terminated {
Expand All @@ -537,7 +535,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
.with_code(E0763)
.emit()
}
self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
}
rustc_lexer::LiteralKind::Str { terminated } => {
if !terminated {
Expand All @@ -549,7 +547,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
.with_code(E0765)
.emit()
}
self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
}
rustc_lexer::LiteralKind::ByteStr { terminated } => {
if !terminated {
Expand All @@ -561,7 +559,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
.with_code(E0766)
.emit()
}
self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
}
rustc_lexer::LiteralKind::CStr { terminated } => {
if !terminated {
Expand All @@ -573,13 +571,13 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
.with_code(E0767)
.emit()
}
self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
}
rustc_lexer::LiteralKind::RawStr { n_hashes } => {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::StrRaw(n_hashes);
self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
} else {
self.report_raw_str_error(start, 1);
}
Expand All @@ -588,7 +586,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::ByteStrRaw(n_hashes);
self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
} else {
self.report_raw_str_error(start, 2);
}
Expand All @@ -597,7 +595,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::CStrRaw(n_hashes);
self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
} else {
self.report_raw_str_error(start, 2);
}
Expand Down Expand Up @@ -913,40 +911,36 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
}

fn cook_common(
fn cook_quoted(
&self,
mut kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
) -> (token::LitKind, Symbol) {
let content_start = start + BytePos(prefix_len);
let content_end = end - BytePos(postfix_len);
let lit_content = self.str_from_to(content_start, content_end);
unescape(lit_content, mode, &mut |range, result| {
// Here we only check for errors. The actual unescaping is done later.
if let Err(err) = result {
let span_with_quotes = self.mk_sp(start, end);
let (start, end) = (range.start as u32, range.end as u32);
let lo = content_start + BytePos(start);
let hi = lo + BytePos(end - start);
let span = self.mk_sp(lo, hi);
let is_fatal = err.is_fatal();
if let Some(guar) = emit_unescape_error(
self.dcx(),
lit_content,
span_with_quotes,
span,
mode,
range,
err,
) {
assert!(is_fatal);
kind = token::Err(guar);
}
unescape::unescape_for_errors(lit_content, mode, |range, err| {
let span_with_quotes = self.mk_sp(start, end);
let (start, end) = (range.start as u32, range.end as u32);
let lo = content_start + BytePos(start);
let hi = lo + BytePos(end - start);
let span = self.mk_sp(lo, hi);
let is_fatal = err.is_fatal();
if let Some(guar) = emit_unescape_error(
self.dcx(),
lit_content,
span_with_quotes,
span,
mode,
range,
err,
) {
assert!(is_fatal);
kind = token::Err(guar);
}
});

Expand All @@ -959,36 +953,6 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
};
(kind, sym)
}

fn cook_unicode(
&self,
kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_unicode(src, mode, &mut |span, result| {
callback(span, result.map(drop))
})
})
}

fn cook_mixed(
&self,
kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, _mode, callback| {
unescape::unescape_cstr(src, &mut |span, result| callback(span, result.map(drop)))
})
}
}

pub fn nfc_normalize(string: &str) -> Symbol {
Expand Down
8 changes: 3 additions & 5 deletions compiler/rustc_parse_format/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1094,11 +1094,9 @@ fn find_width_map_from_snippet(
fn unescape_string(string: &str) -> Option<String> {
let mut buf = String::new();
let mut ok = true;
unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => ok = false,
}
unescape::unescape_str(string, &mut |_, res| match res {
Ok(c) => buf.push(c),
Err(_) => ok = false,
});

ok.then_some(buf)
Expand Down
2 changes: 1 addition & 1 deletion src/tools/clippy/clippy_dev/src/update_lints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,7 @@ fn remove_line_splices(s: &str) -> String {
.and_then(|s| s.strip_suffix('"'))
.unwrap_or_else(|| panic!("expected quoted string, found `{s}`"));
let mut res = String::with_capacity(s.len());
unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| {
unescape::unescape_str(s, |range, ch| {
if ch.is_ok() {
res.push_str(&s[range]);
}
Expand Down
2 changes: 1 addition & 1 deletion src/tools/rust-analyzer/crates/hir-expand/src/attrs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ fn unescape(s: &str) -> Option<Cow<'_, str>> {
let mut buf = String::new();
let mut prev_end = 0;
let mut has_error = false;
unescape::unescape_unicode(s, unescape::Mode::Str, &mut |char_range, unescaped_char| match (
unescape::unescape_str(s, |char_range, unescaped_char| match (
unescaped_char,
buf.capacity() == 0,
) {
Expand Down
Loading
Loading