From 707a6f54631c322e8c8ccff363fe024d67d93aa2 Mon Sep 17 00:00:00 2001
From: Marijn Schouten
Date: Fri, 7 Mar 2025 11:17:39 +0000
Subject: [PATCH] update to literal-escaper 0.0.4 for better API without
 `unreachable` and faster string parsing

---
 Cargo.lock                                |  4 +-
 Cargo.toml                                |  5 +
 compiler/rustc_ast/Cargo.toml             |  2 +-
 compiler/rustc_ast/src/util/literal.rs    | 15 ++-
 compiler/rustc_parse/Cargo.toml           |  2 +-
 compiler/rustc_parse/src/lexer/mod.rs     | 94 +++++++------------
 compiler/rustc_parse_format/Cargo.toml    |  2 +-
 compiler/rustc_parse_format/src/lib.rs    |  3 +-
 compiler/rustc_proc_macro/Cargo.toml      |  2 +-
 library/Cargo.lock                        |  5 +-
 library/Cargo.toml                        |  3 +-
 library/proc_macro/Cargo.toml             |  2 +-
 library/proc_macro/src/lib.rs             | 13 ++-
 .../clippy/clippy_dev/src/update_lints.rs |  3 +-
 src/tools/lint-docs/Cargo.toml            |  2 +-
 src/tools/lint-docs/src/lib.rs            |  4 +-
 16 files changed, 67 insertions(+), 94 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index df2842bddb386..4fc363a04e228 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3166,9 +3166,9 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
 
 [[package]]
 name = "rustc-literal-escaper"
-version = "0.0.2"
+version = "0.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0041b6238913c41fe704213a4a9329e2f685a156d1781998128b4149c230ad04"
+checksum = "ab03008eb631b703dd16978282ae36c73282e7922fe101a4bd072a40ecea7b8b"
 
 [[package]]
 name = "rustc-main"
diff --git a/Cargo.toml b/Cargo.toml
index c4d2a06f4cb17..bfe95a82ed2bf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -89,3 +89,8 @@ codegen-units = 1
 # FIXME: LTO cannot be enabled for binaries in a workspace
 #
 # lto = true
+
+# If you want to use a crate with local modifications, you can set a path or git dependency here.
+# For git dependencies, also add your source to ALLOWED_SOURCES in src/tools/tidy/src/extdeps.rs.
+#[patch.crates-io]
+
diff --git a/compiler/rustc_ast/Cargo.toml b/compiler/rustc_ast/Cargo.toml
index b2d3b90fc4494..5de2e69072fa7 100644
--- a/compiler/rustc_ast/Cargo.toml
+++ b/compiler/rustc_ast/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2024"
 # tidy-alphabetical-start
 bitflags = "2.4.1"
 memchr = "2.7.4"
-rustc-literal-escaper = "0.0.2"
+rustc-literal-escaper = "0.0.4"
 rustc_ast_ir = { path = "../rustc_ast_ir" }
 rustc_data_structures = { path = "../rustc_data_structures" }
 rustc_index = { path = "../rustc_index" }
diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index b8526cf9d9529..ad9e5d1468b09 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -3,7 +3,7 @@
 use std::{ascii, fmt, str};
 
 use rustc_literal_escaper::{
-    MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode,
+    MixedUnit, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str,
 };
 use rustc_span::{Span, Symbol, kw, sym};
 use tracing::debug;
@@ -87,11 +87,10 @@ impl LitKind {
                     // Force-inlining here is aggressive but the closure is
                     // called on every char in the string, so it can be hot in
                     // programs with many long strings containing escapes.
-                    unescape_unicode(
+                    unescape_str(
                         s,
-                        Mode::Str,
-                        &mut #[inline(always)]
-                        |_, c| match c {
+                        #[inline(always)]
+                        |_, res| match res {
                             Ok(c) => buf.push(c),
                             Err(err) => {
                                 assert!(!err.is_fatal(), "failed to unescape string literal")
@@ -111,8 +110,8 @@ impl LitKind {
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
-                    Ok(c) => buf.push(byte_from_char(c)),
+                unescape_byte_str(s, |_, res| match res {
+                    Ok(b) => buf.push(b),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
                     }
@@ -128,7 +127,7 @@ impl LitKind {
             token::CStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
+                unescape_c_str(s, |_span, c| match c {
                     Ok(MixedUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
diff --git a/compiler/rustc_parse/Cargo.toml b/compiler/rustc_parse/Cargo.toml
index 6504081f0b9ce..c4a0ae2ce9dd9 100644
--- a/compiler/rustc_parse/Cargo.toml
+++ b/compiler/rustc_parse/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2024"
 [dependencies]
 # tidy-alphabetical-start
 bitflags = "2.4.1"
-rustc-literal-escaper = "0.0.2"
+rustc-literal-escaper = "0.0.4"
 rustc_ast = { path = "../rustc_ast" }
 rustc_ast_pretty = { path = "../rustc_ast_pretty" }
 rustc_data_structures = { path = "../rustc_data_structures" }
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 2845bbed1c0ee..60d275bf2b402 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -1,5 +1,3 @@
-use std::ops::Range;
-
 use diagnostics::make_unclosed_delims_error;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
@@ -10,7 +8,7 @@ use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
 use rustc_lexer::{
     Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
 };
-use rustc_literal_escaper::{EscapeError, Mode, unescape_mixed, unescape_unicode};
+use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
 use rustc_session::lint::BuiltinLintDiag;
 use rustc_session::lint::builtin::{
     RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
@@ -702,7 +700,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                     }
                     err.emit()
                 }
-                self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
+                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
             }
             rustc_lexer::LiteralKind::Byte { terminated } => {
                 if !terminated {
@@ -714,7 +712,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                         .with_code(E0763)
                         .emit()
                 }
-                self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
+                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
             }
             rustc_lexer::LiteralKind::Str { terminated } => {
                 if !terminated {
@@ -726,7 +724,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                        .with_code(E0765)
                        .emit()
                 }
-                self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
+                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
             }
             rustc_lexer::LiteralKind::ByteStr { terminated } => {
                 if !terminated {
@@ -738,7 +736,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                         .with_code(E0766)
                         .emit()
                 }
-                self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
+                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
+                // b" "
             }
             rustc_lexer::LiteralKind::CStr { terminated } => {
                 if !terminated {
@@ -750,13 +749,14 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                         .with_code(E0767)
                         .emit()
                 }
-                self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
+                self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
             }
             rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::StrRaw(n_hashes);
-                    self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
+                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
+                    // r##" "##
                 } else {
                     self.report_raw_str_error(start, 1);
                 }
@@ -765,7 +765,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::ByteStrRaw(n_hashes);
-                    self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
+                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
+                    // br##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -774,7 +775,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::CStrRaw(n_hashes);
-                    self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
+                    self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
+                    // cr##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -1091,7 +1093,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
         self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
     }
 
-    fn cook_common(
+    fn cook_quoted(
        &self,
         mut kind: token::LitKind,
         mode: Mode,
@@ -1099,32 +1101,28 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
         end: BytePos,
         prefix_len: u32,
         postfix_len: u32,
-        unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
     ) -> (token::LitKind, Symbol) {
         let content_start = start + BytePos(prefix_len);
         let content_end = end - BytePos(postfix_len);
         let lit_content = self.str_from_to(content_start, content_end);
 
-        unescape(lit_content, mode, &mut |range, result| {
-            // Here we only check for errors. The actual unescaping is done later.
-            if let Err(err) = result {
-                let span_with_quotes = self.mk_sp(start, end);
-                let (start, end) = (range.start as u32, range.end as u32);
-                let lo = content_start + BytePos(start);
-                let hi = lo + BytePos(end - start);
-                let span = self.mk_sp(lo, hi);
-                let is_fatal = err.is_fatal();
-                if let Some(guar) = emit_unescape_error(
-                    self.dcx(),
-                    lit_content,
-                    span_with_quotes,
-                    span,
-                    mode,
-                    range,
-                    err,
-                ) {
-                    assert!(is_fatal);
-                    kind = token::Err(guar);
-                }
+        check_for_errors(lit_content, mode, |range, err| {
+            let span_with_quotes = self.mk_sp(start, end);
+            let (start, end) = (range.start as u32, range.end as u32);
+            let lo = content_start + BytePos(start);
+            let hi = lo + BytePos(end - start);
+            let span = self.mk_sp(lo, hi);
+            let is_fatal = err.is_fatal();
+            if let Some(guar) = emit_unescape_error(
+                self.dcx(),
+                lit_content,
+                span_with_quotes,
+                span,
+                mode,
+                range,
+                err,
+            ) {
+                assert!(is_fatal);
+                kind = token::Err(guar);
             }
         });
@@ -1137,34 +1135,6 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
         };
         (kind, sym)
     }
-
-    fn cook_unicode(
-        &self,
-        kind: token::LitKind,
-        mode: Mode,
-        start: BytePos,
-        end: BytePos,
-        prefix_len: u32,
-        postfix_len: u32,
-    ) -> (token::LitKind, Symbol) {
-        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape_unicode(src, mode, &mut |span, result| callback(span, result.map(drop)))
-        })
-    }
-
-    fn cook_mixed(
-        &self,
-        kind: token::LitKind,
-        mode: Mode,
-        start: BytePos,
-        end: BytePos,
-        prefix_len: u32,
-        postfix_len: u32,
-    ) -> (token::LitKind, Symbol) {
-        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape_mixed(src, mode, &mut |span, result| callback(span, result.map(drop)))
-        })
-    }
 }
 
 pub fn nfc_normalize(string: &str) -> Symbol {
diff --git a/compiler/rustc_parse_format/Cargo.toml b/compiler/rustc_parse_format/Cargo.toml
index 52f23c00d4bc0..0666ae2940928 100644
--- a/compiler/rustc_parse_format/Cargo.toml
+++ b/compiler/rustc_parse_format/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2024"
 
 [dependencies]
 # tidy-alphabetical-start
-rustc-literal-escaper = "0.0.2"
+rustc-literal-escaper = "0.0.4"
 rustc_lexer = { path = "../rustc_lexer" }
 # tidy-alphabetical-end
 
diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs
index 42bd0f5d847f7..8e4da7923fcb5 100644
--- a/compiler/rustc_parse_format/src/lib.rs
+++ b/compiler/rustc_parse_format/src/lib.rs
@@ -20,7 +20,6 @@ use std::ops::Range;
 
 pub use Alignment::*;
 pub use Count::*;
 pub use Position::*;
-use rustc_literal_escaper::{Mode, unescape_unicode};
 
 /// The type of format string that we are parsing.
 #[derive(Copy, Clone, Debug, Eq, PartialEq)]
@@ -320,7 +319,7 @@ impl<'input> Parser<'input> {
         let without_quotes = &snippet[1..snippet.len() - 1];
         let (mut ok, mut vec) = (true, vec![]);
         let mut chars = input.chars();
-        unescape_unicode(without_quotes, Mode::Str, &mut |range, res| match res {
+        rustc_literal_escaper::unescape_str(without_quotes, |range, res| match res {
             Ok(ch) if ok && chars.next().is_some_and(|c| ch == c) => {
                 vec.push((range, ch));
             }
diff --git a/compiler/rustc_proc_macro/Cargo.toml b/compiler/rustc_proc_macro/Cargo.toml
index 4a7c0d78ede81..748fa944e286d 100644
--- a/compiler/rustc_proc_macro/Cargo.toml
+++ b/compiler/rustc_proc_macro/Cargo.toml
@@ -15,7 +15,7 @@ test = false
 doctest = false
 
 [dependencies]
-rustc-literal-escaper = "0.0.2"
+rustc-literal-escaper = "0.0.4"
 
 [features]
 rustc-dep-of-std = []
diff --git a/library/Cargo.lock b/library/Cargo.lock
index 1bd97e7b5273b..522a81325fb13 100644
--- a/library/Cargo.lock
+++ b/library/Cargo.lock
@@ -273,10 +273,11 @@ dependencies = [
 
 [[package]]
 name = "rustc-literal-escaper"
-version = "0.0.2"
+version = "0.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0041b6238913c41fe704213a4a9329e2f685a156d1781998128b4149c230ad04"
+checksum = "ab03008eb631b703dd16978282ae36c73282e7922fe101a4bd072a40ecea7b8b"
 dependencies = [
+ "rustc-std-workspace-core",
  "rustc-std-workspace-std",
 ]
 
diff --git a/library/Cargo.toml b/library/Cargo.toml
index 35480b9319d7d..2fbc0775c3273 100644
--- a/library/Cargo.toml
+++ b/library/Cargo.toml
@@ -45,8 +45,7 @@ rustc-demangle.debug = 0
 rustc-demangle.opt-level = "s"
 
 [patch.crates-io]
-# See comments in `library/rustc-std-workspace-core/README.md` for what's going on
-# here
+# See comments in `library/rustc-std-workspace-core/README.md` for what's going on here
 rustc-std-workspace-core = { path = 'rustc-std-workspace-core' }
 rustc-std-workspace-alloc = { path = 'rustc-std-workspace-alloc' }
 rustc-std-workspace-std = { path = 'rustc-std-workspace-std' }
diff --git a/library/proc_macro/Cargo.toml b/library/proc_macro/Cargo.toml
index 1d79246356a35..8ea92088a84ab 100644
--- a/library/proc_macro/Cargo.toml
+++ b/library/proc_macro/Cargo.toml
@@ -9,7 +9,7 @@ std = { path = "../std" }
 # `core` when resolving doc links. Without this line a different `core` will be
 # loaded from sysroot causing duplicate lang items and other similar errors.
 core = { path = "../core" }
-rustc-literal-escaper = { version = "0.0.2", features = ["rustc-dep-of-std"] }
+rustc-literal-escaper = { version = "0.0.4", features = ["rustc-dep-of-std"] }
 
 [features]
 default = ["rustc-dep-of-std"]
diff --git a/library/proc_macro/src/lib.rs b/library/proc_macro/src/lib.rs
index 32c306be94ecd..652aa05d6f1f6 100644
--- a/library/proc_macro/src/lib.rs
+++ b/library/proc_macro/src/lib.rs
@@ -55,7 +55,7 @@ use std::{error, fmt};
 
 pub use diagnostic::{Diagnostic, Level, MultiSpan};
 #[unstable(feature = "proc_macro_value", issue = "136652")]
 pub use rustc_literal_escaper::EscapeError;
-use rustc_literal_escaper::{MixedUnit, Mode, byte_from_char, unescape_mixed, unescape_unicode};
+use rustc_literal_escaper::{MixedUnit, unescape_byte_str, unescape_c_str, unescape_str};
 #[unstable(feature = "proc_macro_totokens", issue = "130977")]
 pub use to_tokens::ToTokens;
@@ -1439,10 +1439,9 @@ impl Literal {
                     // Force-inlining here is aggressive but the closure is
                     // called on every char in the string, so it can be hot in
                     // programs with many long strings containing escapes.
-                    unescape_unicode(
+                    unescape_str(
                         symbol,
-                        Mode::Str,
-                        &mut #[inline(always)]
+                        #[inline(always)]
                         |_, c| match c {
                             Ok(c) => buf.push(c),
                             Err(err) => {
@@ -1471,7 +1470,7 @@ impl Literal {
             let mut error = None;
             let mut buf = Vec::with_capacity(symbol.len());
 
-            unescape_mixed(symbol, Mode::CStr, &mut |_span, c| match c {
+            unescape_c_str(symbol, |_span, c| match c {
                 Ok(MixedUnit::Char(c)) => {
                     buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                 }
@@ -1510,8 +1509,8 @@ impl Literal {
             let mut buf = Vec::with_capacity(symbol.len());
             let mut error = None;
 
-            unescape_unicode(symbol, Mode::ByteStr, &mut |_, c| match c {
-                Ok(c) => buf.push(byte_from_char(c)),
+            unescape_byte_str(symbol, |_, res| match res {
+                Ok(b) => buf.push(b),
                 Err(err) => {
                     if err.is_fatal() {
                         error = Some(ConversionErrorKind::FailedToUnescape(err));
diff --git a/src/tools/clippy/clippy_dev/src/update_lints.rs b/src/tools/clippy/clippy_dev/src/update_lints.rs
index 08592f2521f7d..3b827cc5603e5 100644
--- a/src/tools/clippy/clippy_dev/src/update_lints.rs
+++ b/src/tools/clippy/clippy_dev/src/update_lints.rs
@@ -2,6 +2,7 @@ use crate::utils::{
     ErrAction, File, FileUpdater, RustSearcher, Token, UpdateMode, UpdateStatus, expect_action, update_text_region_fn,
 };
 use itertools::Itertools;
+use rustc_lexer::{LiteralKind, TokenKind, tokenize};
 use std::collections::HashSet;
 use std::fmt::Write;
 use std::ops::Range;
@@ -342,7 +343,7 @@ fn parse_str_lit(s: &str) -> String {
         .and_then(|s| s.strip_suffix('"'))
         .unwrap_or_else(|| panic!("expected quoted string, found `{s}`"));
     let mut res = String::with_capacity(s.len());
-    rustc_literal_escaper::unescape_unicode(s, mode, &mut |_, ch| {
+    rustc_literal_escaper::unescape_str(s, |range, ch| {
         if let Ok(ch) = ch {
             res.push(ch);
         }
diff --git a/src/tools/lint-docs/Cargo.toml b/src/tools/lint-docs/Cargo.toml
index f1ffda75ac0f7..e914a2df2badc 100644
--- a/src/tools/lint-docs/Cargo.toml
+++ b/src/tools/lint-docs/Cargo.toml
@@ -7,7 +7,7 @@ description = "A script to extract the lint documentation for the rustc book."
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-rustc-literal-escaper = "0.0.2"
+rustc-literal-escaper = "0.0.4"
 serde_json = "1.0.57"
 tempfile = "3.1.0"
 walkdir = "2.3.1"
diff --git a/src/tools/lint-docs/src/lib.rs b/src/tools/lint-docs/src/lib.rs
index 6bb18c2bced70..b33344ca5dda4 100644
--- a/src/tools/lint-docs/src/lib.rs
+++ b/src/tools/lint-docs/src/lib.rs
@@ -4,7 +4,7 @@ use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 
-use rustc_literal_escaper::{Mode, unescape_unicode};
+use rustc_literal_escaper::unescape_str;
 use walkdir::WalkDir;
 
 mod groups;
@@ -218,7 +218,7 @@ impl<'a> LintExtractor<'a> {
             } else if let Some(text) = line.strip_prefix("#[doc = \"") {
                 let escaped = text.strip_suffix("\"]").unwrap();
                 let mut buf = String::new();
-                unescape_unicode(escaped, Mode::Str, &mut |_, c| match c {
+                unescape_str(escaped, |_, res| match res {
                     Ok(c) => buf.push(c),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
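
The sketch below is not part of the patch; it only illustrates the rustc-literal-escaper API shape that the hunks above migrate to. In 0.0.2 callers went through the mode-driven `unescape_unicode`/`unescape_mixed` entry points (plus `byte_from_char` for byte strings), while 0.0.4 exposes per-literal-kind functions (`unescape_str`, `unescape_byte_str`, `unescape_c_str`, and `check_for_errors` for the lexer's error-only pass) whose callbacks receive already-decoded values. The function names and callback shapes are taken from the diff; the helper functions and comments below are illustrative assumptions, not code from the patch.

    // Minimal sketch, assuming rustc-literal-escaper 0.0.4 as used by this patch.
    // The helpers `unescape_to_string` and `unescape_to_bytes` are hypothetical.
    use rustc_literal_escaper::{EscapeError, unescape_byte_str, unescape_str};

    // Collect a string literal's contents, recording a fatal error if one occurs,
    // in the same style as the library/proc_macro/src/lib.rs hunk.
    fn unescape_to_string(contents: &str) -> Result<String, EscapeError> {
        let mut buf = String::with_capacity(contents.len());
        let mut error = None;
        // 0.0.4 hands the decoded `char` straight to the callback; with 0.0.2 the
        // caller had to pick the right `Mode` and narrow the generic result itself.
        unescape_str(contents, |_range, res| match res {
            Ok(c) => buf.push(c),
            Err(err) if err.is_fatal() => error = Some(err),
            Err(_) => {} // non-fatal problems are diagnosed elsewhere
        });
        match error {
            Some(err) => Err(err),
            None => Ok(buf),
        }
    }

    // Byte-string callbacks now yield `u8` directly, so the `byte_from_char`
    // conversion removed throughout this patch is no longer needed.
    fn unescape_to_bytes(contents: &str) -> Vec<u8> {
        let mut buf = Vec::with_capacity(contents.len());
        unescape_byte_str(contents, |_range, res| {
            if let Ok(b) = res {
                buf.push(b);
            }
        });
        buf
    }

A caller that needs C-string semantics would drive `unescape_c_str` the same way, matching on `MixedUnit::Char` versus the raw-byte variant as the literal.rs and proc_macro hunks do.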