Commit dbd2f30
Rollup merge of #143708 - epage:pretty, r=compiler-errors
fix: Include frontmatter in `-Zunpretty` output

In the implementation (#140035), this was left as an open question for the tracking issue (#136889). My assumption is that the frontmatter should be carried over into the output. The test was carried over from #137193, which was superseded by #140035. Thankfully, `-Zunpretty` is unstable either way, so we can always change this even if we stabilize frontmatter.
2 parents 140f2fa + a11ee56 commit dbd2f30
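
For context, "frontmatter" is the metadata block at the top of a single-file Cargo package ("cargo script"), the feature tracked in #136889. A minimal example of an affected input, mirroring the lexer tests added below (the `clap` dependency is purely illustrative):

---cargo
[dependencies]
clap = "4"
---

fn main() {}

Before this change the block was missing from `-Zunpretty` output; with it, the pretty-printer carries the frontmatter through verbatim.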

File tree

14 files changed: +157 −39 lines changed

compiler/rustc_ast_pretty/src/pprust/state.rs

Lines changed: 9 additions & 1 deletion
@@ -120,7 +120,7 @@ fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
         pos += shebang_len;
     }
 
-    for token in rustc_lexer::tokenize(&text[pos..]) {
+    for token in rustc_lexer::tokenize(&text[pos..], rustc_lexer::FrontmatterAllowed::Yes) {
         let token_text = &text[pos..pos + token.len as usize];
         match token.kind {
             rustc_lexer::TokenKind::Whitespace => {
@@ -171,6 +171,14 @@ fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
                     })
                 }
             }
+            rustc_lexer::TokenKind::Frontmatter { .. } => {
+                code_to_the_left = false;
+                comments.push(Comment {
+                    style: CommentStyle::Isolated,
+                    lines: vec![token_text.to_string()],
+                    pos: start_bpos + BytePos(pos as u32),
+                });
+            }
             _ => {
                 code_to_the_left = true;
             }
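
The effect of these two hunks: `gather_comments` now lexes with frontmatter enabled and surfaces the whole block as a single `Comment` with `CommentStyle::Isolated` at its original position, so the pretty-printer emits it verbatim rather than losing it. This is the core of the fix; the remaining files are API fallout from the new `tokenize` parameter.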

compiler/rustc_lexer/src/lib.rs

Lines changed: 19 additions & 10 deletions
@@ -273,14 +273,15 @@ pub fn strip_shebang(input: &str) -> Option<usize>
     if let Some(input_tail) = input.strip_prefix("#!") {
         // Ok, this is a shebang but if the next non-whitespace token is `[`,
         // then it may be valid Rust code, so consider it Rust code.
-        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
-            !matches!(
-                tok,
-                TokenKind::Whitespace
-                    | TokenKind::LineComment { doc_style: None }
-                    | TokenKind::BlockComment { doc_style: None, .. }
-            )
-        });
+        let next_non_whitespace_token =
+            tokenize(input_tail, FrontmatterAllowed::No).map(|tok| tok.kind).find(|tok| {
+                !matches!(
+                    tok,
+                    TokenKind::Whitespace
+                        | TokenKind::LineComment { doc_style: None }
+                        | TokenKind::BlockComment { doc_style: None, .. }
+                )
+            });
         if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
             // No other choice than to consider this a shebang.
             return Some(2 + input_tail.lines().next().unwrap_or_default().len());
@@ -303,8 +304,16 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 }
 
 /// Creates an iterator that produces tokens from the input string.
-pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
-    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
+///
+/// When parsing a full Rust document,
+/// first [`strip_shebang`] and then allow frontmatters with [`FrontmatterAllowed::Yes`].
+///
+/// When tokenizing a slice of a document, be sure to disallow frontmatters with [`FrontmatterAllowed::No`].
+pub fn tokenize(
+    input: &str,
+    frontmatter_allowed: FrontmatterAllowed,
+) -> impl Iterator<Item = Token> {
+    let mut cursor = Cursor::new(input, frontmatter_allowed);
     std::iter::from_fn(move || {
         let token = cursor.advance_token();
         if token.kind != TokenKind::Eof { Some(token) } else { None }
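
To make the new calling convention concrete, here is a minimal sketch of a caller, assuming a `rustc_lexer` build that includes this change (the `lex_document` helper is hypothetical):

use rustc_lexer::{strip_shebang, tokenize, FrontmatterAllowed};

/// Lex a complete source file the way the doc comment above recommends:
/// strip any shebang first, then allow a frontmatter block at the start
/// of what remains.
fn lex_document(src: &str) {
    let start = strip_shebang(src).unwrap_or(0);
    for token in tokenize(&src[start..], FrontmatterAllowed::Yes) {
        println!("{token:?}");
    }
}

Callers that lex an arbitrary sub-slice of a file pass `FrontmatterAllowed::No` instead, since a frontmatter is only valid at the very top of a document.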

compiler/rustc_lexer/src/tests.rs

Lines changed: 85 additions & 2 deletions
@@ -124,15 +124,17 @@ fn test_valid_shebang() {
     assert_eq!(strip_shebang(input), None);
 }
 
-fn check_lexing(src: &str, expect: Expect) {
-    let actual: String = tokenize(src).map(|token| format!("{:?}\n", token)).collect();
+fn check_lexing(src: &str, frontmatter_allowed: FrontmatterAllowed, expect: Expect) {
+    let actual: String =
+        tokenize(src, frontmatter_allowed).map(|token| format!("{:?}\n", token)).collect();
     expect.assert_eq(&actual)
 }
 
 #[test]
 fn smoke_test() {
     check_lexing(
         "/* my source file */ fn main() { println!(\"zebra\"); }\n",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: BlockComment { doc_style: None, terminated: true }, len: 20 }
             Token { kind: Whitespace, len: 1 }
@@ -171,6 +173,7 @@ fn comment_flavors()
 /** outer doc block */
 /*! inner doc block */
 ",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: Whitespace, len: 1 }
             Token { kind: LineComment { doc_style: None }, len: 7 }
@@ -199,6 +202,7 @@ fn comment_flavors()
 fn nested_block_comments() {
     check_lexing(
         "/* /* */ */'a'",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: BlockComment { doc_style: None, terminated: true }, len: 11 }
             Token { kind: Literal { kind: Char { terminated: true }, suffix_start: 3 }, len: 3 }
@@ -210,6 +214,7 @@ fn nested_block_comments()
 fn characters() {
     check_lexing(
         "'a' ' ' '\\n'",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: Literal { kind: Char { terminated: true }, suffix_start: 3 }, len: 3 }
             Token { kind: Whitespace, len: 1 }
@@ -224,6 +229,7 @@ fn characters()
 fn lifetime() {
     check_lexing(
         "'abc",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: Lifetime { starts_with_number: false }, len: 4 }
         "#]],
@@ -234,6 +240,7 @@ fn lifetime()
 fn raw_string() {
     check_lexing(
         "r###\"\"#a\\b\x00c\"\"###",
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: Literal { kind: RawStr { n_hashes: Some(3) }, suffix_start: 17 }, len: 17 }
         "#]],
@@ -257,6 +264,7 @@ b"a"
 r###"raw"###suffix
 br###"raw"###suffix
 "####,
+        FrontmatterAllowed::No,
         expect![[r#"
             Token { kind: Whitespace, len: 1 }
             Token { kind: Literal { kind: Char { terminated: true }, suffix_start: 3 }, len: 3 }
@@ -286,3 +294,78 @@ br###"raw"###suffix
         "#]],
     )
 }
+
+#[test]
+fn frontmatter_allowed() {
+    check_lexing(
+        r#"
+---cargo
+[dependencies]
+clap = "4"
+---
+
+fn main() {}
+"#,
+        FrontmatterAllowed::Yes,
+        expect![[r#"
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Frontmatter { has_invalid_preceding_whitespace: false, invalid_infostring: false }, len: 38 }
+            Token { kind: Whitespace, len: 2 }
+            Token { kind: Ident, len: 2 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Ident, len: 4 }
+            Token { kind: OpenParen, len: 1 }
+            Token { kind: CloseParen, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: OpenBrace, len: 1 }
+            Token { kind: CloseBrace, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+        "#]],
+    )
+}
+
+#[test]
+fn frontmatter_disallowed() {
+    check_lexing(
+        r#"
+---cargo
+[dependencies]
+clap = "4"
+---
+
+fn main() {}
+"#,
+        FrontmatterAllowed::No,
+        expect![[r#"
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Ident, len: 5 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: OpenBracket, len: 1 }
+            Token { kind: Ident, len: 12 }
+            Token { kind: CloseBracket, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Ident, len: 4 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Eq, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Literal { kind: Str { terminated: true }, suffix_start: 3 }, len: 3 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Minus, len: 1 }
+            Token { kind: Whitespace, len: 2 }
+            Token { kind: Ident, len: 2 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: Ident, len: 4 }
+            Token { kind: OpenParen, len: 1 }
+            Token { kind: CloseParen, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+            Token { kind: OpenBrace, len: 1 }
+            Token { kind: CloseBrace, len: 1 }
+            Token { kind: Whitespace, len: 1 }
+        "#]],
+    )
+}
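
As a sanity check on the expected `frontmatter_allowed` output: the single `Frontmatter` token spans the whole block, `---cargo\n` (9 bytes) + `[dependencies]\n` (15) + `clap = "4"\n` (11) + `---` (3) = 38 bytes, matching `len: 38`; the raw string's leading newline is the preceding `Whitespace` token of `len: 1`. In `frontmatter_disallowed`, the identical input decomposes into `Minus`, `Ident`, `OpenBracket`, and string-literal tokens because no `Frontmatter` token is ever produced.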

src/tools/clippy/clippy_lints/src/undocumented_unsafe_blocks.rs

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ use clippy_utils::visitors::{Descend, for_each_expr};
 use hir::HirId;
 use rustc_hir as hir;
 use rustc_hir::{Block, BlockCheckMode, ItemKind, Node, UnsafeSource};
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{LateContext, LateLintPass, LintContext};
 use rustc_session::impl_lint_pass;
 use rustc_span::{BytePos, Pos, RelativeBytePos, Span, SyntaxContext};
@@ -746,7 +746,7 @@ fn text_has_safety_comment(src: &str, line_starts: &[RelativeBytePos], start_pos
     loop {
         if line.starts_with("/*") {
             let src = &src[line_start..line_starts.last().unwrap().to_usize()];
-            let mut tokens = tokenize(src);
+            let mut tokens = tokenize(src, FrontmatterAllowed::No);
             return (src[..tokens.next().unwrap().len as usize]
                 .to_ascii_uppercase()
                 .contains("SAFETY:")
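
The same mechanical pattern repeats in each of the remaining clippy and rust-analyzer hunks below: every call site that tokenizes a sub-slice of a file (a snippet, a span's source text, text after an offset) now passes `FrontmatterAllowed::No` explicitly, which matches the behavior `tokenize` had before it grew the parameter, so these changes are behavior-preserving.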

src/tools/clippy/clippy_lints/src/utils/format_args_collector.rs

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@ use clippy_utils::source::SpanRangeExt;
 use itertools::Itertools;
 use rustc_ast::{Crate, Expr, ExprKind, FormatArgs};
 use rustc_data_structures::fx::FxHashMap;
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{EarlyContext, EarlyLintPass};
 use rustc_session::impl_lint_pass;
 use rustc_span::{Span, hygiene};
@@ -82,7 +82,7 @@ fn has_span_from_proc_macro(cx: &EarlyContext<'_>, args: &FormatArgs) -> bool {
         .all(|sp| {
             sp.check_source_text(cx, |src| {
                 // text should be either `, name` or `, name =`
-                let mut iter = tokenize(src).filter(|t| {
+                let mut iter = tokenize(src, FrontmatterAllowed::No).filter(|t| {
                     !matches!(
                         t.kind,
                         TokenKind::LineComment { .. } | TokenKind::BlockComment { .. } | TokenKind::Whitespace

src/tools/clippy/clippy_utils/src/consts.rs

Lines changed: 4 additions & 8 deletions
@@ -15,7 +15,7 @@ use rustc_hir::def::{DefKind, Res};
 use rustc_hir::{
     BinOpKind, Block, ConstBlock, Expr, ExprKind, HirId, Item, ItemKind, Node, PatExpr, PatExprKind, QPath, UnOp,
 };
-use rustc_lexer::tokenize;
+use rustc_lexer::{FrontmatterAllowed, tokenize};
 use rustc_lint::LateContext;
 use rustc_middle::mir::ConstValue;
 use rustc_middle::mir::interpret::{Scalar, alloc_range};
@@ -304,9 +304,7 @@ pub fn lit_to_mir_constant<'tcx>(lit: &LitKind, ty: Option<Ty<'tcx>>) -> Constan
     match *lit {
         LitKind::Str(ref is, _) => Constant::Str(is.to_string()),
         LitKind::Byte(b) => Constant::Int(u128::from(b)),
-        LitKind::ByteStr(ref s, _) | LitKind::CStr(ref s, _) => {
-            Constant::Binary(s.as_byte_str().to_vec())
-        }
+        LitKind::ByteStr(ref s, _) | LitKind::CStr(ref s, _) => Constant::Binary(s.as_byte_str().to_vec()),
         LitKind::Char(c) => Constant::Char(c),
         LitKind::Int(n, _) => Constant::Int(n.get()),
         LitKind::Float(ref is, LitFloatType::Suffixed(fty)) => match fty {
@@ -568,9 +566,7 @@ impl<'tcx> ConstEvalCtxt<'tcx> {
         } else {
             match &lit.node {
                 LitKind::Str(is, _) => Some(is.is_empty()),
-                LitKind::ByteStr(s, _) | LitKind::CStr(s, _) => {
-                    Some(s.as_byte_str().is_empty())
-                }
+                LitKind::ByteStr(s, _) | LitKind::CStr(s, _) => Some(s.as_byte_str().is_empty()),
                 _ => None,
             }
         }
@@ -715,7 +711,7 @@ impl<'tcx> ConstEvalCtxt<'tcx> {
             && let Some(src) = src.as_str()
         {
             use rustc_lexer::TokenKind::{BlockComment, LineComment, OpenBrace, Semi, Whitespace};
-            if !tokenize(src)
+            if !tokenize(src, FrontmatterAllowed::No)
                 .map(|t| t.kind)
                 .filter(|t| !matches!(t, Whitespace | LineComment { .. } | BlockComment { .. } | Semi))
                 .eq([OpenBrace])

src/tools/clippy/clippy_utils/src/hir_utils.rs

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@ use rustc_hir::{
     Pat, PatExpr, PatExprKind, PatField, PatKind, Path, PathSegment, PrimTy, QPath, Stmt, StmtKind, StructTailExpr,
     TraitBoundModifiers, Ty, TyKind, TyPat, TyPatKind,
 };
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::LateContext;
 use rustc_middle::ty::TypeckResults;
 use rustc_span::{BytePos, ExpnKind, MacroKind, Symbol, SyntaxContext, sym};
@@ -686,7 +686,7 @@ fn reduce_exprkind<'hir>(cx: &LateContext<'_>, kind: &'hir ExprKind<'hir>) -> &'
         // `{}` => `()`
         ([], None)
             if block.span.check_source_text(cx, |src| {
-                tokenize(src)
+                tokenize(src, FrontmatterAllowed::No)
                     .map(|t| t.kind)
                     .filter(|t| {
                         !matches!(

src/tools/clippy/clippy_utils/src/lib.rs

Lines changed: 3 additions & 3 deletions
@@ -106,7 +106,7 @@ use rustc_hir::{
     Param, Pat, PatExpr, PatExprKind, PatKind, Path, PathSegment, QPath, Stmt, StmtKind, TraitFn, TraitItem,
     TraitItemKind, TraitRef, TyKind, UnOp, def,
 };
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{LateContext, Level, Lint, LintContext};
 use rustc_middle::hir::nested_filter;
 use rustc_middle::hir::place::PlaceBase;
@@ -2764,7 +2764,7 @@ pub fn expr_use_ctxt<'tcx>(cx: &LateContext<'tcx>, e: &Expr<'tcx>) -> ExprUseCtx
 /// Tokenizes the input while keeping the text associated with each token.
 pub fn tokenize_with_text(s: &str) -> impl Iterator<Item = (TokenKind, &str, InnerSpan)> {
     let mut pos = 0;
-    tokenize(s).map(move |t| {
+    tokenize(s, FrontmatterAllowed::No).map(move |t| {
         let end = pos + t.len;
         let range = pos as usize..end as usize;
         let inner = InnerSpan::new(range.start, range.end);
@@ -2779,7 +2779,7 @@ pub fn span_contains_comment(sm: &SourceMap, span: Span) -> bool {
     let Ok(snippet) = sm.span_to_snippet(span) else {
         return false;
     };
-    return tokenize(&snippet).any(|token| {
+    return tokenize(&snippet, FrontmatterAllowed::No).any(|token| {
         matches!(
             token.kind,
             TokenKind::BlockComment { .. } | TokenKind::LineComment { .. }

src/tools/clippy/clippy_utils/src/source.rs

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,7 @@ use std::sync::Arc;
 use rustc_ast::{LitKind, StrStyle};
 use rustc_errors::Applicability;
 use rustc_hir::{BlockCheckMode, Expr, ExprKind, UnsafeSource};
-use rustc_lexer::{LiteralKind, TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, LiteralKind, TokenKind, tokenize};
 use rustc_lint::{EarlyContext, LateContext};
 use rustc_middle::ty::TyCtxt;
 use rustc_session::Session;
@@ -277,7 +277,7 @@ fn map_range(
 }
 
 fn ends_with_line_comment_or_broken(text: &str) -> bool {
-    let Some(last) = tokenize(text).last() else {
+    let Some(last) = tokenize(text, FrontmatterAllowed::No).last() else {
         return false;
     };
     match last.kind {
@@ -310,7 +310,8 @@ fn with_leading_whitespace_inner(lines: &[RelativeBytePos], src: &str, range: Ra
         && ends_with_line_comment_or_broken(&start[prev_start..])
         && let next_line = lines.partition_point(|&pos| pos.to_usize() < range.end)
        && let next_start = lines.get(next_line).map_or(src.len(), |&x| x.to_usize())
-        && tokenize(src.get(range.end..next_start)?).any(|t| !matches!(t.kind, TokenKind::Whitespace))
+        && tokenize(src.get(range.end..next_start)?, FrontmatterAllowed::No)
+            .any(|t| !matches!(t.kind, TokenKind::Whitespace))
     {
         Some(range.start)
     } else {

src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

Lines changed: 6 additions & 4 deletions
@@ -11,8 +11,8 @@
 use std::ops;
 
 use rustc_literal_escaper::{
-    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
-    unescape_str,
+    unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str, EscapeError,
+    Mode,
 };
 
 use crate::{
@@ -44,7 +44,9 @@ impl<'a> LexedStr<'a> {
 
         // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
        // but we want to split it to two in edition <2024.
-        while let Some(token) = rustc_lexer::tokenize(&text[conv.offset..]).next() {
+        while let Some(token) =
+            rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
+        {
             let token_text = &text[conv.offset..][..token.len as usize];
 
             conv.extend_token(&token.kind, token_text);
@@ -58,7 +60,7 @@ impl<'a> LexedStr<'a> {
             return None;
         }
 
-        let token = rustc_lexer::tokenize(text).next()?;
+        let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
         if token.len as usize != text.len() {
             return None;
         }
