diff --git a/src/cargo/util/toml/embedded.rs b/src/cargo/util/toml/embedded.rs index 6ed4858b27d..70638254116 100644 --- a/src/cargo/util/toml/embedded.rs +++ b/src/cargo/util/toml/embedded.rs @@ -78,59 +78,74 @@ impl<'s> ScriptSource<'s> { source.content = content; } - const FENCE_CHAR: char = '-'; - let mut rest = source.content; - while !rest.is_empty() { - let without_spaces = rest.trim_start_matches([' ', '\t']); - let without_nl = without_spaces.trim_start_matches(['\r', '\n']); - if without_nl == rest { - // nothing trimmed - break; - } else if without_nl == without_spaces { - // frontmatter must come after a newline + + // Whitespace may precede a frontmatter but must end with a newline + let trimmed = rest.trim_start_matches(is_whitespace); + if trimmed.len() != rest.len() { + let trimmed_len = rest.len() - trimmed.len(); + let last_trimmed_index = trimmed_len - 1; + if rest.as_bytes()[last_trimmed_index] != b'\n' { + // either not a frontmatter or invalid opening return Ok(source); } - rest = without_nl; } - let fence_end = rest + rest = trimmed; + + // Opens with a line that starts with 3 or more `-` followed by an optional identifier + const FENCE_CHAR: char = '-'; + let fence_length = rest .char_indices() .find_map(|(i, c)| (c != FENCE_CHAR).then_some(i)) - .unwrap_or(source.content.len()); - let (fence_pattern, rest) = match fence_end { + .unwrap_or(rest.len()); + match fence_length { 0 => { return Ok(source); } 1 | 2 => { + // either not a frontmatter or invalid frontmatter opening anyhow::bail!( - "found {fence_end} `{FENCE_CHAR}` in rust frontmatter, expected at least 3" + "found {fence_length} `{FENCE_CHAR}` in rust frontmatter, expected at least 3" ) } - _ => rest.split_at(fence_end), + _ => {} + } + let (fence_pattern, rest) = rest.split_at(fence_length); + let Some(info_end_index) = rest.find('\n') else { + anyhow::bail!("no closing `{fence_pattern}` found for frontmatter"); }; - let nl_fence_pattern = format!("\n{fence_pattern}"); - let (info, content) = rest.split_once("\n").unwrap_or((rest, "")); - let info = info.trim(); + let (info, rest) = rest.split_at(info_end_index); + let info = info.trim_matches(is_whitespace); if !info.is_empty() { source.info = Some(info); } - source.content = content; - let Some(frontmatter_nl) = source.content.find(&nl_fence_pattern) else { + // Ends with a line that starts with a matching number of `-` only followed by whitespace + let nl_fence_pattern = format!("\n{fence_pattern}"); + let Some(frontmatter_nl) = rest.find(&nl_fence_pattern) else { anyhow::bail!("no closing `{fence_pattern}` found for frontmatter"); }; - source.frontmatter = Some(&source.content[..frontmatter_nl + 1]); - source.content = &source.content[frontmatter_nl + nl_fence_pattern.len()..]; - - let (line, content) = source - .content - .split_once("\n") - .unwrap_or((source.content, "")); - let line = line.trim(); - if !line.is_empty() { - anyhow::bail!("unexpected trailing content on closing fence: `{line}`"); + let frontmatter = &rest[..frontmatter_nl + 1]; + let frontmatter = frontmatter + .strip_prefix('\n') + .expect("earlier `found` + `split_at` left us here"); + source.frontmatter = Some(frontmatter); + let rest = &rest[frontmatter_nl + nl_fence_pattern.len()..]; + + let (after_closing_fence, rest) = rest.split_once("\n").unwrap_or((rest, "")); + let after_closing_fence = after_closing_fence.trim_matches(is_whitespace); + if !after_closing_fence.is_empty() { + // extra characters beyond the original fence pattern, even if they are extra `-` + anyhow::bail!("trailing characters found after frontmatter close"); + } + + let frontmatter_len = input.len() - rest.len(); + source.content = &input[frontmatter_len..]; + + let repeat = Self::parse(source.content)?; + if repeat.frontmatter.is_some() { + anyhow::bail!("only one frontmatter is supported"); } - source.content = content; Ok(source) } @@ -172,6 +187,40 @@ fn strip_shebang(input: &str) -> Option { None } +/// True if `c` is considered a whitespace according to Rust language definition. +/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html) +/// for definitions of these classes. +/// +/// See rust-lang/rust's compiler/rustc_lexer/src/lib.rs `is_whitespace` +fn is_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + matches!( + c, + // Usual ASCII suspects + '\u{0009}' // \t + | '\u{000A}' // \n + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + | '\u{000D}' // \r + | '\u{0020}' // space + + // NEXT LINE from latin1 + | '\u{0085}' + + // Bidi markers + | '\u{200E}' // LEFT-TO-RIGHT MARK + | '\u{200F}' // RIGHT-TO-LEFT MARK + + // Dedicated whitespace characters from Unicode + | '\u{2028}' // LINE SEPARATOR + | '\u{2029}' // PARAGRAPH SEPARATOR + ) +} + #[cfg(test)] mod test_expand { use snapbox::assert_data_eq; @@ -216,6 +265,466 @@ mod test_expand { } } + #[test] + fn rustc_dot_in_infostring_leading() { + // We don't validate infostrings besides `info == "cargo"` + assert_source( + r#"---.toml +//~^ ERROR: invalid infostring for frontmatter +--- + +// infostrings cannot have leading dots + +fn main() {} +"#, + str![[r#" +shebang: None +info: ".toml" +frontmatter: "//~^ ERROR: invalid infostring for frontmatter\n" +content: "\n// infostrings cannot have leading dots\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_dot_in_infostring_non_leading() { + assert_source( + r#"---Cargo.toml +--- + +// infostrings can contain dots as long as a dot isn't the first character. +//@ check-pass + +fn main() {} +"#, + str![[r#" +shebang: None +info: "Cargo.toml" +frontmatter: "" +content: "\n// infostrings can contain dots as long as a dot isn't the first character.\n//@ check-pass\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_escape() { + assert_source( + r#"---- + +--- + +---- + +//@ check-pass + +// This test checks that longer dashes for opening and closing can be used to +// escape sequences such as three dashes inside the frontmatter block. + +fn main() {} +"#, + str![[r#" +shebang: None +info: None +frontmatter: "\n---\n\n" +content: "\n//@ check-pass\n\n// This test checks that longer dashes for opening and closing can be used to\n// escape sequences such as three dashes inside the frontmatter block.\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_extra_after_end() { + assert_err( + ScriptSource::parse( + r#"--- +---cargo +//~^ ERROR: extra characters after frontmatter close are not allowed + +fn main() {} +"#, + ), + str!["trailing characters found after frontmatter close"], + ); + } + + #[test] + fn rustc_frontmatter_after_tokens() { + // Deferred to rustc since this requires knowledge of Rust grammar + assert_source( + r#"#![feature(frontmatter)] + +--- +//~^ ERROR: expected item, found `-` +// FIXME(frontmatter): make this diagnostic better +--- + +// frontmatters must be at the start of a file. This test ensures that. + +fn main() {} +"#, + str![[r##" +shebang: None +info: None +frontmatter: None +content: "#![feature(frontmatter)]\n\n---\n//~^ ERROR: expected item, found `-`\n// FIXME(frontmatter): make this diagnostic better\n---\n\n// frontmatters must be at the start of a file. This test ensures that.\n\nfn main() {}\n" + +"##]], + ); + } + + #[test] + fn rustc_frontmatter_non_lexible_tokens() { + assert_source( + r#"---uwu +🏳️‍⚧️ +--- + +//@ check-pass + +// check that frontmatter blocks can have tokens that are otherwise not accepted by +// the lexer as Rust code. + +fn main() {} +"#, + str![[r#" +shebang: None +info: "uwu" +frontmatter: "🏳\u{fe0f}\u{200d}⚧\u{fe0f}\n" +content: "\n//@ check-pass\n\n// check that frontmatter blocks can have tokens that are otherwise not accepted by\n// the lexer as Rust code.\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_frontmatter_whitespace_1() { + // Deferred to rustc since this requires knowledge of Rust grammar + assert_source( + r#" --- +//~^ ERROR: invalid preceding whitespace for frontmatter opening + --- +//~^ ERROR: invalid preceding whitespace for frontmatter close + +// check that whitespaces should not precede the frontmatter opening or close. + +fn main() {} +"#, + str![[r#" +shebang: None +info: None +frontmatter: None +content: " ---\n//~^ ERROR: invalid preceding whitespace for frontmatter opening\n ---\n//~^ ERROR: invalid preceding whitespace for frontmatter close\n\n// check that whitespaces should not precede the frontmatter opening or close.\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_frontmatter_whitespace_2() { + assert_err( + ScriptSource::parse( + r#"---cargo + +//@ compile-flags: --crate-type lib + +fn foo(x: i32) -> i32 { + ---x + //~^ ERROR: invalid preceding whitespace for frontmatter close + //~| ERROR: extra characters after frontmatter close are not allowed +} +//~^ ERROR: unexpected closing delimiter: `}` + +// this test is for the weird case that valid Rust code can have three dashes +// within them and get treated as a frontmatter close. +"#, + ), + str!["no closing `---` found for frontmatter"], + ); + } + + #[test] + fn rustc_frontmatter_whitespace_3() { + assert_source( + r#" + + +---cargo +--- + +// please note the whitespace characters after the first four lines. +// This ensures that we accept whitespaces before the frontmatter, after +// the frontmatter opening and the frontmatter close. + +//@ check-pass +// ignore-tidy-end-whitespace +// ignore-tidy-leading-newlines + +fn main() {} +"#, + str![[r#" +shebang: None +info: "cargo" +frontmatter: "" +content: "\n// please note the whitespace characters after the first four lines.\n// This ensures that we accept whitespaces before the frontmatter, after\n// the frontmatter opening and the frontmatter close.\n\n//@ check-pass\n// ignore-tidy-end-whitespace\n// ignore-tidy-leading-newlines\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_frontmatter_whitespace_4() { + assert_source( + r#"--- cargo +--- + +//@ check-pass +// A frontmatter infostring can have leading whitespace. + +fn main() {} +"#, + str![[r#" +shebang: None +info: "cargo" +frontmatter: "" +content: "\n//@ check-pass\n// A frontmatter infostring can have leading whitespace.\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_infostring_fail() { + // We don't validate infostrings besides `info == "cargo"` + assert_source( + r#" +---cargo,clippy +//~^ ERROR: invalid infostring for frontmatter +--- + +// infostrings can only be a single identifier. + +fn main() {} +"#, + str![[r#" +shebang: None +info: "cargo,clippy" +frontmatter: "//~^ ERROR: invalid infostring for frontmatter\n" +content: "\n// infostrings can only be a single identifier.\n\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_mismatch_1() { + assert_err( + ScriptSource::parse( + r#"---cargo +//~^ ERROR: frontmatter close does not match the opening +---- + +// there must be the same number of dashes for both the opening and the close +// of the frontmatter. + +fn main() {} +"#, + ), + str!["trailing characters found after frontmatter close"], + ); + } + + #[test] + fn rustc_mismatch_2() { + assert_err( + ScriptSource::parse( + r#"----cargo +//~^ ERROR: frontmatter close does not match the opening +---cargo +//~^ ERROR: extra characters after frontmatter close are not allowed + +fn main() {} +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + + #[test] + fn rustc_multifrontmatter_2() { + // This should be valid, bug on rustc's side, see rust-lang/rust#141367 + assert_source( + r#"--- + --- +//~^ ERROR: invalid preceding whitespace for frontmatter close + + --- +//~^ ERROR: expected item, found `-` +// FIXME(frontmatter): make this diagnostic better +--- + +fn main() {} +"#, + str![[r#" +shebang: None +info: None +frontmatter: " ---\n//~^ ERROR: invalid preceding whitespace for frontmatter close\n\n ---\n//~^ ERROR: expected item, found `-`\n// FIXME(frontmatter): make this diagnostic better\n" +content: "\nfn main() {}\n" + +"#]], + ); + } + + #[test] + fn rustc_multifrontmatter() { + assert_err( + ScriptSource::parse( + r#"--- +--- + +--- +//~^ ERROR: expected item, found `-` +// FIXME(frontmatter): make this diagnostic better +--- + +// test that we do not parse another frontmatter block after the first one. + +fn main() {} +"#, + ), + str!["only one frontmatter is supported"], + ); + } + + #[test] + fn rustc_shebang() { + assert_source( + r#"#!/usr/bin/env -S cargo -Zscript +--- +[dependencies] +clap = "4" +--- + +//@ check-pass + +// Shebangs on a file can precede a frontmatter. + +fn main () {} +"#, + str![[r##" +shebang: "#!/usr/bin/env -S cargo -Zscript\n" +info: None +frontmatter: "[dependencies]\nclap = \"4\"\n" +content: "\n//@ check-pass\n\n// Shebangs on a file can precede a frontmatter.\n\nfn main () {}\n" + +"##]], + ); + } + + #[test] + fn rustc_unclosed_1() { + assert_err( + ScriptSource::parse( + r#"----cargo +//~^ ERROR: unclosed frontmatter + +// This test checks that the #! characters can help us recover a frontmatter +// close. There should not be a "missing `main` function" error as the rest +// are properly parsed. + +fn main() {} +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + + #[test] + fn rustc_unclosed_2() { + assert_err( + ScriptSource::parse( + r#"----cargo +//~^ ERROR: unclosed frontmatter +//~| ERROR: frontmatters are experimental + +//@ compile-flags: --crate-type lib + +// Leading whitespace on the feature line prevents recovery. However +// the dashes quoted will not be used for recovery and the entire file +// should be treated as within the frontmatter block. + +fn foo() -> &str { + "----" +} +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + + #[test] + fn rustc_unclosed_3() { + assert_err( + ScriptSource::parse( + r#"----cargo +//~^ ERROR: frontmatter close does not match the opening + +//@ compile-flags: --crate-type lib + +// Unfortunate recovery situation. Not really preventable with improving the +// recovery strategy, but this type of code is rare enough already. + +fn foo(x: i32) -> i32 { + ---x + //~^ ERROR: invalid preceding whitespace for frontmatter close + //~| ERROR: extra characters after frontmatter close are not allowed +} +//~^ ERROR: unexpected closing delimiter: `}` +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + + #[test] + fn rustc_unclosed_4() { + assert_err( + ScriptSource::parse( + r#" +----cargo +//~^ ERROR: unclosed frontmatter + +//! Similarly, a module-level content should allow for recovery as well (as +//! per unclosed-1.rs) + +fn main() {} +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + + #[test] + fn rustc_unclosed_5() { + assert_err( + ScriptSource::parse( + r#"----cargo +//~^ ERROR: unclosed frontmatter +//~| ERROR: frontmatters are experimental + +// Similarly, a use statement should allow for recovery as well (as +// per unclosed-1.rs) + +use std::env; + +fn main() {} +"#, + ), + str!["no closing `----` found for frontmatter"], + ); + } + #[test] fn split_default() { assert_source( @@ -458,7 +967,7 @@ content: "\nfn main() {}\n" fn main() {} "#, ), - str!["unexpected trailing content on closing fence: `--`"], + str!["trailing characters found after frontmatter close"], ); } @@ -495,7 +1004,7 @@ time="0.1.25" fn main() {} "#, ), - str!["unexpected trailing content on closing fence: `-`"], + str!["trailing characters found after frontmatter close"], ); }