Commit 0ce1811

feat(lexer): Allow including frontmatter with 'tokenize'
1 parent 425cd0f commit 0ce1811

File tree

11 files changed: +45 -37 lines


compiler/rustc_ast_pretty/src/pprust/state.rs

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ fn gather_comments(sm: &SourceMap, path: FileName, src: String) -> Vec<Comment>
         pos += shebang_len;
     }
 
-    for token in rustc_lexer::tokenize(&text[pos..]) {
+    for token in rustc_lexer::tokenize(&text[pos..], rustc_lexer::FrontmatterAllowed::No) {
         let token_text = &text[pos..pos + token.len as usize];
         match token.kind {
             rustc_lexer::TokenKind::Whitespace => {

compiler/rustc_lexer/src/lib.rs

Lines changed: 19 additions & 10 deletions

@@ -273,14 +273,15 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     if let Some(input_tail) = input.strip_prefix("#!") {
         // Ok, this is a shebang but if the next non-whitespace token is `[`,
         // then it may be valid Rust code, so consider it Rust code.
-        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
-            !matches!(
-                tok,
-                TokenKind::Whitespace
-                    | TokenKind::LineComment { doc_style: None }
-                    | TokenKind::BlockComment { doc_style: None, .. }
-            )
-        });
+        let next_non_whitespace_token =
+            tokenize(input_tail, FrontmatterAllowed::No).map(|tok| tok.kind).find(|tok| {
+                !matches!(
+                    tok,
+                    TokenKind::Whitespace
+                        | TokenKind::LineComment { doc_style: None }
+                        | TokenKind::BlockComment { doc_style: None, .. }
+                )
+            });
         if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
             // No other choice than to consider this a shebang.
             return Some(2 + input_tail.lines().next().unwrap_or_default().len());

@@ -303,8 +304,16 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 }
 
 /// Creates an iterator that produces tokens from the input string.
-pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
-    let mut cursor = Cursor::new(input, FrontmatterAllowed::No);
+///
+/// When parsing a full Rust document,
+/// first [`strip_shebang`] and then allow frontmatters with [`FrontmatterAllowed::Yes`].
+///
+/// When tokenizing a slice of a document, be sure to disallow frontmatters with [`FrontmatterAllowed::No`]
+pub fn tokenize(
+    input: &str,
+    frontmatter_allowed: FrontmatterAllowed,
+) -> impl Iterator<Item = Token> {
+    let mut cursor = Cursor::new(input, frontmatter_allowed);
     std::iter::from_fn(move || {
         let token = cursor.advance_token();
         if token.kind != TokenKind::Eof { Some(token) } else { None }
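
Per the new doc comment, callers lexing a whole document should run strip_shebang first and then pass FrontmatterAllowed::Yes, while callers lexing a slice of a document should pass FrontmatterAllowed::No. A minimal sketch of both call patterns against the new signature (the helper names lex_document and count_tokens_in_slice are illustrative, not part of this commit):

    use rustc_lexer::{strip_shebang, tokenize, FrontmatterAllowed, TokenKind};

    // Lex a complete Rust source file: strip the shebang first, then allow frontmatter.
    fn lex_document(src: &str) {
        let start = strip_shebang(src).unwrap_or(0);
        for token in tokenize(&src[start..], FrontmatterAllowed::Yes) {
            // `token.len` is the token's length in bytes; `token.kind` classifies it.
            if token.kind != TokenKind::Whitespace {
                // ... handle the token ...
            }
        }
    }

    // Lex a slice taken from the middle of a document: frontmatter disallowed.
    fn count_tokens_in_slice(src: &str) -> usize {
        tokenize(src, FrontmatterAllowed::No).count()
    }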

src/tools/clippy/clippy_lints/src/undocumented_unsafe_blocks.rs

Lines changed: 2 additions & 2 deletions

@@ -9,7 +9,7 @@ use clippy_utils::visitors::{Descend, for_each_expr};
 use hir::HirId;
 use rustc_hir as hir;
 use rustc_hir::{Block, BlockCheckMode, ItemKind, Node, UnsafeSource};
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{LateContext, LateLintPass, LintContext};
 use rustc_session::impl_lint_pass;
 use rustc_span::{BytePos, Pos, RelativeBytePos, Span, SyntaxContext};

@@ -746,7 +746,7 @@ fn text_has_safety_comment(src: &str, line_starts: &[RelativeBytePos], start_pos
     loop {
         if line.starts_with("/*") {
             let src = &src[line_start..line_starts.last().unwrap().to_usize()];
-            let mut tokens = tokenize(src);
+            let mut tokens = tokenize(src, FrontmatterAllowed::No);
             return (src[..tokens.next().unwrap().len as usize]
                 .to_ascii_uppercase()
                 .contains("SAFETY:")

src/tools/clippy/clippy_lints/src/utils/format_args_collector.rs

Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@ use clippy_utils::source::SpanRangeExt;
 use itertools::Itertools;
 use rustc_ast::{Crate, Expr, ExprKind, FormatArgs};
 use rustc_data_structures::fx::FxHashMap;
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{EarlyContext, EarlyLintPass};
 use rustc_session::impl_lint_pass;
 use rustc_span::{Span, hygiene};

@@ -82,7 +82,7 @@ fn has_span_from_proc_macro(cx: &EarlyContext<'_>, args: &FormatArgs) -> bool {
         .all(|sp| {
             sp.check_source_text(cx, |src| {
                 // text should be either `, name` or `, name =`
-                let mut iter = tokenize(src).filter(|t| {
+                let mut iter = tokenize(src, FrontmatterAllowed::No).filter(|t| {
                     !matches!(
                         t.kind,
                         TokenKind::LineComment { .. } | TokenKind::BlockComment { .. } | TokenKind::Whitespace

src/tools/clippy/clippy_utils/src/consts.rs

Lines changed: 4 additions & 8 deletions

@@ -15,7 +15,7 @@ use rustc_hir::def::{DefKind, Res};
 use rustc_hir::{
     BinOpKind, Block, ConstBlock, Expr, ExprKind, HirId, Item, ItemKind, Node, PatExpr, PatExprKind, QPath, UnOp,
 };
-use rustc_lexer::tokenize;
+use rustc_lexer::{FrontmatterAllowed, tokenize};
 use rustc_lint::LateContext;
 use rustc_middle::mir::ConstValue;
 use rustc_middle::mir::interpret::{Scalar, alloc_range};

@@ -304,9 +304,7 @@ pub fn lit_to_mir_constant<'tcx>(lit: &LitKind, ty: Option<Ty<'tcx>>) -> Constan
     match *lit {
         LitKind::Str(ref is, _) => Constant::Str(is.to_string()),
         LitKind::Byte(b) => Constant::Int(u128::from(b)),
-        LitKind::ByteStr(ref s, _) | LitKind::CStr(ref s, _) => {
-            Constant::Binary(s.as_byte_str().to_vec())
-        }
+        LitKind::ByteStr(ref s, _) | LitKind::CStr(ref s, _) => Constant::Binary(s.as_byte_str().to_vec()),
         LitKind::Char(c) => Constant::Char(c),
         LitKind::Int(n, _) => Constant::Int(n.get()),
         LitKind::Float(ref is, LitFloatType::Suffixed(fty)) => match fty {

@@ -568,9 +566,7 @@ impl<'tcx> ConstEvalCtxt<'tcx> {
         } else {
             match &lit.node {
                 LitKind::Str(is, _) => Some(is.is_empty()),
-                LitKind::ByteStr(s, _) | LitKind::CStr(s, _) => {
-                    Some(s.as_byte_str().is_empty())
-                }
+                LitKind::ByteStr(s, _) | LitKind::CStr(s, _) => Some(s.as_byte_str().is_empty()),
                 _ => None,
             }
         }

@@ -715,7 +711,7 @@ impl<'tcx> ConstEvalCtxt<'tcx> {
             && let Some(src) = src.as_str()
         {
             use rustc_lexer::TokenKind::{BlockComment, LineComment, OpenBrace, Semi, Whitespace};
-            if !tokenize(src)
+            if !tokenize(src, FrontmatterAllowed::No)
                 .map(|t| t.kind)
                 .filter(|t| !matches!(t, Whitespace | LineComment { .. } | BlockComment { .. } | Semi))
                 .eq([OpenBrace])

src/tools/clippy/clippy_utils/src/hir_utils.rs

Lines changed: 2 additions & 2 deletions

@@ -12,7 +12,7 @@ use rustc_hir::{
     Pat, PatExpr, PatExprKind, PatField, PatKind, Path, PathSegment, PrimTy, QPath, Stmt, StmtKind, StructTailExpr,
     TraitBoundModifiers, Ty, TyKind, TyPat, TyPatKind,
 };
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::LateContext;
 use rustc_middle::ty::TypeckResults;
 use rustc_span::{BytePos, ExpnKind, MacroKind, Symbol, SyntaxContext, sym};

@@ -686,7 +686,7 @@ fn reduce_exprkind<'hir>(cx: &LateContext<'_>, kind: &'hir ExprKind<'hir>) -> &'
         // `{}` => `()`
         ([], None)
             if block.span.check_source_text(cx, |src| {
-                tokenize(src)
+                tokenize(src, FrontmatterAllowed::No)
                     .map(|t| t.kind)
                     .filter(|t| {
                         !matches!(

src/tools/clippy/clippy_utils/src/lib.rs

Lines changed: 3 additions & 3 deletions

@@ -106,7 +106,7 @@ use rustc_hir::{
     Param, Pat, PatExpr, PatExprKind, PatKind, Path, PathSegment, QPath, Stmt, StmtKind, TraitFn, TraitItem,
     TraitItemKind, TraitRef, TyKind, UnOp, def,
 };
-use rustc_lexer::{TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, TokenKind, tokenize};
 use rustc_lint::{LateContext, Level, Lint, LintContext};
 use rustc_middle::hir::nested_filter;
 use rustc_middle::hir::place::PlaceBase;

@@ -2764,7 +2764,7 @@ pub fn expr_use_ctxt<'tcx>(cx: &LateContext<'tcx>, e: &Expr<'tcx>) -> ExprUseCtx
 /// Tokenizes the input while keeping the text associated with each token.
 pub fn tokenize_with_text(s: &str) -> impl Iterator<Item = (TokenKind, &str, InnerSpan)> {
     let mut pos = 0;
-    tokenize(s).map(move |t| {
+    tokenize(s, FrontmatterAllowed::No).map(move |t| {
         let end = pos + t.len;
         let range = pos as usize..end as usize;
         let inner = InnerSpan::new(range.start, range.end);

@@ -2779,7 +2779,7 @@ pub fn span_contains_comment(sm: &SourceMap, span: Span) -> bool {
     let Ok(snippet) = sm.span_to_snippet(span) else {
         return false;
     };
-    return tokenize(&snippet).any(|token| {
+    return tokenize(&snippet, FrontmatterAllowed::No).any(|token| {
         matches!(
             token.kind,
             TokenKind::BlockComment { .. } | TokenKind::LineComment { .. }
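
The tokenize_with_text helper above pairs each token kind with its source text and byte span. A small usage sketch (illustrative only, not part of this commit), dropping whitespace and comments the same way the call sites in this commit do:

    // Keep only the text of non-trivia tokens.
    let parts: Vec<&str> = tokenize_with_text("let x = 1; // note")
        .filter(|(kind, ..)| {
            !matches!(
                kind,
                TokenKind::Whitespace | TokenKind::LineComment { .. } | TokenKind::BlockComment { .. }
            )
        })
        .map(|(_, text, _)| text)
        .collect();
    assert_eq!(parts, ["let", "x", "=", "1", ";"]);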

src/tools/clippy/clippy_utils/src/source.rs

Lines changed: 4 additions & 3 deletions

@@ -7,7 +7,7 @@ use std::sync::Arc;
 use rustc_ast::{LitKind, StrStyle};
 use rustc_errors::Applicability;
 use rustc_hir::{BlockCheckMode, Expr, ExprKind, UnsafeSource};
-use rustc_lexer::{LiteralKind, TokenKind, tokenize};
+use rustc_lexer::{FrontmatterAllowed, LiteralKind, TokenKind, tokenize};
 use rustc_lint::{EarlyContext, LateContext};
 use rustc_middle::ty::TyCtxt;
 use rustc_session::Session;

@@ -277,7 +277,7 @@ fn map_range(
 }
 
 fn ends_with_line_comment_or_broken(text: &str) -> bool {
-    let Some(last) = tokenize(text).last() else {
+    let Some(last) = tokenize(text, FrontmatterAllowed::No).last() else {
         return false;
     };
     match last.kind {

@@ -310,7 +310,8 @@ fn with_leading_whitespace_inner(lines: &[RelativeBytePos], src: &str, range: Ra
         && ends_with_line_comment_or_broken(&start[prev_start..])
         && let next_line = lines.partition_point(|&pos| pos.to_usize() < range.end)
         && let next_start = lines.get(next_line).map_or(src.len(), |&x| x.to_usize())
-        && tokenize(src.get(range.end..next_start)?).any(|t| !matches!(t.kind, TokenKind::Whitespace))
+        && tokenize(src.get(range.end..next_start)?, FrontmatterAllowed::No)
+            .any(|t| !matches!(t.kind, TokenKind::Whitespace))
     {
         Some(range.start)
     } else {

src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

Lines changed: 6 additions & 4 deletions

@@ -11,8 +11,8 @@
 use std::ops;
 
 use rustc_literal_escaper::{
-    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
-    unescape_str,
+    unescape_byte, unescape_byte_str, unescape_c_str, unescape_char, unescape_str, EscapeError,
+    Mode,
 };
 
 use crate::{

@@ -44,7 +44,9 @@ impl<'a> LexedStr<'a> {
 
         // Re-create the tokenizer from scratch every token because `GuardedStrPrefix` is one token in the lexer
        // but we want to split it to two in edition <2024.
-        while let Some(token) = rustc_lexer::tokenize(&text[conv.offset..]).next() {
+        while let Some(token) =
+            rustc_lexer::tokenize(&text[conv.offset..], rustc_lexer::FrontmatterAllowed::No).next()
+        {
             let token_text = &text[conv.offset..][..token.len as usize];
 
             conv.extend_token(&token.kind, token_text);

@@ -58,7 +60,7 @@ impl<'a> LexedStr<'a> {
             return None;
         }
 
-        let token = rustc_lexer::tokenize(text).next()?;
+        let token = rustc_lexer::tokenize(text, rustc_lexer::FrontmatterAllowed::No).next()?;
         if token.len as usize != text.len() {
             return None;
         }

src/tools/rust-analyzer/crates/proc-macro-srv/src/server_impl.rs

Lines changed: 1 addition & 1 deletion

@@ -121,7 +121,7 @@ pub(super) fn literal_from_str<Span: Copy>(
     use proc_macro::bridge::LitKind;
     use rustc_lexer::{LiteralKind, Token, TokenKind};
 
-    let mut tokens = rustc_lexer::tokenize(s);
+    let mut tokens = rustc_lexer::tokenize(s, rustc_lexer::FrontmatterAllowed::No);
     let minus_or_lit = tokens.next().unwrap_or(Token { kind: TokenKind::Eof, len: 0 });
 
     let lit = if minus_or_lit.kind == TokenKind::Minus {
