Skip to content

Commit 095df3e

Browse files
committed
lex_via_rustc: updates for more recent rustc
rustc's high-level lexer now provides a public interface returning a TokenStream, so we now use that rather than making a Parser and pulling tokens from it one by one. (In any case, the previous approach no longer works, because Parser::token_spacing is no longer public.) See rust-lang/rust#125815 and rust-lang/rust#126052. Other rustc changes: ParseSess now provides a dcx() method rather than a public dcx field, and there are new NtIdent and NtLifetime TokenKinds, which (as I understand it) won't appear in token streams created by the lexer.
1 parent d26f07d commit 095df3e

File tree

3 files changed

+154
-110
lines changed

3 files changed

+154
-110
lines changed

rust-toolchain.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
[toolchain]
2-
channel = "nightly-2024-05-02"
2+
channel = "nightly-2024-07-29"
33
components = ["rustc-dev", "llvm-tools"]

src/lex_via_rustc.rs

Lines changed: 152 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
//! Runs rustc's lexical analysis.
22
//!
3-
//! This works by running the low-level and high-level lexers as far as making a `TokenTree`, then
4-
//! pulling tokens from it one by one in the same way as rustc's parser does. If rustc emits any
5-
//! error messages (or panics), we treat the input as rejected.
3+
//! This works by running the low-level and high-level lexers as far as making a `TokenStream`, then
4+
//! flattening the `TokenTree`s it contains back into a sequence of tokens in a similar way to
5+
//! rustc's parser.
6+
//! If rustc emits any error messages (or panics), we treat the input as rejected.
67
//!
78
//! Stringlike literal tokens are further run through ast::LitKind::from_token_lit(), to obtain the
89
//! "unescaped" value.
@@ -11,9 +12,8 @@
1112
//! (BOM-removal and CRLF-conversion) happen. Later shebang removal happens too. See the
1213
//! [`cleaning`][`crate::cleaning`] module for how we make equivalent input for comparison.
1314
//!
14-
//! One weakness of this approach is that, because it constructs a token tree, input with imbalanced
15-
//! delimiters is rejected. (I don't see a `pub` interface giving access to the stream before
16-
//! building the `TokenTree`.)
15+
//! A limitation of this approach is that, because it constructs token trees, input with imbalanced
16+
//! delimiters is rejected.
1717
1818
extern crate rustc_ast;
1919
extern crate rustc_data_structures;
@@ -25,17 +25,19 @@ extern crate rustc_session;
2525
extern crate rustc_span;
2626

2727
// This compiles with
28-
// rustc nightly from approximately 2024-05-02
28+
// rustc nightly from approximately 2024-07-29
2929

3030
use std::{
3131
mem,
3232
sync::{Arc, Mutex},
3333
};
3434

35-
use rustc_ast::token::TokenKind;
35+
use rustc_ast::{
36+
token::{Token, TokenKind},
37+
tokenstream::{TokenStream, TokenTree},
38+
};
3639
use rustc_data_structures::sync::Lrc;
3740
use rustc_errors::{DiagCtxt, LazyFallbackBundle};
38-
use rustc_parse::parser::Parser;
3941
use rustc_span::{
4042
source_map::{FilePathMapping, SourceMap},
4143
FileName,
@@ -211,16 +213,16 @@ pub enum Analysis {
211213
/// - if rustc would have reported a non-fatal error, at least one message has
212214
/// been added to error_list
213215
/// - in this case, the returned tokens are what would have been passed on to
214-
/// the parser (an empty list if tokentree construction failed).
216+
/// the parser (an empty list if token stream construction failed).
215217
fn run_lexer(input: &str, error_list: ErrorAccumulator) -> Vec<RustcToken> {
216218
let psess = make_parser_session(error_list.clone());
217219
let source_map = psess.source_map();
218220
let input = String::from(input);
219221
let filename = FileName::Custom("lex_via_rustc".into());
220-
let lexed = match rustc_parse::maybe_new_parser_from_source_str(&psess, filename, input) {
221-
Ok(parser) => tokens_from_parser(parser, source_map),
222+
let lexed = match rustc_parse::source_str_to_stream(&psess, filename, input, None) {
223+
Ok(token_stream) => TokenStreamProcessor::process(&token_stream, &source_map),
222224
Err(diags) => {
223-
// Errors constructing the tokentree are reported here
225+
// Errors constructing the token stream are reported here
224226
// (ie, unbalanced delimiters).
225227
assert!(!diags.is_empty());
226228
for diag in diags {
@@ -232,7 +234,7 @@ fn run_lexer(input: &str, error_list: ErrorAccumulator) -> Vec<RustcToken> {
232234
// The lexer doesn't report errors itself when it sees emoji in 'identifiers'. Instead it leaves
233235
// a note in the ParseSess to be examined later. So we have to make this extra check.
234236
if !&psess.bad_unicode_identifiers.borrow_mut().is_empty() {
235-
psess.dcx.err("bad unicode identifier(s)");
237+
psess.dcx().err("bad unicode identifier(s)");
236238
}
237239
lexed
238240
}
@@ -305,104 +307,146 @@ fn make_parser_session(error_list: ErrorAccumulator) -> rustc_session::parse::Pa
305307
rustc_session::parse::ParseSess::with_dcx(dcx, sm)
306308
}
307309

308-
fn tokens_from_parser(mut parser: Parser, source_map: &SourceMap) -> Vec<RustcToken> {
309-
let mut tokens = Vec::new();
310-
while parser.token.kind != TokenKind::Eof {
311-
let data = match parser.token.kind {
312-
TokenKind::DocComment(comment_kind, style, symbol) => RustcTokenData::DocComment {
313-
comment_kind: comment_kind.into(),
314-
style: style.into(),
315-
body: symbol.to_string(),
316-
},
317-
TokenKind::Eq => RustcTokenData::Punctuation,
318-
TokenKind::Lt => RustcTokenData::Punctuation,
319-
TokenKind::Le => RustcTokenData::Punctuation,
320-
TokenKind::EqEq => RustcTokenData::Punctuation,
321-
TokenKind::Ne => RustcTokenData::Punctuation,
322-
TokenKind::Ge => RustcTokenData::Punctuation,
323-
TokenKind::Gt => RustcTokenData::Punctuation,
324-
TokenKind::AndAnd => RustcTokenData::Punctuation,
325-
TokenKind::OrOr => RustcTokenData::Punctuation,
326-
TokenKind::Not => RustcTokenData::Punctuation,
327-
TokenKind::Tilde => RustcTokenData::Punctuation,
328-
TokenKind::BinOp(_) => RustcTokenData::Punctuation,
329-
TokenKind::BinOpEq(_) => RustcTokenData::Punctuation,
330-
TokenKind::At => RustcTokenData::Punctuation,
331-
TokenKind::Dot => RustcTokenData::Punctuation,
332-
TokenKind::DotDot => RustcTokenData::Punctuation,
333-
TokenKind::DotDotDot => RustcTokenData::Punctuation,
334-
TokenKind::DotDotEq => RustcTokenData::Punctuation,
335-
TokenKind::Comma => RustcTokenData::Punctuation,
336-
TokenKind::Semi => RustcTokenData::Punctuation,
337-
TokenKind::Colon => RustcTokenData::Punctuation,
338-
TokenKind::PathSep => RustcTokenData::Punctuation,
339-
TokenKind::RArrow => RustcTokenData::Punctuation,
340-
TokenKind::LArrow => RustcTokenData::Punctuation,
341-
TokenKind::FatArrow => RustcTokenData::Punctuation,
342-
TokenKind::Pound => RustcTokenData::Punctuation,
343-
TokenKind::Dollar => RustcTokenData::Punctuation,
344-
TokenKind::Question => RustcTokenData::Punctuation,
345-
TokenKind::SingleQuote => RustcTokenData::Punctuation,
346-
TokenKind::OpenDelim(_) => RustcTokenData::Punctuation,
347-
TokenKind::CloseDelim(_) => RustcTokenData::Punctuation,
348-
TokenKind::Ident(symbol, style) => RustcTokenData::Ident {
349-
style: style.into(),
350-
identifier: symbol.to_string(),
351-
},
352-
TokenKind::Lifetime(symbol) => RustcTokenData::Lifetime {
353-
symbol: symbol.to_string(),
354-
},
355-
TokenKind::Literal(rustc_ast::token::Lit {
356-
kind: rustc_ast::token::LitKind::Integer,
357-
suffix,
358-
..
359-
}) => RustcTokenData::Lit {
360-
literal_data: RustcLiteralData::Integer(
361-
suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
362-
),
363-
},
364-
TokenKind::Literal(rustc_ast::token::Lit {
365-
kind: rustc_ast::token::LitKind::Float,
366-
suffix,
367-
..
368-
}) => RustcTokenData::Lit {
369-
literal_data: RustcLiteralData::Float(
370-
suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
371-
),
372-
},
373-
TokenKind::Literal(lit) => {
374-
match lit.suffix {
375-
// from_token_lit() is what performs unescaping, but it will panic if it sees a
376-
// suffix
377-
None => {
378-
let ast_lit = rustc_ast::ast::LitKind::from_token_lit(lit)
379-
.expect("from_token_lit failed");
380-
RustcTokenData::Lit {
381-
literal_data: literal_data_from_ast_litkind(ast_lit),
382-
}
310+
/// Converts a rustc_ast `TokenStream` to a flat sequence of `RustcToken`s.
311+
struct TokenStreamProcessor<'a> {
312+
source_map: &'a SourceMap,
313+
output: Vec<RustcToken>,
314+
}
315+
316+
impl<'a> TokenStreamProcessor<'a> {
317+
fn process(token_stream: &TokenStream, source_map: &'a SourceMap) -> Vec<RustcToken> {
318+
let mut flattener = Self {
319+
source_map,
320+
output: Vec::new(),
321+
};
322+
flattener.add_tokens_from_stream(token_stream);
323+
flattener.output
324+
}
325+
326+
fn add_tokens_from_stream(&mut self, token_stream: &TokenStream) {
327+
for token_tree in token_stream.trees() {
328+
self.add_tokens_from_tree(token_tree);
329+
}
330+
}
331+
332+
fn add_tokens_from_tree(&mut self, token_tree: &TokenTree) {
333+
match token_tree {
334+
&TokenTree::Token(ref token, spacing) => {
335+
self.output
336+
.push(token_from_ast_token(token, spacing, self.source_map))
337+
}
338+
&TokenTree::Delimited(delim_span, delim_spacing, delimiter, ref token_stream) => {
339+
self.output.push(token_from_ast_token(
340+
&Token::new(TokenKind::OpenDelim(delimiter), delim_span.open),
341+
delim_spacing.open,
342+
self.source_map,
343+
));
344+
self.add_tokens_from_stream(token_stream);
345+
self.output.push(token_from_ast_token(
346+
&Token::new(TokenKind::CloseDelim(delimiter), delim_span.close),
347+
delim_spacing.close,
348+
self.source_map,
349+
));
350+
}
351+
}
352+
}
353+
}
354+
355+
fn token_from_ast_token(
356+
token: &Token,
357+
spacing: rustc_ast::tokenstream::Spacing,
358+
source_map: &SourceMap,
359+
) -> RustcToken {
360+
let data = match token.kind {
361+
TokenKind::DocComment(comment_kind, style, symbol) => RustcTokenData::DocComment {
362+
comment_kind: comment_kind.into(),
363+
style: style.into(),
364+
body: symbol.to_string(),
365+
},
366+
TokenKind::Eq => RustcTokenData::Punctuation,
367+
TokenKind::Lt => RustcTokenData::Punctuation,
368+
TokenKind::Le => RustcTokenData::Punctuation,
369+
TokenKind::EqEq => RustcTokenData::Punctuation,
370+
TokenKind::Ne => RustcTokenData::Punctuation,
371+
TokenKind::Ge => RustcTokenData::Punctuation,
372+
TokenKind::Gt => RustcTokenData::Punctuation,
373+
TokenKind::AndAnd => RustcTokenData::Punctuation,
374+
TokenKind::OrOr => RustcTokenData::Punctuation,
375+
TokenKind::Not => RustcTokenData::Punctuation,
376+
TokenKind::Tilde => RustcTokenData::Punctuation,
377+
TokenKind::BinOp(_) => RustcTokenData::Punctuation,
378+
TokenKind::BinOpEq(_) => RustcTokenData::Punctuation,
379+
TokenKind::At => RustcTokenData::Punctuation,
380+
TokenKind::Dot => RustcTokenData::Punctuation,
381+
TokenKind::DotDot => RustcTokenData::Punctuation,
382+
TokenKind::DotDotDot => RustcTokenData::Punctuation,
383+
TokenKind::DotDotEq => RustcTokenData::Punctuation,
384+
TokenKind::Comma => RustcTokenData::Punctuation,
385+
TokenKind::Semi => RustcTokenData::Punctuation,
386+
TokenKind::Colon => RustcTokenData::Punctuation,
387+
TokenKind::PathSep => RustcTokenData::Punctuation,
388+
TokenKind::RArrow => RustcTokenData::Punctuation,
389+
TokenKind::LArrow => RustcTokenData::Punctuation,
390+
TokenKind::FatArrow => RustcTokenData::Punctuation,
391+
TokenKind::Pound => RustcTokenData::Punctuation,
392+
TokenKind::Dollar => RustcTokenData::Punctuation,
393+
TokenKind::Question => RustcTokenData::Punctuation,
394+
TokenKind::SingleQuote => RustcTokenData::Punctuation,
395+
TokenKind::OpenDelim(_) => RustcTokenData::Punctuation,
396+
TokenKind::CloseDelim(_) => RustcTokenData::Punctuation,
397+
TokenKind::Ident(symbol, style) => RustcTokenData::Ident {
398+
style: style.into(),
399+
identifier: symbol.to_string(),
400+
},
401+
TokenKind::Lifetime(symbol) => RustcTokenData::Lifetime {
402+
symbol: symbol.to_string(),
403+
},
404+
TokenKind::Literal(rustc_ast::token::Lit {
405+
kind: rustc_ast::token::LitKind::Integer,
406+
suffix,
407+
..
408+
}) => RustcTokenData::Lit {
409+
literal_data: RustcLiteralData::Integer(
410+
suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
411+
),
412+
},
413+
TokenKind::Literal(rustc_ast::token::Lit {
414+
kind: rustc_ast::token::LitKind::Float,
415+
suffix,
416+
..
417+
}) => RustcTokenData::Lit {
418+
literal_data: RustcLiteralData::Float(
419+
suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
420+
),
421+
},
422+
TokenKind::Literal(lit) => {
423+
match lit.suffix {
424+
// from_token_lit() is what performs unescaping, but it will panic if it sees a
425+
// suffix
426+
None => {
427+
let ast_lit = rustc_ast::ast::LitKind::from_token_lit(lit)
428+
.expect("from_token_lit failed");
429+
RustcTokenData::Lit {
430+
literal_data: literal_data_from_ast_litkind(ast_lit),
383431
}
384-
Some(suffix) => RustcTokenData::Lit {
385-
literal_data: RustcLiteralData::ForbiddenSuffix(suffix.to_string()),
386-
},
387432
}
433+
Some(suffix) => RustcTokenData::Lit {
434+
literal_data: RustcLiteralData::ForbiddenSuffix(suffix.to_string()),
435+
},
388436
}
389-
// These shouldn't happen
390-
TokenKind::Interpolated(_) => RustcTokenData::Other,
391-
TokenKind::Eof => RustcTokenData::Other,
392-
};
393-
tokens.push(RustcToken {
394-
extent: source_map.span_to_snippet(parser.token.span).unwrap(),
395-
spacing: parser.token_spacing.into(),
396-
data,
397-
summary: format!(
398-
"{:} {:?}",
399-
format_spacing(&parser.token_spacing),
400-
parser.token.kind.clone()
401-
),
402-
});
403-
parser.bump();
437+
}
438+
// These shouldn't happen
439+
TokenKind::Interpolated(_) => RustcTokenData::Other,
440+
TokenKind::NtIdent(_, _) => RustcTokenData::Other,
441+
TokenKind::NtLifetime(_) => RustcTokenData::Other,
442+
TokenKind::Eof => RustcTokenData::Other,
443+
};
444+
RustcToken {
445+
extent: source_map.span_to_snippet(token.span).unwrap(),
446+
spacing: spacing.into(),
447+
data,
448+
summary: format!("{:} {:?}", format_spacing(&spacing), token.kind.clone()),
404449
}
405-
tokens
406450
}
407451

408452
fn literal_data_from_ast_litkind(ast_lit: rustc_ast::ast::LitKind) -> RustcLiteralData {

writeup/introduction.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ That means it describes `c""` literals, but not
2020
Other statements in this document are intended to be true as of April 2024.
2121

2222
The comparable implementation is intended to be compiled against (and compared against)\
23-
rustc nightly from approximately 2024-05-02
23+
rustc nightly from approximately 2024-07-29
2424

2525

2626
### Editions

0 commit comments

Comments (0)