 //! Runs rustc's lexical analysis.
 //!
-//! This works by running the low-level and high-level lexers as far as making a `TokenTree`, then
-//! pulling tokens from it one by one in the same way as rustc's parser does. If rustc emits any
-//! error messages (or panics), we treat the input as rejected.
+//! This works by running the low-level and high-level lexers as far as making a `TokenStream`, then
+//! flattening the `TokenTree`s it contains back into a sequence of tokens in a similar way to
+//! rustc's parser.
+//! If rustc emits any error messages (or panics), we treat the input as rejected.
 //!
 //! Stringlike literal tokens are further run through ast::LitKind::from_token_lit(), to obtain the
 //! "unescaped" value.
 //! (BOM-removal and CRLF-conversion) happen. Later shebang removal happens too. See the
 //! [`cleaning`][`crate::cleaning`] module for how we make equivalent input for comparison.
 //!
-//! One weakness of this approach is that, because it constructs a token tree, input with imbalanced
-//! delimiters is rejected. (I don't see a `pub` interface giving access to the stream before
-//! building the `TokenTree`.)
+//! A limitation of this approach is that, because it constructs token trees, input with imbalanced
+//! delimiters is rejected.
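+//! (For example, an input consisting of a single `(` is rejected here, even
+//! though a standalone lexer would tokenise it successfully.)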
 
 extern crate rustc_ast;
 extern crate rustc_data_structures;
@@ -25,17 +25,19 @@ extern crate rustc_session;
 extern crate rustc_span;
 
 // This compiles with
-// rustc nightly from approximately 2024-05-02
+// rustc nightly from approximately 2024-07-29
 
 use std::{
     mem,
     sync::{Arc, Mutex},
 };
 
-use rustc_ast::token::TokenKind;
+use rustc_ast::{
+    token::{Token, TokenKind},
+    tokenstream::{TokenStream, TokenTree},
+};
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{DiagCtxt, LazyFallbackBundle};
-use rustc_parse::parser::Parser;
 use rustc_span::{
     source_map::{FilePathMapping, SourceMap},
     FileName,
@@ -211,16 +213,16 @@ pub enum Analysis {
 /// - if rustc would have reported a non-fatal error, at least one message has
 ///   been added to error_list
 /// - in this case, the returned tokens are what would have been passed on to
-///   the parser (an empty list if tokentree construction failed).
+///   the parser (an empty list if token stream construction failed).
 fn run_lexer(input: &str, error_list: ErrorAccumulator) -> Vec<RustcToken> {
     let psess = make_parser_session(error_list.clone());
     let source_map = psess.source_map();
     let input = String::from(input);
     let filename = FileName::Custom("lex_via_rustc".into());
-    let lexed = match rustc_parse::maybe_new_parser_from_source_str(&psess, filename, input) {
-        Ok(parser) => tokens_from_parser(parser, source_map),
+    let lexed = match rustc_parse::source_str_to_stream(&psess, filename, input, None) {
+        Ok(token_stream) => TokenStreamProcessor::process(&token_stream, &source_map),
         Err(diags) => {
-            // Errors constructing the tokentree are reported here
+            // Errors constructing the token stream are reported here
             // (ie, unbalanced delimiters).
             assert!(!diags.is_empty());
             for diag in diags {
@@ -232,7 +234,7 @@ fn run_lexer(input: &str, error_list: ErrorAccumulator) -> Vec<RustcToken> {
     // The lexer doesn't report errors itself when it sees emoji in 'identifiers'. Instead it leaves
     // a note in the ParseSess to be examined later. So we have to make this extra check.
     if !&psess.bad_unicode_identifiers.borrow_mut().is_empty() {
-        psess.dcx.err("bad unicode identifier(s)");
+        psess.dcx().err("bad unicode identifier(s)");
     }
     lexed
 }
@@ -305,104 +307,146 @@ fn make_parser_session(error_list: ErrorAccumulator) -> rustc_session::parse::Pa
     rustc_session::parse::ParseSess::with_dcx(dcx, sm)
 }
 
-fn tokens_from_parser(mut parser: Parser, source_map: &SourceMap) -> Vec<RustcToken> {
-    let mut tokens = Vec::new();
-    while parser.token.kind != TokenKind::Eof {
-        let data = match parser.token.kind {
-            TokenKind::DocComment(comment_kind, style, symbol) => RustcTokenData::DocComment {
-                comment_kind: comment_kind.into(),
-                style: style.into(),
-                body: symbol.to_string(),
-            },
-            TokenKind::Eq => RustcTokenData::Punctuation,
-            TokenKind::Lt => RustcTokenData::Punctuation,
-            TokenKind::Le => RustcTokenData::Punctuation,
-            TokenKind::EqEq => RustcTokenData::Punctuation,
-            TokenKind::Ne => RustcTokenData::Punctuation,
-            TokenKind::Ge => RustcTokenData::Punctuation,
-            TokenKind::Gt => RustcTokenData::Punctuation,
-            TokenKind::AndAnd => RustcTokenData::Punctuation,
-            TokenKind::OrOr => RustcTokenData::Punctuation,
-            TokenKind::Not => RustcTokenData::Punctuation,
-            TokenKind::Tilde => RustcTokenData::Punctuation,
-            TokenKind::BinOp(_) => RustcTokenData::Punctuation,
-            TokenKind::BinOpEq(_) => RustcTokenData::Punctuation,
-            TokenKind::At => RustcTokenData::Punctuation,
-            TokenKind::Dot => RustcTokenData::Punctuation,
-            TokenKind::DotDot => RustcTokenData::Punctuation,
-            TokenKind::DotDotDot => RustcTokenData::Punctuation,
-            TokenKind::DotDotEq => RustcTokenData::Punctuation,
-            TokenKind::Comma => RustcTokenData::Punctuation,
-            TokenKind::Semi => RustcTokenData::Punctuation,
-            TokenKind::Colon => RustcTokenData::Punctuation,
-            TokenKind::PathSep => RustcTokenData::Punctuation,
-            TokenKind::RArrow => RustcTokenData::Punctuation,
-            TokenKind::LArrow => RustcTokenData::Punctuation,
-            TokenKind::FatArrow => RustcTokenData::Punctuation,
-            TokenKind::Pound => RustcTokenData::Punctuation,
-            TokenKind::Dollar => RustcTokenData::Punctuation,
-            TokenKind::Question => RustcTokenData::Punctuation,
-            TokenKind::SingleQuote => RustcTokenData::Punctuation,
-            TokenKind::OpenDelim(_) => RustcTokenData::Punctuation,
-            TokenKind::CloseDelim(_) => RustcTokenData::Punctuation,
-            TokenKind::Ident(symbol, style) => RustcTokenData::Ident {
-                style: style.into(),
-                identifier: symbol.to_string(),
-            },
-            TokenKind::Lifetime(symbol) => RustcTokenData::Lifetime {
-                symbol: symbol.to_string(),
-            },
-            TokenKind::Literal(rustc_ast::token::Lit {
-                kind: rustc_ast::token::LitKind::Integer,
-                suffix,
-                ..
-            }) => RustcTokenData::Lit {
-                literal_data: RustcLiteralData::Integer(
-                    suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
-                ),
-            },
-            TokenKind::Literal(rustc_ast::token::Lit {
-                kind: rustc_ast::token::LitKind::Float,
-                suffix,
-                ..
-            }) => RustcTokenData::Lit {
-                literal_data: RustcLiteralData::Float(
-                    suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
-                ),
-            },
-            TokenKind::Literal(lit) => {
-                match lit.suffix {
-                    // from_token_lit() is what performs unescaping, but it will panic if it sees a
-                    // suffix
-                    None => {
-                        let ast_lit = rustc_ast::ast::LitKind::from_token_lit(lit)
-                            .expect("from_token_lit failed");
-                        RustcTokenData::Lit {
-                            literal_data: literal_data_from_ast_litkind(ast_lit),
-                        }
+/// Converts a rustc_ast `TokenStream` to a flat sequence of `RustcToken`s.
+struct TokenStreamProcessor<'a> {
+    source_map: &'a SourceMap,
+    output: Vec<RustcToken>,
+}
+
+impl<'a> TokenStreamProcessor<'a> {
+    fn process(token_stream: &TokenStream, source_map: &'a SourceMap) -> Vec<RustcToken> {
+        let mut flattener = Self {
+            source_map,
+            output: Vec::new(),
+        };
+        flattener.add_tokens_from_stream(token_stream);
+        flattener.output
+    }
+
+    fn add_tokens_from_stream(&mut self, token_stream: &TokenStream) {
+        for token_tree in token_stream.trees() {
+            self.add_tokens_from_tree(token_tree);
+        }
+    }
+
+    fn add_tokens_from_tree(&mut self, token_tree: &TokenTree) {
+        match token_tree {
+            &TokenTree::Token(ref token, spacing) => {
+                self.output
+                    .push(token_from_ast_token(token, spacing, self.source_map))
+            }
+            &TokenTree::Delimited(delim_span, delim_spacing, delimiter, ref token_stream) => {
+                self.output.push(token_from_ast_token(
+                    &Token::new(TokenKind::OpenDelim(delimiter), delim_span.open),
+                    delim_spacing.open,
+                    self.source_map,
+                ));
+                self.add_tokens_from_stream(token_stream);
+                self.output.push(token_from_ast_token(
+                    &Token::new(TokenKind::CloseDelim(delimiter), delim_span.close),
+                    delim_spacing.close,
+                    self.source_map,
+                ));
+            }
+        }
+    }
+}
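+
+// Note: `add_tokens_from_tree` recurses into nested `Delimited` trees, so a
+// stream built from `f(x)` should flatten to the four tokens `f`, `(`, `x`,
+// `)` in source order, with the delimiter tokens re-synthesised from the
+// tree's `DelimSpan` and `DelimSpacing`.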
+
+fn token_from_ast_token(
+    token: &Token,
+    spacing: rustc_ast::tokenstream::Spacing,
+    source_map: &SourceMap,
+) -> RustcToken {
+    let data = match token.kind {
+        TokenKind::DocComment(comment_kind, style, symbol) => RustcTokenData::DocComment {
+            comment_kind: comment_kind.into(),
+            style: style.into(),
+            body: symbol.to_string(),
+        },
+        TokenKind::Eq => RustcTokenData::Punctuation,
+        TokenKind::Lt => RustcTokenData::Punctuation,
+        TokenKind::Le => RustcTokenData::Punctuation,
+        TokenKind::EqEq => RustcTokenData::Punctuation,
+        TokenKind::Ne => RustcTokenData::Punctuation,
+        TokenKind::Ge => RustcTokenData::Punctuation,
+        TokenKind::Gt => RustcTokenData::Punctuation,
+        TokenKind::AndAnd => RustcTokenData::Punctuation,
+        TokenKind::OrOr => RustcTokenData::Punctuation,
+        TokenKind::Not => RustcTokenData::Punctuation,
+        TokenKind::Tilde => RustcTokenData::Punctuation,
+        TokenKind::BinOp(_) => RustcTokenData::Punctuation,
+        TokenKind::BinOpEq(_) => RustcTokenData::Punctuation,
+        TokenKind::At => RustcTokenData::Punctuation,
+        TokenKind::Dot => RustcTokenData::Punctuation,
+        TokenKind::DotDot => RustcTokenData::Punctuation,
+        TokenKind::DotDotDot => RustcTokenData::Punctuation,
+        TokenKind::DotDotEq => RustcTokenData::Punctuation,
+        TokenKind::Comma => RustcTokenData::Punctuation,
+        TokenKind::Semi => RustcTokenData::Punctuation,
+        TokenKind::Colon => RustcTokenData::Punctuation,
+        TokenKind::PathSep => RustcTokenData::Punctuation,
+        TokenKind::RArrow => RustcTokenData::Punctuation,
+        TokenKind::LArrow => RustcTokenData::Punctuation,
+        TokenKind::FatArrow => RustcTokenData::Punctuation,
+        TokenKind::Pound => RustcTokenData::Punctuation,
+        TokenKind::Dollar => RustcTokenData::Punctuation,
+        TokenKind::Question => RustcTokenData::Punctuation,
+        TokenKind::SingleQuote => RustcTokenData::Punctuation,
+        TokenKind::OpenDelim(_) => RustcTokenData::Punctuation,
+        TokenKind::CloseDelim(_) => RustcTokenData::Punctuation,
+        TokenKind::Ident(symbol, style) => RustcTokenData::Ident {
+            style: style.into(),
+            identifier: symbol.to_string(),
+        },
+        TokenKind::Lifetime(symbol) => RustcTokenData::Lifetime {
+            symbol: symbol.to_string(),
+        },
+        TokenKind::Literal(rustc_ast::token::Lit {
+            kind: rustc_ast::token::LitKind::Integer,
+            suffix,
+            ..
+        }) => RustcTokenData::Lit {
+            literal_data: RustcLiteralData::Integer(
+                suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
+            ),
+        },
+        TokenKind::Literal(rustc_ast::token::Lit {
+            kind: rustc_ast::token::LitKind::Float,
+            suffix,
+            ..
+        }) => RustcTokenData::Lit {
+            literal_data: RustcLiteralData::Float(
+                suffix.map(|s| s.to_string()).unwrap_or_else(String::new),
+            ),
+        },
+        TokenKind::Literal(lit) => {
+            match lit.suffix {
+                // from_token_lit() is what performs unescaping, but it will panic if it sees a
+                // suffix
+                None => {
+                    let ast_lit = rustc_ast::ast::LitKind::from_token_lit(lit)
+                        .expect("from_token_lit failed");
+                    RustcTokenData::Lit {
+                        literal_data: literal_data_from_ast_litkind(ast_lit),
                     }
-                    Some(suffix) => RustcTokenData::Lit {
-                        literal_data: RustcLiteralData::ForbiddenSuffix(suffix.to_string()),
-                    },
                 }
+                Some(suffix) => RustcTokenData::Lit {
+                    literal_data: RustcLiteralData::ForbiddenSuffix(suffix.to_string()),
+                },
             }
-            // These shouldn't happen
-            TokenKind::Interpolated(_) => RustcTokenData::Other,
-            TokenKind::Eof => RustcTokenData::Other,
-        };
-        tokens.push(RustcToken {
-            extent: source_map.span_to_snippet(parser.token.span).unwrap(),
-            spacing: parser.token_spacing.into(),
-            data,
-            summary: format!(
-                "{:} {:?}",
-                format_spacing(&parser.token_spacing),
-                parser.token.kind.clone()
-            ),
-        });
-        parser.bump();
+        }
+        // These shouldn't happen
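+        // (nonterminal tokens should only arise from macro expansion, which
+        // never runs here, and `Eof` is never stored inside a `TokenStream`)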
+        TokenKind::Interpolated(_) => RustcTokenData::Other,
+        TokenKind::NtIdent(_, _) => RustcTokenData::Other,
+        TokenKind::NtLifetime(_) => RustcTokenData::Other,
+        TokenKind::Eof => RustcTokenData::Other,
+    };
+    RustcToken {
+        extent: source_map.span_to_snippet(token.span).unwrap(),
+        spacing: spacing.into(),
+        data,
+        summary: format!("{:} {:?}", format_spacing(&spacing), token.kind.clone()),
     }
-    tokens
 }
 
 fn literal_data_from_ast_litkind(ast_lit: rustc_ast::ast::LitKind) -> RustcLiteralData {