Skip to content

Commit f8dcf03

Browse files
committed
Merge remote-tracking branch 'cmr/lexical-syntax'
2 parents 7f48db6 + 26b5257 commit f8dcf03

File tree

1 file changed

+243
-0
lines changed

1 file changed

+243
-0
lines changed
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
- Start Date: 2014-05-23
2+
- RFC PR #: (leave this empty)
3+
- Rust Issue #: (leave this empty)
4+
5+
# Summary
6+
7+
Simplify Rust's lexical syntax to make tooling easier to use and easier to
8+
define.
9+
10+
# Motivation
11+
12+
Rust's lexer does a lot of work. It un-escapes escape sequences in string and
13+
character literals, and parses numeric literals of 4 different bases. It also
14+
strips comments, which is sensible, but can be undesirable for pretty printing
15+
or syntax highlighting without hacks. Since many characters are allowed in
16+
strings both escaped and raw (tabs, newlines, and Unicode characters come to
17+
mind), after lexing it is impossible to tell if a given character was escaped
18+
or unescaped in the source, making the lexer difficult to test against a
19+
model.
20+
21+
# Detailed design
22+
23+
The following (antlr4) grammar completely describes the proposed lexical
24+
syntax:
25+
26+
lexer grammar RustLexer;
27+
28+
/* import Xidstart, Xidcont; */
29+
30+
/* Expression-operator symbols */
31+
32+
EQ : '=' ;
33+
LT : '<' ;
34+
LE : '<=' ;
35+
EQEQ : '==' ;
36+
NE : '!=' ;
37+
GE : '>=' ;
38+
GT : '>' ;
39+
ANDAND : '&&' ;
40+
OROR : '||' ;
41+
NOT : '!' ;
42+
TILDE : '~' ;
43+
PLUS : '+' ;
44+
MINUS : '-' ;
45+
STAR : '*' ;
46+
SLASH : '/' ;
47+
PERCENT : '%' ;
48+
CARET : '^' ;
49+
AND : '&' ;
50+
OR : '|' ;
51+
SHL : '<<' ;
52+
SHR : '>>' ;
53+
54+
/* Binary operators that may be followed by '=' to form a compound
   assignment (see BINOPEQ). SLASH is listed so that '/=' lexes as a
   single BINOPEQ token, like the other arithmetic operators. */
BINOP
55+
: PLUS
56+
| MINUS
57+
| STAR
| SLASH
58+
| PERCENT
59+
| CARET
60+
| AND
61+
| OR
62+
| SHL
63+
| SHR
64+
;
65+
66+
BINOPEQ : BINOP EQ ;
67+
68+
/* "Structural symbols" */
69+
70+
AT : '@' ;
71+
DOT : '.' ;
72+
DOTDOT : '..' ;
73+
DOTDOTDOT : '...' ;
74+
COMMA : ',' ;
75+
SEMI : ';' ;
76+
COLON : ':' ;
77+
MOD_SEP : '::' ;
78+
LARROW : '->' ;
79+
FAT_ARROW : '=>' ;
80+
LPAREN : '(' ;
81+
RPAREN : ')' ;
82+
LBRACKET : '[' ;
83+
RBRACKET : ']' ;
84+
LBRACE : '{' ;
85+
RBRACE : '}' ;
86+
POUND : '#';
87+
DOLLAR : '$' ;
88+
UNDERSCORE : '_' ;
89+
90+
KEYWORD : STRICT_KEYWORD | RESERVED_KEYWORD ;
91+
92+
/* Keywords in active use by the language. Both boolean literals
   ('true' and 'false') are listed so that neither lexes as IDENT. */
fragment STRICT_KEYWORD
93+
: 'as'
94+
| 'box'
95+
| 'break'
96+
| 'continue'
97+
| 'crate'
98+
| 'else'
99+
| 'enum'
100+
| 'extern'
| 'false'
101+
| 'fn'
102+
| 'for'
103+
| 'if'
104+
| 'impl'
105+
| 'in'
106+
| 'let'
107+
| 'loop'
108+
| 'match'
109+
| 'mod'
110+
| 'mut'
111+
| 'once'
112+
| 'proc'
113+
| 'pub'
114+
| 'ref'
115+
| 'return'
116+
| 'self'
117+
| 'static'
118+
| 'struct'
119+
| 'super'
120+
| 'trait'
121+
| 'true'
122+
| 'type'
123+
| 'unsafe'
124+
| 'use'
125+
| 'virtual'
126+
| 'while'
127+
;
128+
129+
fragment RESERVED_KEYWORD
130+
: 'alignof'
131+
| 'be'
132+
| 'const'
133+
| 'do'
134+
| 'offsetof'
135+
| 'priv'
136+
| 'pure'
137+
| 'sizeof'
138+
| 'typeof'
139+
| 'unsized'
140+
| 'yield'
141+
;
142+
143+
// Literals
144+
145+
fragment HEXIT
146+
: [0-9a-fA-F]
147+
;
148+
149+
fragment CHAR_ESCAPE
150+
: [nrt\\'"0]
151+
| [xX] HEXIT HEXIT
152+
| 'u' HEXIT HEXIT HEXIT HEXIT
153+
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
154+
;
155+
156+
LIT_CHAR
157+
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\''
158+
;
159+
160+
/* Type suffixes accepted after an integer literal. Declared as a
   fragment so it only contributes to LIT_INTEGER; as a standalone token
   rule it would precede IDENT and win the tie for inputs such as `i`,
   `u` or `u8`, which must lex as identifiers. */
fragment INT_SUFFIX
161+
: 'i'
162+
| 'i8'
163+
| 'i16'
164+
| 'i32'
165+
| 'i64'
166+
| 'u'
167+
| 'u8'
168+
| 'u16'
169+
| 'u32'
170+
| 'u64'
171+
;
172+
173+
LIT_INTEGER
174+
: [0-9][0-9_]* INT_SUFFIX?
175+
| '0b' [01][01_]* INT_SUFFIX?
176+
| '0o' [0-7][0-7_]* INT_SUFFIX?
177+
| '0x' [0-9a-fA-F][0-9a-fA-F_]* INT_SUFFIX?
178+
;
179+
180+
/* Type suffixes accepted after a floating-point literal. Declared as a
   fragment so it only contributes to LIT_FLOAT; as a standalone token
   rule it would precede IDENT and capture the type names `f32` and
   `f64` when they appear on their own. */
fragment FLOAT_SUFFIX
181+
: 'f32'
182+
| 'f64'
183+
| 'f128'
184+
;
185+
186+
LIT_FLOAT
187+
: [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? FLOAT_SUFFIX?)
188+
;
189+
190+
LIT_STR
191+
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"'
192+
;
193+
194+
/* this is a bit messy */
195+
196+
fragment LIT_STR_RAW_INNER
197+
: '"' .*? '"'
198+
| LIT_STR_RAW_INNER2
199+
;
200+
201+
fragment LIT_STR_RAW_INNER2
202+
: POUND LIT_STR_RAW_INNER POUND
203+
;
204+
205+
LIT_STR_RAW
206+
: 'r' LIT_STR_RAW_INNER
207+
;
208+
209+
fragment BLOCK_COMMENT
210+
: '/*' (BLOCK_COMMENT | .)*? '*/'
211+
;
212+
213+
COMMENT
214+
: '//' ~[\r\n]*
215+
| BLOCK_COMMENT
216+
;
217+
218+
IDENT : XID_start XID_continue* ;
219+
220+
LIFETIME : '\'' IDENT ;
221+
222+
WHITESPACE : [ \r\n\t]+ ;
223+
224+
225+
There are a few notable changes from today's lexical syntax:
226+
227+
- Non-doc comments are not stripped. To compensate, when encountering a
228+
COMMENT token the parser itself can check whether or not it's a doc comment.
229+
This can be done with a simple regex: `(//(/[^/]|!)|/\*(\*[^*]|!))`.
230+
- Numeric literals are not differentiated based on presence of type suffix,
231+
nor are they converted from binary/octal/hexadecimal to decimal, nor are
232+
underscores stripped. This can be done trivially in the parser.
233+
- Character escapes are not unescaped. That is, if you write '\x20', this
234+
lexer will give you `LIT_CHAR('\x20')` rather than `LIT_CHAR(' ')`. The same
235+
applies to string literals.
236+
237+
The output of the lexer then becomes annotated spans -- which part of the
238+
document corresponds to which token type. Even whitespace is categorized.
239+
240+
# Drawbacks
241+
242+
Including comments and whitespace in the token stream is very non-traditional
243+
and not strictly necessary.

0 commit comments

Comments
 (0)