Skip to content

Commit 19e1f5c

Browse files
committed
Lexer; subtly wrong; no makefile
1 parent e624791 commit 19e1f5c

File tree

3 files changed

+401
-0
lines changed

3 files changed

+401
-0
lines changed

src/grammar/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Reference grammar.
2+
3+
Uses [antlr4](http://www.antlr.org/) and a custom Rust tool to compare
4+
ASTs/token streams generated.
5+
6+
To use:
7+
8+
```
9+
antlr4 RustLexer.g4
10+
javac *.java
11+
rustc -O verify.rs
12+
for file in ../*/**.rs; do
13+
echo $file;
14+
grun RustLexer tokens -tokens < $file | ./verify $file || break
15+
done
16+
```
17+
18+
Note that the `../*/**.rs` glob will match every `*.rs` file in the parent
19+
directory and all of its recursive children. This is a zsh extension.

src/grammar/RustLexer.g4

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
lexer grammar RustLexer;
2+
3+
/* Note: due to antlr limitations, we can't represent XID_start and
4+
* XID_continue properly. ASCII-only substitute. */
5+
6+
fragment XID_start : [_a-zA-Z] ;
7+
fragment XID_continue : [_a-zA-Z0-9] ;
8+
9+
/* Expression-operator symbols */
10+
11+
EQ : '=' ;
12+
LT : '<' ;
13+
LE : '<=' ;
14+
EQEQ : '==' ;
15+
NE : '!=' ;
16+
GE : '>=' ;
17+
GT : '>' ;
18+
ANDAND : '&&' ;
19+
OROR : '||' ;
20+
NOT : '!' ;
21+
TILDE : '~' ;
22+
PLUS : '+' ;
23+
MINUS : '-' ;
24+
STAR : '*' ;
25+
SLASH : '/' ;
26+
PERCENT : '%' ;
27+
CARET : '^' ;
28+
AND : '&' ;
29+
OR : '|' ;
30+
SHL : '<<' ;
31+
SHR : '>>' ;
32+
33+
// Binary operators that may be followed by '=' to form a compound-assignment
// token (see BINOPEQ). SLASH is listed so that '/=' lexes as a single
// BINOPEQ token instead of SLASH followed by EQ.
BINOP
34+
  : PLUS
35+
  | MINUS
36+
  | STAR
  | SLASH
37+
  | PERCENT
38+
  | CARET
39+
  | AND
40+
  | OR
41+
  | SHL
42+
  | SHR
43+
  ;
44+
45+
BINOPEQ : BINOP EQ ;
46+
47+
/* "Structural symbols" */
48+
49+
AT : '@' ;
50+
DOT : '.' ;
51+
DOTDOT : '..' ;
52+
DOTDOTDOT : '...' ;
53+
COMMA : ',' ;
54+
SEMI : ';' ;
55+
COLON : ':' ;
56+
MOD_SEP : '::' ;
57+
RARROW : '->' ;
58+
FAT_ARROW : '=>' ;
59+
LPAREN : '(' ;
60+
RPAREN : ')' ;
61+
LBRACKET : '[' ;
62+
RBRACKET : ']' ;
63+
LBRACE : '{' ;
64+
RBRACE : '}' ;
65+
POUND : '#';
66+
DOLLAR : '$' ;
67+
UNDERSCORE : '_' ;
68+
69+
// Literals
70+
71+
fragment HEXIT
72+
: [0-9a-fA-F]
73+
;
74+
75+
// The part of an escape sequence that follows the backslash:
//   - one of the single characters n r t \ ' " 0
//   - x or X followed by exactly two hex digits
//   - u followed by exactly four hex digits
//   - U followed by exactly eight hex digits
fragment CHAR_ESCAPE
76+
  : [nrt\\'"0]
77+
  | [xX] HEXIT HEXIT
78+
  | 'u' HEXIT HEXIT HEXIT HEXIT
79+
  | 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
80+
  ;
81+
82+
// Character literal: a single-quoted item that is either a backslash escape
// (CHAR_ESCAPE) or one character other than backslash, single quote,
// newline, tab, or carriage return.
LIT_CHAR
83+
  : '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\''
84+
  ;
85+
86+
// Integer-literal type suffixes, used only inside LIT_INTEGER.
// Declared as a fragment: as a standalone token rule it sits before IDENT,
// and ANTLR resolves equal-length matches in favour of the earlier rule, so
// identifiers such as `i`, `u8` or `i32` would lex as INT_SUFFIX rather
// than IDENT.
fragment INT_SUFFIX
87+
  : 'i'
88+
  | 'i8'
89+
  | 'i16'
90+
  | 'i32'
91+
  | 'i64'
92+
  | 'u'
93+
  | 'u8'
94+
  | 'u16'
95+
  | 'u32'
96+
  | 'u64'
97+
  ;
98+
99+
// Integer literal in one of four radixes, each allowing '_' separators
// after the first digit and an optional INT_SUFFIX:
//   decimal (no prefix), binary ('0b'), octal ('0o'), hexadecimal ('0x').
LIT_INTEGER
100+
  : [0-9][0-9_]* INT_SUFFIX?
101+
  | '0b' [01][01_]* INT_SUFFIX?
102+
  | '0o' [0-7][0-7_]* INT_SUFFIX?
103+
  | '0x' [0-9a-fA-F][0-9a-fA-F_]* INT_SUFFIX?
104+
  ;
105+
106+
// Float-literal type suffixes, used only inside LIT_FLOAT.
// Declared as a fragment: as a standalone token rule it sits before IDENT,
// and ANTLR resolves equal-length matches in favour of the earlier rule, so
// type names such as `f32` or `f64` would lex as FLOAT_SUFFIX rather than
// IDENT.
fragment FLOAT_SUFFIX
107+
  : 'f32'
108+
  | 'f64'
109+
  | 'f128'
110+
  ;
111+
112+
// Float literal: a decimal digit run followed by either a bare '.', or by
// an optional fraction, an optional exponent, and an optional FLOAT_SUFFIX.
// NOTE(review): every part of the second alternative is optional, so a plain
// digit run also matches this rule; it presumably relies on LIT_INTEGER
// being declared earlier to win that tie -- confirm.
// NOTE(review): because of the bare '.' alternative, input such as `1..2`
// or `1.foo()` would presumably lex `1.` as a float -- verify against rustc.
LIT_FLOAT
113+
  : [0-9][0-9_]* ('.' | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? FLOAT_SUFFIX?)
114+
  ;
115+
116+
// String literal: a double-quoted, non-greedy sequence of
//   - backslash-newline continuations (LF or CRLF),
//   - backslash escapes (CHAR_ESCAPE), or
//   - any other single character,
// ended by the closing double quote.
LIT_STR
117+
  : '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"'
118+
  ;
119+
120+
LIT_BINARY : 'b' LIT_STR ;
121+
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
122+
123+
/* this is a bit messy */
124+
125+
// Raw-string body: either a double-quoted non-greedy run of any characters,
// or the same wrapped in one more pair of '#' (via LIT_STR_RAW_INNER2).
// The mutual recursion keeps the number of leading and trailing '#' equal.
// NOTE(review): the quoted run is non-greedy, so it is unclear whether a '"'
// inside a hashed raw string (e.g. r#"a"b"#) is handled -- confirm.
fragment LIT_STR_RAW_INNER
126+
  : '"' .*? '"'
127+
  | LIT_STR_RAW_INNER2
128+
  ;
129+
130+
// Adds one '#' on each side of a raw-string body.
fragment LIT_STR_RAW_INNER2
131+
  : POUND LIT_STR_RAW_INNER POUND
132+
  ;
133+
134+
// Raw string literal: 'r' followed by a raw-string body.
LIT_STR_RAW
135+
  : 'r' LIT_STR_RAW_INNER
136+
  ;
137+
138+
IDENT : XID_start XID_continue* ;
139+
140+
LIFETIME : '\'' IDENT ;
141+
142+
WHITESPACE : [ \r\n\t]+ ;
143+
144+
// Comment token: a '//' line comment running to (but not including) the end
// of the line, or a block comment (BLOCK_COMMENT, defined later in this file).
// NOTE(review): any input matched by the '////' alternative is already
// matched, at the same length, by the '//' alternative above it, so the
// second alternative looks redundant -- confirm intent.
COMMENT
145+
  : '//' ~[\r\n]*
146+
  | '////' ~[\r\n]*
147+
  | BLOCK_COMMENT
148+
  ;
149+
150+
// Rules below this line belong to the DOCCOMMENT lexer mode.
// NOTE(review): no rule in this grammar carries a mode()/pushMode() command,
// so nothing appears to switch the lexer into DOCCOMMENT; DOC_COMMENT then
// looks unreachable and '///' / '//!' lines would lex as COMMENT -- confirm.
mode DOCCOMMENT;
151+
152+
// Doc block comment: opens with '/**' or '/*!', may contain nested doc block
// comments, and is closed by the first matching block-comment terminator.
fragment DOC_BLOCK_COMMENT
153+
  : ('/**' | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/'
154+
  ;
155+
156+
// Doc comment token: a '///' or '//!' line running to the end of the line,
// or a doc block comment.
DOC_COMMENT
157+
  : '///' ~[\r\n]*
158+
  | '//!' ~[\r\n]*
159+
  | DOC_BLOCK_COMMENT
160+
  ;
161+
162+
// Ordinary block comment; the self-reference allows nesting.
// NOTE(review): this fragment is declared after the mode line but is
// referenced by COMMENT in the default mode -- confirm that is intended.
fragment BLOCK_COMMENT
163+
  : '/*' (BLOCK_COMMENT | .)*? '*/'
164+
  ;
165+

0 commit comments

Comments
 (0)