Skip to content

Commit 58188b6

Browse files
committed
feat: break apart raw_string_literal
1 parent bd352d2 commit 58188b6

File tree

3 files changed

+85
-25
lines changed

3 files changed

+85
-25
lines changed

grammar.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ module.exports = grammar({
6969

7070
externals: $ => [
7171
$.string_content,
72-
$.raw_string_literal,
72+
$._raw_string_literal_start,
73+
$.raw_string_literal_content,
74+
$._raw_string_literal_end,
7375
$.float_literal,
7476
$._outer_block_doc_comment_marker,
7577
$._inner_block_doc_comment_marker,
@@ -1484,6 +1486,12 @@ module.exports = grammar({
14841486
token.immediate('"'),
14851487
),
14861488

1489+
raw_string_literal: $ => seq(
1490+
$._raw_string_literal_start,
1491+
alias($.raw_string_literal_content, $.string_content),
1492+
$._raw_string_literal_end,
1493+
),
1494+
14871495
char_literal: _ => token(seq(
14881496
optional('b'),
14891497
'\'',

src/scanner.c

Lines changed: 58 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
#include "tree_sitter/alloc.h"
12
#include "tree_sitter/parser.h"
3+
24
#include <wctype.h>
35

46
enum TokenType {
57
STRING_CONTENT,
6-
RAW_STRING_LITERAL,
8+
RAW_STRING_LITERAL_START,
9+
RAW_STRING_LITERAL_CONTENT,
10+
RAW_STRING_LITERAL_END,
711
FLOAT_LITERAL,
812
BLOCK_OUTER_DOC_MARKER,
913
BLOCK_INNER_DOC_MARKER,
@@ -12,15 +16,28 @@ enum TokenType {
1216
ERROR_SENTINEL
1317
};
1418

15-
void *tree_sitter_rust_external_scanner_create() { return NULL; }
19+
typedef struct {
20+
uint8_t opening_hash_count;
21+
} Scanner;
1622

17-
void tree_sitter_rust_external_scanner_destroy(void *p) {}
23+
void *tree_sitter_rust_external_scanner_create() { return ts_calloc(1, sizeof(Scanner)); }
1824

19-
void tree_sitter_rust_external_scanner_reset(void *p) {}
25+
void tree_sitter_rust_external_scanner_destroy(void *payload) { ts_free((Scanner *)payload); }
2026

21-
unsigned tree_sitter_rust_external_scanner_serialize(void *p, char *buffer) { return 0; }
27+
unsigned tree_sitter_rust_external_scanner_serialize(void *payload, char *buffer) {
28+
Scanner *scanner = (Scanner *)payload;
29+
buffer[0] = (char)scanner->opening_hash_count;
30+
return 1;
31+
}
2232

23-
void tree_sitter_rust_external_scanner_deserialize(void *p, const char *b, unsigned n) {}
33+
void tree_sitter_rust_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
34+
Scanner *scanner = (Scanner *)payload;
35+
scanner->opening_hash_count = 0;
36+
if (length == 1) {
37+
Scanner *scanner = (Scanner *)payload;
38+
scanner->opening_hash_count = buffer[0];
39+
}
40+
}
2441

2542
static inline bool is_num_char(int32_t c) { return c == '_' || iswdigit(c); }
2643

@@ -45,8 +62,7 @@ static inline bool process_string(TSLexer *lexer) {
4562
return has_content;
4663
}
4764

48-
static inline bool process_raw_string(TSLexer *lexer) {
49-
lexer->result_symbol = RAW_STRING_LITERAL;
65+
static inline bool scan_raw_string_start(Scanner *scanner, TSLexer *lexer) {
5066
if (lexer->lookahead == 'b' || lexer->lookahead == 'c') {
5167
advance(lexer);
5268
}
@@ -55,7 +71,7 @@ static inline bool process_raw_string(TSLexer *lexer) {
5571
}
5672
advance(lexer);
5773

58-
unsigned opening_hash_count = 0;
74+
uint8_t opening_hash_count = 0;
5975
while (lexer->lookahead == '#') {
6076
advance(lexer);
6177
opening_hash_count++;
@@ -65,20 +81,27 @@ static inline bool process_raw_string(TSLexer *lexer) {
6581
return false;
6682
}
6783
advance(lexer);
84+
scanner->opening_hash_count = opening_hash_count;
6885

86+
lexer->result_symbol = RAW_STRING_LITERAL_START;
87+
return true;
88+
}
89+
90+
static inline bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
6991
for (;;) {
7092
if (lexer->eof(lexer)) {
7193
return false;
7294
}
7395
if (lexer->lookahead == '"') {
96+
lexer->mark_end(lexer);
7497
advance(lexer);
7598
unsigned hash_count = 0;
76-
while (lexer->lookahead == '#' && hash_count < opening_hash_count) {
99+
while (lexer->lookahead == '#' && hash_count < scanner->opening_hash_count) {
77100
advance(lexer);
78101
hash_count++;
79102
}
80-
if (hash_count == opening_hash_count) {
81-
lexer->mark_end(lexer);
103+
if (hash_count == scanner->opening_hash_count) {
104+
lexer->result_symbol = RAW_STRING_LITERAL_CONTENT;
82105
return true;
83106
}
84107
} else {
@@ -87,6 +110,15 @@ static inline bool process_raw_string(TSLexer *lexer) {
87110
}
88111
}
89112

113+
static inline bool scan_raw_string_end(Scanner *scanner, TSLexer *lexer) {
114+
advance(lexer);
115+
for (unsigned i = 0; i < scanner->opening_hash_count; i++) {
116+
advance(lexer);
117+
}
118+
lexer->result_symbol = RAW_STRING_LITERAL_END;
119+
return true;
120+
}
121+
90122
static inline bool process_float_literal(TSLexer *lexer) {
91123
lexer->result_symbol = FLOAT_LITERAL;
92124

@@ -321,7 +353,10 @@ bool tree_sitter_rust_external_scanner_scan(void *payload, TSLexer *lexer, const
321353
return false;
322354
}
323355

324-
if (valid_symbols[BLOCK_COMMENT_CONTENT] || valid_symbols[BLOCK_INNER_DOC_MARKER] || valid_symbols[BLOCK_OUTER_DOC_MARKER]) {
356+
Scanner *scanner = (Scanner *)payload;
357+
358+
if (valid_symbols[BLOCK_COMMENT_CONTENT] || valid_symbols[BLOCK_INNER_DOC_MARKER] ||
359+
valid_symbols[BLOCK_OUTER_DOC_MARKER]) {
325360
return process_block_comment(lexer, valid_symbols);
326361
}
327362

@@ -337,9 +372,17 @@ bool tree_sitter_rust_external_scanner_scan(void *payload, TSLexer *lexer, const
337372
skip(lexer);
338373
}
339374

340-
if (valid_symbols[RAW_STRING_LITERAL] &&
375+
if (valid_symbols[RAW_STRING_LITERAL_START] &&
341376
(lexer->lookahead == 'r' || lexer->lookahead == 'b' || lexer->lookahead == 'c')) {
342-
return process_raw_string(lexer);
377+
return scan_raw_string_start(scanner, lexer);
378+
}
379+
380+
if (valid_symbols[RAW_STRING_LITERAL_CONTENT]) {
381+
return scan_raw_string_content(scanner, lexer);
382+
}
383+
384+
if (valid_symbols[RAW_STRING_LITERAL_END] && lexer->lookahead == '"') {
385+
return scan_raw_string_end(scanner, lexer);
343386
}
344387

345388
if (valid_symbols[FLOAT_LITERAL] && iswdigit(lexer->lookahead)) {

test/corpus/literals.txt

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -138,15 +138,20 @@ r######"foo ##### bar"######;
138138

139139
(source_file
140140
(expression_statement
141-
(raw_string_literal))
141+
(raw_string_literal
142+
(string_content)))
142143
(expression_statement
143-
(raw_string_literal))
144+
(raw_string_literal
145+
(string_content)))
144146
(expression_statement
145-
(raw_string_literal))
147+
(raw_string_literal
148+
(string_content)))
146149
(expression_statement
147-
(raw_string_literal))
150+
(raw_string_literal
151+
(string_content)))
148152
(expression_statement
149-
(raw_string_literal)))
153+
(raw_string_literal
154+
(string_content))))
150155

151156
================================================================================
152157
Raw byte string literals
@@ -159,9 +164,11 @@ br##"abc"##;
159164

160165
(source_file
161166
(expression_statement
162-
(raw_string_literal))
167+
(raw_string_literal
168+
(string_content)))
163169
(expression_statement
164-
(raw_string_literal)))
170+
(raw_string_literal
171+
(string_content))))
165172

166173
================================================================================
167174
Raw C string literals
@@ -174,9 +181,11 @@ cr##"abc"##;
174181

175182
(source_file
176183
(expression_statement
177-
(raw_string_literal))
184+
(raw_string_literal
185+
(string_content)))
178186
(expression_statement
179-
(raw_string_literal)))
187+
(raw_string_literal
188+
(string_content))))
180189

181190
================================================================================
182191
Character literals

0 commit comments

Comments
 (0)