@@ -55,7 +55,7 @@ RscGrammar preprocess(RscGrammar rsc) {
     // Replace occurrences of singleton ranges with just the corresponding
     // literal. This makes it easier to identify delimiters.
     return visit (rsc) {
-        case s: \char-class([range(char, char)]) => d
+        case \char-class([range(char, char)]) => d
             when d := \lit("<stringChar(char)>"), isDelimiter(d)
     }
 }
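A sketch of what this rewrite does, on an illustrative code point (`stringChar` is the standard-library conversion from code point to string; whether `";"` counts as a delimiter is an assumption here):

```
// Illustration (not part of the commit): a character class that matches
// exactly one character, e.g. ";" (code point 59):
//     \char-class([range(59, 59)])
// Since stringChar(59) == ";" and isDelimiter(\lit(";")) would hold for such
// a delimiter, the visit rewrites it to the equivalent literal:
//     \lit(";")
```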
@@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
 
     // Analyze dependencies among productions
     println("[LOG] Analyzing dependencies among productions");
-    Dependencies dependencies = deps(toGraph(rsc));
-    list[Production] prods = dependencies
-        .removeProds(isCyclic, true) // `true` means "also remove ancestors"
-        .retainProds(isNonEmpty)
-        .retainProds(hasCategory)
-        .getProds();
+    Graph[Production] graph = toGraph(rsc);
+    list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
+    list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds();
+    list[Production] prodsRecursive = prods - prodsNonRecursive;
 
     // Analyze delimiters
     println("[LOG] Analyzing delimiters");
@@ -134,13 +132,15 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
     list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})];
 
     // Return
-    bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
-    list[ConversionUnit] units
-        = [unit(rsc, p, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)];
-
-    return sort(units);
+    bool isRecursive(Production p)
+        = p in prodsRecursive;
+    bool isEmptyProd(prod(_, [\alt(alternatives)], _))
+        = alternatives == {};
+
+    set[ConversionUnit] units = {};
+    units += {unit(rsc, p, isRecursive(p), hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods};
+    units += {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters + prodsKeywords, !isEmptyProd(p)};
+    return sort([*removeStrictPrefixes(units)]);
 }
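The `unit` constructor evidently gained an extra boolean as its third argument. A hedged sketch of the assumed constructor shape; the `recursive` and `innerDelimiters` field names are confirmed by call sites later in this diff, while the remaining names are guesses for illustration:

```
// Assumed shape (field names partly guessed, not from the source):
// data ConversionUnit = unit(
//     RscGrammar rsc,
//     Production prod,
//     bool recursive,        // new: isRecursive(p) at the call site
//     bool multiLine,        // hasNewline(rsc, p) at the call site
//     DelimiterPair outerDelimiters,
//     DelimiterPair innerDelimiters);
```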
 
 @synopsis{
@@ -196,7 +196,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
 
         // Convert all units in the group to match patterns (including,
         // optimistically, multi-line units as-if they are single-line)
-        for (u <- group) {
+        for (u <- group, !u.recursive) {
             TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
                 [name = "/inner/single/<u.name>"];
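The new `!u.recursive` guard keeps recursive units out of the single-line match patterns. An illustrative reason (grammar invented for the example):

```
// A recursive production such as:
//     syntax Exp = bracket "(" Exp ")";
// matches arbitrarily deep nesting, which no single regular expression can
// express, so converting it to an /inner/single/ match pattern would be
// unsound. Such units are handled by the begin/end machinery below instead.
```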
@@ -216,32 +216,116 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
         // Simple case: each unit does have an `end` inner delimiter
         if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-            // Compute a list of segments that need to be consumed between
+            // Compute a set of segments that need to be consumed between
             // the `begin` delimiter and the `end` delimiters. Each of these
             // segments will be converted to a match pattern.
             set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
             segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
 
-            list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
-            terminals = [s | s <- terminals, [] != s.symbols];
-            terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
-            terminals = dup(terminals);
-            terminals = sortByMinimumLength(terminals); // Small symbols first
-            terminals = reverse(terminals); // Large symbols first
-            terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
-
             TmRule r = toTmRule(
                 toRegExp(rsc, [begin], {t}),
                 toRegExp(rsc, [\alt(ends)], {t}),
-                [toTmRule(toRegExp(rsc, [s], {t})) | s <- terminals])
+                [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
                 [name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
 
             rules = insertIn(rules, (u: r | u <- group));
         }
 
-        // Complex case: some unit doesn't have an `end` inner delimiter
+        // Complex case: some unit doesn't have an `end` inner delimiter.
+        // This requires (substantial) extra care, as there is no obvious
+        // marker to close the begin/end pattern with.
         else {
-            ; // TODO (part of future support for *recursive* multi-line units)
+            Decomposition decomposition = decompose([*group]);
+
+            // TODO: The following condition can be true (even though there
+            // has to be a `begin` delimiter) because `decompose` doesn't
+            // expand non-terminals. Consider if it should, to maybe improve
+            // accuracy.
+            if ([] == decomposition.prefix) {
+                continue;
+            }
+
+            RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
+            RegExp reEnd = regExp("(?=.)", []);
+
+            patterns = for (suffix <- decomposition.suffixes) {
+                if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
+                    if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
+                        // If the suffix has both a `begin` delimiter
+                        // and an `end` delimiter, then generate a
+                        // begin/end pattern to highlight these delimiters
+                        // and all content in between.
+
+                        set[Segment] segs = getSegments(rsc, suffix);
+                        segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};
+
+                        append toTmRule(
+                            toRegExp(rsc, [begin], {t}),
+                            toRegExp(rsc, [end], {t}),
+                            [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
+                    }
+
+                    else {
+                        // If the suffix has a `begin` delimiter, but not
+                        // an `end` delimiter, then generate a match pattern
+                        // just to highlight that `begin` delimiter. Ignore
+                        // the remainder of the suffix (because it's
+                        // recursive, so no regular expression can be
+                        // generated for it).
+                        append toTmRule(toRegExp(rsc, [begin], {t}));
+                    }
+                }
+
+                else {
+                    // If the suffix doesn't have a `begin` delimiter, then
+                    // ignore it (because it's recursive, so no regular
+                    // expression can be generated for it).
+                    ;
+                }
+            }
+
+            TmRule r = toTmRule(reBegin, reEnd, patterns);
+            r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
+            r = r[applyEndPatternLast = true];
+
+            rules = insertIn(rules, (u: r | u <- group));
+
+            // TODO: The current approach produces "partially"
+            // newline-sensitive rules, in the sense that newlines are
+            // accepted between the prefix and the suffixes, but not between
+            // symbols in the prefix. This approach could be improved to
+            // produce "totally" newline-sensitive rules (at the cost of
+            // much more complicated rule generation and generated rules) by
+            // adopting an approach in which the rules for each symbol in
+            // the prefix look something like the following three:
+            //
+            // ```
+            // "foo": {
+            //   "name": "foo",
+            //   "begin": "(\\@)",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
+            //   "contentName": "comment",
+            //   "beginCaptures": { "1": { "name": "comment" } }
+            // },
+            // "foo.$": {
+            //   "begin": "$",
+            //   "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.$",
+            //   "patterns": [ { "include": "#foo.^" }]
+            // },
+            // "foo.^": {
+            //   "begin": "^",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.^",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
+            // }
+            // ```
+            //
+            // Note: This alternative approach would likely render the
+            // present distinction between the "simple case" and the
+            // "complex case" unneeded, so in that sense, rule generation
+            // would actually become simpler.
         }
     }
 }
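The net effect of the complex case, sketched as the shape of one emitted TextMate rule (values illustrative; `begin`, `end`, `applyEndPatternLast`, and `patterns` are standard TextMate grammar keys):

```
// {
//   "begin": "<regex for decomposition.prefix>",
//   "end": "(?=.)",               // zero-width: matches before any character
//   "applyEndPatternLast": true,  // try the suffix patterns first
//   "patterns": [ /* one match or begin/end rule per suffix */ ]
// }
// Because "(?=.)" consumes nothing and the end pattern is applied last, the
// rule appears to stay open just long enough for a suffix pattern to match,
// then closes without consuming input.
```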
@@ -302,10 +386,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
     if (seg.final, _ <- symbols, symbols[-1] in ends) {
         symbols = symbols[..-1];
     }
-
     return seg[symbols = symbols];
 }
 
+private list[Symbol] toTerminals(set[Segment] segs) {
+    list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
+    terminals = [s | s <- terminals, [] != s.symbols];
+    terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
+    terminals = dup(terminals);
+    terminals = sortByMinimumLength(terminals); // Small symbols first
+    terminals = reverse(terminals); // Large symbols first
+    terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
+    return terminals;
+}
+
 // TODO: This function could be moved to a separate, generic module
 private list[&T] dupLast(list[&T] l)
     = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
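An illustrative trace of the new `toTerminals` helper (input invented for the example; set iteration order is arbitrary):

```
// Given segments whose symbol lists are [lit(":")], [lit("::")], and []:
//     wrap in \seq:       [\seq([lit(":")]), \seq([lit("::")]), \seq([])]
//     drop empty:         [\seq([lit(":")]), \seq([lit("::")])]
//     destar + dup:       unchanged here
//     sort, then reverse: [\seq([lit("::")]), \seq([lit(":")])]  // large first
//     append fallback:    [..., \char-class([range(1,0x10FFFF)])]
// Large-first ordering makes the engine try "::" before ":", and the final
// any-char class guarantees the tokenizer can always make progress.
```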