
Commit 8557e38

Merge pull request #17 from SWAT-engineering/recursive-multiline-highlighting2
Recursive multiline highlighting
2 parents 17df6e6 + e8a887c commit 8557e38


19 files changed (+724, -302 lines changed)


rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc

Lines changed: 27 additions & 1 deletion
@@ -31,6 +31,22 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
     return false;
 }
 
+@synopsis{
+    Checks if symbol `s` is recursive in grammar `g`
+}
+
+bool isRecursive(Grammar g, Symbol s) {
+    set[Symbol] getChildren(Symbol s)
+        = {s | p <- lookup(g, s), /Symbol s := p.symbols};
+
+    bool check(set[Symbol] checking, Symbol s)
+        = s in checking
+        ? true
+        : any(child <- getChildren(s), check(checking + s, child));
+
+    return check({}, s);
+}
+
 @synopsis{
     Lookups a list of productions for symbol `s` in grammar `g`, replacing
     formal parameters with actual parameters when needed
@@ -96,10 +112,20 @@ Symbol destar(\seq([symbol]))
 Symbol destar(\alt({symbol}))
     = \alt({destar(symbol)});
 
+Symbol destar(\conditional(symbol, conditions))
+    = \conditional(destar(symbol), conditions);
+
 default Symbol destar(Symbol s) = s;
 
 @synopsis{
-    Retain from set `symbols` each symbol that is a strict prefix of any other
+    Removes the conditional from symbol `s`, if any
+}
+
+Symbol decond(\conditional(Symbol s, _)) = decond(s);
+default Symbol decond(Symbol s) = s;
+
+@synopsis{
+    Retains from set `symbols` each symbol that is a strict prefix of any other
     symbol in `symbols`
 }
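
The new `isRecursive` is a straightforward depth-first reachability check: a symbol is recursive precisely when it can reach itself through the productions that `lookup` yields. A minimal sketch of the intended behavior on a hand-built toy grammar (the module name, symbols, and productions below are illustrative, not part of this commit):

```
module DemoUtil

import Grammar;
import ParseTree;
import lang::rascal::grammar::Util;

// Toy grammar: `Exp` reaches itself via its bracket production; `Id` does not
Symbol exp = sort("Exp");
Symbol id = lex("Id");
Grammar g = \grammar({}, (
    exp: choice(exp, {
        prod(exp, [lit("("), exp, lit(")")], {}), // Exp -> "(" Exp ")"
        prod(exp, [id], {})                       // Exp -> Id
    }),
    id: choice(id, {prod(id, [\char-class([range(97, 122)])], {})})
));

test bool expIsRecursive() = isRecursive(g, exp);
test bool idIsNotRecursive() = !isRecursive(g, id);

// `decond` strips conditions (e.g., follow restrictions) from a symbol
test bool decondStripsConditional()
    = decond(\conditional(id, {\not-follow(lit("x"))})) == id;
```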

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Delimiters.rsc

Lines changed: 24 additions & 1 deletion
@@ -26,6 +26,28 @@ data Direction // Traverse lists of symbols (in productions)...
 list[&T] reorder(list[&T] l, forward()) = l;
 list[&T] reorder(list[&T] l, backward()) = reverse(l);
 
+@synopsis{
+    Gets the unique leftmost delimiter (`begin`) and the unique rightmost
+    delimiter `end`, if any, that occur **inside** productions of symbol `s`
+    (when `s` is a non-terminal) or `s` itself (when `s` is a delimiter). If
+    `getOnlyFirst` is `true` (default: `false`), then only the first (resp.
+    last) symbol of the productions can be considered as leftmost (resp.
+    rightmost).
+}
+
+DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = false) {
+    s = delabel(s);
+    if (isDelimiter(s)) {
+        return <just(s), just(s)>;
+    } else if (isNonTerminalType(s)) {
+        Maybe[Symbol] begin = getInnerDelimiterBySymbol(g, forward(), getOnlyFirst = getOnlyFirst)[s];
+        Maybe[Symbol] end = getInnerDelimiterBySymbol(g, backward(), getOnlyFirst = getOnlyFirst)[s];
+        return <begin, end>;
+    } else {
+        return <nothing(), nothing()>;
+    }
+}
+
 @synopsis{
     Gets the unique leftmost delimiter (`begin`) and the unique rightmost
     delimiter (`end`), if any, that occur **inside** production `p` in grammar
@@ -60,7 +82,7 @@ list[&T] reorder(list[&T] l, backward()) = reverse(l);
 }
 
 DelimiterPair getInnerDelimiterPair(Grammar g, Production p, bool getOnlyFirst = false) {
-    Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward() , getOnlyFirst = getOnlyFirst)[p];
+    Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward(), getOnlyFirst = getOnlyFirst)[p];
     Maybe[Symbol] end = getInnerDelimiterByProduction(g, backward(), getOnlyFirst = getOnlyFirst)[p];
     return <begin, end>;
 }
@@ -79,6 +101,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
     for (p <- ret, ret[p] == nothing()) {
         for (s <- reorder(p.symbols, direction)) {
             s = delabel(s);
+            s = decond(s);
             if (isDelimiter(s)) {
                 ret[p] = just(s);
                 break;
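
The new overload answers delimiter queries per symbol rather than per production. A rough sketch of the intended behavior, assuming a toy block-comment lexical and assuming `lit("/*")` and `lit("*/")` qualify as delimiters under `isDelimiter` (names below are illustrative, not from this commit):

```
import Grammar;
import ParseTree;
import util::Maybe;
import lang::rascal::grammar::analyze::Delimiters;

// Toy lexical: Comment -> "/*" char* "*/"
Symbol comment = lex("Comment");
Grammar g = \grammar({}, (
    comment: choice(comment, {
        prod(comment, [lit("/*"), \iter-star(\char-class([range(1, 0x10FFFF)])), lit("*/")], {})
    })
));

// The leftmost/rightmost delimiters inside the productions of `Comment`
test bool commentPair()
    = getInnerDelimiterPair(g, comment) == <just(lit("/*")), just(lit("*/"))>;

// Per the new code, a delimiter symbol is its own `begin` and `end`
test bool delimiterPair()
    = getInnerDelimiterPair(g, lit("/*")) == <just(lit("/*")), just(lit("/*"))>;
```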

rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc

Lines changed: 122 additions & 28 deletions
@@ -55,7 +55,7 @@ RscGrammar preprocess(RscGrammar rsc) {
     // Replace occurrences of singleton ranges with just the corresponding
     // literal. This makes it easier to identify delimiters.
     return visit (rsc) {
-        case s: \char-class([range(char, char)]) => d
+        case \char-class([range(char, char)]) => d
             when d := \lit("<stringChar(char)>"), isDelimiter(d)
     }
 }
@@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
 
     // Analyze dependencies among productions
     println("[LOG] Analyzing dependencies among productions");
-    Dependencies dependencies = deps(toGraph(rsc));
-    list[Production] prods = dependencies
-        .removeProds(isCyclic, true) // `true` means "also remove ancestors"
-        .retainProds(isNonEmpty)
-        .retainProds(hasCategory)
-        .getProds();
+    Graph[Production] graph = toGraph(rsc);
+    list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
+    list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds();
+    list[Production] prodsRecursive = prods - prodsNonRecursive;
 
     // Analyze delimiters
     println("[LOG] Analyzing delimiters");
@@ -134,13 +132,15 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
     list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})];
 
     // Return
-    bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
-    list[ConversionUnit] units
-        = [unit(rsc, p, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)]
-        + [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)];
-
-    return sort(units);
+    bool isRecursive(Production p)
+        = p in prodsRecursive;
+    bool isEmptyProd(prod(_, [\alt(alternatives)], _))
+        = alternatives == {};
+
+    set[ConversionUnit] units = {};
+    units += {unit(rsc, p, isRecursive(p), hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods};
+    units += {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters + prodsKeywords, !isEmptyProd(p)};
+    return sort([*removeStrictPrefixes(units)]);
 }
 
 @synopsis{
@@ -196,7 +196,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
 
         // Convert all units in the group to match patterns (including,
         // optimistically, multi-line units as-if they are single-line)
-        for (u <- group) {
+        for (u <- group, !u.recursive) {
             TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
                 [name = "/inner/single/<u.name>"];
 
@@ -216,32 +216,116 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
         // Simple case: each unit does have an `end` inner delimiter
         if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
 
-            // Compute a list of segments that need to be consumed between
+            // Compute a set of segments that need to be consumed between
             // the `begin` delimiter and the `end` delimiters. Each of these
             // segments will be converted to a match pattern.
             set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
             segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
 
-            list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
-            terminals = [s | s <- terminals, [] != s.symbols];
-            terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
-            terminals = dup(terminals);
-            terminals = sortByMinimumLength(terminals); // Small symbols first
-            terminals = reverse(terminals); // Large symbols first
-            terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
-
             TmRule r = toTmRule(
                 toRegExp(rsc, [begin], {t}),
                 toRegExp(rsc, [\alt(ends)], {t}),
-                [toTmRule(toRegExp(rsc, [s], {t})) | s <- terminals])
+                [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
                 [name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
 
             rules = insertIn(rules, (u: r | u <- group));
         }
 
-        // Complex case: some unit doesn't have an `end` inner delimiter
+        // Complex case: some unit doesn't have an `end` inner delimiter.
+        // This requires (substantial) extra care, as there is no obvious
+        // marker to close the begin/end pattern with.
         else {
-            ; // TODO (part of future support for *recursive* multi-line units)
+            Decomposition decomposition = decompose([*group]);
+
+            // TODO: The following condition can be true (even though there
+            // has to be a `begin` delimiter) because `decompose` doesn't
+            // expand non-terminals. Consider if it should, to maybe improve
+            // accuracy.
+            if ([] == decomposition.prefix) {
+                continue;
+            }
+
+            RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
+            RegExp reEnd = regExp("(?=.)", []);
+
+            patterns = for (suffix <- decomposition.suffixes) {
+                if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
+                    if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
+                        // If the suffix has both a `begin` delimiter
+                        // and an `end` delimiter, then generate a
+                        // begin/end pattern to highlight these delimiters
+                        // and all content in between.
+
+                        set[Segment] segs = getSegments(rsc, suffix);
+                        segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};
+
+                        append toTmRule(
+                            toRegExp(rsc, [begin], {t}),
+                            toRegExp(rsc, [end], {t}),
+                            [toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
+                    }
+
+                    else {
+                        // If the suffix has a `begin` delimiter, but not
+                        // an `end` delimiter, then generate a match pattern
+                        // just to highlight that `begin` delimiter. Ignore
+                        // the remainder of the suffix (because it's
+                        // recursive, so no regular expression can be
+                        // generated for it).
+                        append toTmRule(toRegExp(rsc, [begin], {t}));
+                    }
+                }
+
+                else {
+                    // If the suffix doesn't have a `begin` delimiter, then
+                    // ignore it (because it's recursive, so no regular
+                    // expression can be generated for it).
+                    ;
+                }
+            }
+
+            TmRule r = toTmRule(reBegin, reEnd, patterns);
+            r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
+            r = r[applyEndPatternLast = true];
+
+            rules = insertIn(rules, (u: r | u <- group));
+
+            // TODO: The current approach produces "partially"
+            // newline-sensitive rules, in the sense that newlines are
+            // accepted between the prefix and the suffixes, but not between
+            // symbols in the prefix. This approach could be improved to
+            // produce "totally" newline-sensitive rules (at the cost of
+            // much more complicated rule generation and generated rules) by
+            // adopting an approach in which the rules for each symbol in
+            // the prefix look something like the following three:
+            //
+            // ```
+            // "foo": {
+            //   "name": "foo",
+            //   "begin": "(\\@)",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
+            //   "contentName": "comment",
+            //   "beginCaptures": { "1": { "name": "comment" } }
+            // },
+            // "foo.$": {
+            //   "begin": "$",
+            //   "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.$",
+            //   "patterns": [ { "include": "#foo.^" }]
+            // },
+            // "foo.^": {
+            //   "begin": "^",
+            //   "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
+            //   "name": "foo.^",
+            //   "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
+            // }
+            // ```
+            //
+            // Note: This alternative approach would likely render the
+            // present distinction between the "simple case" and the
+            // "complex case" unneeded, so in that sense, rule generation
+            // would actually become simpler.
         }
     }
 }
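
The pivotal trick in the complex case is the end pattern `(?=.)`: a zero-width lookahead that succeeds before any character. Together with `applyEndPatternLast`, the generated begin/end rule stays open exactly as long as one of its inner patterns keeps consuming input, and closes on the first character none of them match. A hedged sketch of the rule this produces for a hypothetical production `syntax Tag = "@" Name Body;` whose `Body` is recursive and opens with `{`:

```
// Illustrative only: assume `decompose` split the production into the
// regular prefix ["@", Name] and the recursive suffix [Body]
RegExp reBegin = toRegExp(rsc, [lit("@"), lex("Name")], {t}); // consumes the prefix
RegExp reEnd = regExp("(?=.)", []); // zero-width: closes once no inner pattern applies

// Only the `begin` delimiter of the recursive suffix is highlighted;
// the rest of `Body` has no regular approximation
TmRule r = toTmRule(reBegin, reEnd, [toTmRule(toRegExp(rsc, [lit("{")], {t}))]);
r = r[applyEndPatternLast = true];
```
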
@@ -302,10 +386,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
     if (seg.final, _ <- symbols, symbols[-1] in ends) {
         symbols = symbols[..-1];
     }
-
     return seg[symbols = symbols];
 }
 
+private list[Symbol] toTerminals(set[Segment] segs) {
+    list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
+    terminals = [s | s <- terminals, [] != s.symbols];
+    terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
+    terminals = dup(terminals);
+    terminals = sortByMinimumLength(terminals); // Small symbols first
+    terminals = reverse(terminals); // Large symbols first
+    terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
+    return terminals;
+}
+
 // TODO: This function could be moved to a separate, generic module
 private list[&T] dupLast(list[&T] l)
     = reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
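
The ordering in `toTerminals` matters because the tokenization engine tries the generated match patterns in the order listed: longer symbols must precede shorter ones, or `<=` would be tokenized as `<` followed by a stray `=`. A small illustration with hypothetical symbols:

```
list[Symbol] terminals = [lit("<"), lit("<=")];
terminals = sortByMinimumLength(terminals); // [lit("<"), lit("<=")], small first
terminals = reverse(terminals);             // [lit("<="), lit("<")], large first
terminals = terminals + \char-class([range(1, 0x10FFFF)]); // any char, as a fallback
```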

rascal-textmate-core/src/main/rascal/lang/textmate/ConversionTests.rsc

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual
         println();
         for (i <- [0..size(actual)]) {
             ConversionUnit u = actual[i];
-            println(" unit(rsc, <toStr(u.prod)>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
+            println(" unit(rsc, <toStr(u.prod)>, <u.recursive>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
         }
         println();
     }
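
Accordingly, test fixtures printed by `doAnalyzeTest` now carry the new `recursive` flag in the third position, before `multiLine`. A fixture line now looks like this (hypothetical values):

```
unit(rsc, p, false, true, <nothing(), nothing()>, <just(lit("\"")), just(lit("\""))>)
//           |      |
//           |      `multiLine`
//           `recursive` (new in this commit)
```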
