Skip to content

Commit 17df6e6

Browse files
authored
Merge pull request #15 from SWAT-engineering/identify-newline-separated-segments
Identify newline separated segments
2 parents c7ed45e + 27c26ef commit 17df6e6

File tree

9 files changed

+417
-46
lines changed

9 files changed

+417
-46
lines changed

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
3131
return false;
3232
}
3333

34-
@synopsis{
35-
Gets the terminals that occur in production `p`, possibly recursively
36-
(default: `true`)
37-
}
38-
39-
set[Symbol] getTerminals(Grammar g, Production p, bool recur = true)
40-
= {s | s <- p.symbols, !isNonTerminalType(s)}
41-
+ {*getTerminals(g, child) | recur, s <- p.symbols, child <- lookup(g, s)};
42-
4334
@synopsis{
4435
Looks up the list of productions for symbol `s` in grammar `g`, replacing
4536
formal parameters with actual parameters when needed
@@ -84,21 +75,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
8475
Removes the label from symbol `s`, if any
8576
}
8677
87-
Symbol delabel(label(_, Symbol s)) = s;
88-
default Symbol delabel(Symbol s) = s;
78+
// Strips the label from a labeled symbol and recurses on the result, so a
// label wrapped directly around another label is removed as well.
Symbol delabel(\label(_, Symbol s)) = delabel(s);
79+
// Fallback: a symbol that is not a `label` is returned unchanged.
default Symbol delabel(Symbol s) = s;
8980
9081
@synopsis{
9182
Removes operator `?` from symbol `s` and replaces operator `*` with `+`, if any
9283
}
9384
94-
Symbol destar(label(name, symbol))
85+
// Labeled symbol: keep the label, rewrite the symbol underneath it.
Symbol destar(\label(name, symbol))
9586
= label(name, destar(symbol));
87+
9688
// Optional symbol: drop the `?` and continue with the underlying symbol.
Symbol destar(\opt(symbol))
9789
= destar(symbol);
9890
// `*` iteration becomes `+` iteration of the rewritten element symbol.
Symbol destar(\iter-star(symbol))
9991
= \iter(destar(symbol));
10092
// Same as above for separated lists; the separators are passed on unchanged.
Symbol destar(\iter-star-seps(symbol, separators))
10193
= \iter-seps(destar(symbol), separators);
94+
// Sequence holding exactly one symbol: rewrite that symbol.
// NOTE(review): the pattern `[symbol]` matches single-element lists only;
// longer sequences fall through to the default clause and are returned
// unchanged -- confirm this is intended.
Symbol destar(\seq([symbol]))
95+
= \seq([destar(symbol)]);
96+
// Alternative holding exactly one symbol: rewrite that symbol.
// NOTE(review): the pattern `{symbol}` matches singleton sets only; larger
// sets of alternatives fall through to the default clause -- confirm.
Symbol destar(\alt({symbol}))
97+
= \alt({destar(symbol)});
10298
10399
// Fallback: any symbol not matched by a clause above is returned unchanged.
default Symbol destar(Symbol s) = s;
104100

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Newlines.rsc

Lines changed: 140 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,153 @@ module lang::rascal::grammar::analyze::Newlines
77
import Grammar;
88
import ParseTree;
99
import String;
10+
import util::Maybe;
1011

1112
import lang::rascal::grammar::Util;
13+
import util::MaybeUtil;
14+
15+
@synopsis{
16+
Representation of a *newline-free* segment of symbols. A segment is
17+
*initial* when it occurs first in a production/list of symbols; it is
18+
*final* when it occurs last.
19+
}
20+
21+
// `symbols` is the positional field holding the symbols of the segment;
// `initial` and `final` are keyword fields that both default to `false`.
data Segment = segment(
22+
list[Symbol] symbols,
23+
bool initial = false,
24+
bool final = false);
25+
26+
@synopsis{
27+
Gets the (newline-free) segments of a production/list of symbols in grammar
28+
`g`, separated by symbols that have a newline (not part of any segment),
29+
recursively for non-terminals. For instance, the segments of
30+
`[lit("foo"), lit("bar"), lit("\n"), lit("baz")]` are:
31+
- `[lit("foo"), lit("bar")]`;
32+
- `[lit("baz")]`.
33+
}
34+
35+
// Production variant: reads the entry for `p` from the memoized table that
// `getSegmentsByProduction` computes for `g`, and unwraps it with `unmaybe`.
// The table is keyed by the productions found inside `g`.
// NOTE(review): `unmaybe` comes from util::MaybeUtil (not visible here);
// presumably it maps `nothing()` to the empty set -- confirm.
set[Segment] getSegments(Grammar g, Production p) {
36+
return unmaybe(getSegmentsByProduction(g)[p]);
37+
}
38+
39+
// List variant: computes the segments of an arbitrary list of symbols, using
// the per-production table of `g` as the environment for nested non-terminals.
set[Segment] getSegments(Grammar g, list[Symbol] symbols) {
40+
map[Production, Maybe[set[Segment]]] env = getSegmentsByProduction(g);
41+
return unmaybe(getSegmentsWithEnvironment(g, symbols, env));
42+
}
43+
44+
// Computes, for every production found inside `g`, its segments (or
// `nothing()` if they could not be determined). Annotated with `@memo`.
@memo
45+
private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
46+
// Start with `nothing()` for every `prod` found by a deep match on `g`
map[Production, Maybe[set[Segment]]] ret = (p : nothing() | /p: prod(_, _, _) := g);
47+
48+
// Repeat until `ret` no longer changes: only entries that are still
// `nothing()` are recomputed, with the current `ret` as the environment
solve (ret) {
49+
for (p <- ret, nothing() == ret[p]) {
50+
ret[p] = getSegmentsWithEnvironment(g, p.symbols, ret);
51+
}
52+
}
53+
54+
return ret;
55+
}
56+
57+
// Computes the segments of `symbols` in grammar `g`. Segments of productions
// of nested non-terminals are not computed here but read from `env`. The
// traversal starts with an empty running segment that is marked `initial`.
// NOTE(review): the result is combined with `union` from util::MaybeUtil (not
// visible here); presumably it yields `nothing()` as soon as one operand is
// `nothing()` -- confirm.
private Maybe[set[Segment]] getSegmentsWithEnvironment(
58+
Grammar g, list[Symbol] symbols,
59+
map[Production, Maybe[set[Segment]]] env) {
60+
61+
// General idea: Recursively traverse `symbols` from left to right, while
62+
// keeping track of a "running segment" (initially empty). Each time a
63+
// symbol that has a newline is encountered, finish/collect the running
64+
// segment, and start a new one for the remainder of `symbols`.
65+
66+
// Base case: No symbols remaining
// (an empty running segment yields no segment at all; a non-empty one is
// returned with its `final` field set to the `final` keyword parameter)
67+
Maybe[set[Segment]] get(Segment running, [], bool final = true) {
68+
return just(_ <- running.symbols ? {running[final = final]} : {});
69+
}
70+
71+
// Recursive case: At least one symbol remaining
72+
Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) {
73+
// Every `Symbol` found by a deep match on `head`
set[Symbol] nested = {s | /Symbol s := head};
74+
75+
// The running segment, closed off before `head`; it is marked final only
// when no symbols remain after `head`
Maybe[set[Segment]] finished = get(running, [], final = tail == []);
76+
77+
// If the head contains a non-terminal, then: (1) finish the running
78+
// segment; (2) lookup the segments of the non-terminals in the
79+
// environment, if any; (3) compute the segments of the tail. Return the
80+
// union of 1-3.
81+
if (any(s <- nested, isNonTerminalType(s))) {
82+
list[Maybe[set[Segment]]] sets = [];
83+
84+
// (1)
85+
sets += finished;
86+
87+
// (2)
88+
sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {
89+
90+
// A nested segment stays initial only if nothing precedes `head`
// in an initial running segment
bool isInitial(Segment seg)
91+
= seg.initial && running.initial && running.symbols == [];
92+
// A nested segment stays final only if nothing follows `head`
bool isFinal(Segment seg)
93+
= seg.final && tail == [];
94+
Segment update(Segment seg)
95+
= seg[initial = isInitial(seg)][final = isFinal(seg)];
96+
97+
// A production whose entry in `env` is not (yet) a `just` contributes
// `nothing()`
append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
98+
}
99+
100+
// (3)
101+
sets += get(segment([]), tail);
102+
103+
// Return union
104+
return (sets[0] | union(it, \set) | \set <- sets[1..]);
105+
}
106+
107+
// If the head doesn't contain a non-terminal, but it has a newline,
108+
// then: (1) finish the running segment; (2) compute the segments of the
109+
// tail. Return the union of 1-2. Note: the head, as it has a newline,
110+
// is ignored and won't be part of any segment.
111+
else if (any(s <- nested, hasNewline(g, s))) {
112+
return union(finished, get(segment([]), tail));
113+
}
114+
115+
// If the head doesn't contain a non-terminal, and if it doesn't have a
116+
// newline, then add the head to the running segment and proceed with
117+
// the tail.
118+
else {
119+
Segment old = running;
120+
Segment new = old[symbols = old.symbols + head];
121+
return get(new, tail);
122+
}
123+
}
124+
125+
return get(segment([], initial = true), symbols);
126+
}
127+
128+
@synopsis{
129+
Checks if a symbol has a newline character
130+
}
131+
132+
// True iff at least one of the productions that `lookup` returns for the
// delabeled symbol has a newline; false when `lookup` returns no productions.
bool hasNewline(Grammar g, Symbol s) {
133+
return any(p <- lookup(g, delabel(s)), hasNewline(g, p));
134+
}
12135
13136
@synopsis{
14137
Checks if a production has a newline character
15138
}
16139
17-
bool hasNewline(Grammar g, prod(_, symbols, _)) {
18-
set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
19-
return any(/r: range(_, _) := symbols, hasNewline(r)) ||
20-
any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
140+
// Reads the answer for `p` from the memoized per-grammar table computed by
// `hasNewlineByProduction`; the table is keyed by the productions found
// inside `g`.
bool hasNewline(Grammar g, Production p) {
141+
return hasNewlineByProduction(g)[p];
142+
}
143+
144+
// Computes, for every production found inside `g`, whether it has a newline
// character, either directly or via a nested non-terminal. Annotated with
// `@memo`.
@memo
145+
private map[Production, bool] hasNewlineByProduction(Grammar g) {
146+
// Start with `false` for every `prod` found by a deep match on `g`
map[Production, bool] ret = (p: false | /p: prod(_, _, _) := g);
147+
148+
// Repeat until `ret` no longer changes; entries only ever flip from `false`
// to `true`, and only entries that are still `false` are revisited
solve (ret) {
149+
for (p <- ret, !ret[p]) {
150+
set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
151+
// `p` has a newline if one of its character ranges has one, or if a
// production of a nested non-terminal is already marked in `ret`.
// NOTE(review): `ret[child]` assumes every production returned by
// `lookup` is a key of `ret` -- confirm for productions that `lookup`
// instantiates with actual parameters.
ret[p] = ret[p] || any(/r: range(_, _) := p.symbols, hasNewline(r))
152+
|| any(s <- nonTerminals, Production child <- lookup(g, s), ret[child]);
153+
}
154+
}
155+
156+
return ret;
21157
}
22158
23159
@synopsis{

rascal-textmate-core/src/main/rascal/lang/rascal/grammar/analyze/Symbols.rsc

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ module lang::rascal::grammar::analyze::Symbols
1717

1818
import Grammar;
1919
import ParseTree;
20+
import String;
21+
import util::Math;
2022
import util::Maybe;
2123

2224
import lang::rascal::grammar::Util;
25+
import util::MaybeUtil;
2326

2427
@synopsis{
2528
Representation of a traversal direction along a list of symbols
@@ -55,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr
5558
5659
Maybe[set[Symbol]] firstOf([])
5760
= just({});
58-
Maybe[set[Symbol]] firstOf([h, *t])
61+
Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t])
5962
= \set: just({\empty(), *_}) := ret[delabel(h)]
60-
? union(\set, firstOf(t))
63+
? util::MaybeUtil::union(\set, firstOf(t))
6164
: ret[delabel(h)];
6265
6366
solve (ret) {
@@ -112,19 +115,61 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
112115
return ret;
113116
}
114117
115-
private set[Symbol] unmaybe(just(set[Symbol] \set))
116-
= \set;
117-
private set[Symbol] unmaybe(nothing())
118-
= {};
119-
120-
private Maybe[set[Symbol]] union(just(set[Symbol] \set1), just(set[Symbol] \set2))
121-
= just(\set1 + \set2);
122-
private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _)
123-
= nothing();
124-
125118
@synopsis{
126119
Checks if symbol `s` is a terminal
127120
}
128121
129122
bool isTerminal(Symbol s)
130-
= !isNonTerminalType(s);
123+
= !isNonTerminalType(s);
124+
125+
@synopsis{
126+
Sorts list of terminals `symbols` by minimum length (in ascending order)
127+
}
128+
129+
// Orders `symbols` with `sort`, using a comparison on the `min` component of
// the range that `length` computes for each symbol.
list[Symbol] sortByMinimumLength(list[Symbol] symbols) {
130+
// Strict "shorter than" on minimum lengths; maximum lengths are not compared
bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min;
131+
return sort(symbols, less);
132+
}
133+
134+
@synopsis{
135+
Representation of the minimum length and the maximum length of the text
136+
produced by a symbol. If `max` is `nothing()`, then the text produced is
137+
statically unbounded.
138+
}
139+
140+
alias Range = tuple[int min, Maybe[int] max];
141+
142+
// Range of the empty text: minimum 0, maximum 0
private Range ZERO = <0, just(0)>;
143+
// Range of `r1` followed by `r2`: minimums and maximums are added
private Range seq(Range r1, Range r2) = <r1.min + r2.min, add(r1.max, r2.max)>;
144+
// Range of a choice between `r1` and `r2`: smallest minimum, largest maximum
private Range alt(Range r1, Range r2) = <min(r1.min, r2.min), max(r1.max, r2.max)>;
145+
146+
// Sum of two maximums; `nothing()` unless both are `just`
private Maybe[int] add(just(int i), just(int j)) = just(i + j);
147+
private default Maybe[int] add(Maybe[int] _, Maybe[int] _) = nothing();
148+
149+
// Largest of two maximums; `nothing()` unless both are `just`.
// NOTE(review): the inner `max(i, j)` on plain `int`s presumably resolves to
// `max` from the imported util::Math -- confirm.
private Maybe[int] max(just(int i), just(int j)) = just(max(i, j));
150+
private default Maybe[int] max(Maybe[int] _, Maybe[int] _) = nothing();
151+
152+
@synopsis{
153+
Computes the length of a terminal symbol as a range
154+
}
155+
156+
// Literals (case-sensitive and case-insensitive): minimum and maximum are
// both the size of the literal string
Range length(\lit(string)) = <size(string), just(size(string))>;
157+
Range length(\cilit(string)) = <size(string), just(size(string))>;
158+
// Character class: minimum and maximum are both 1
Range length(\char-class(_)) = <1, just(1)>;
159+
160+
Range length(\empty()) = ZERO;
161+
// Optional: range of the underlying symbol, with the minimum set to 0
Range length(\opt(symbol)) = length(symbol)[min = 0];
162+
// One-or-more: minimum of the element symbol; maximum set to `issue2007`
// (i.e., `nothing()`)
Range length(\iter(symbol)) = length(symbol)[max = issue2007];
163+
// Zero-or-more: minimum 0; maximum is `just(0)` if that is the maximum of
// the element symbol, and `nothing()` otherwise
Range length(\iter-star(symbol)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;
164+
// Separated variants: computed as above; the separators are ignored
Range length(\iter-seps(symbol, _)) = length(symbol)[max = issue2007];
165+
Range length(\iter-star-seps(symbol, _)) = <0, max: just(0) := length(symbol).max ? max : nothing()>;
166+
// Alternatives: ranges of all alternatives folded with `alt`; an empty set
// of alternatives yields `ZERO`
Range length(\alt(alternatives)) = {Symbol first, *Symbol rest} := alternatives
167+
? (length(first) | alt(it, length(s)) | s <- rest)
168+
: ZERO;
169+
// Sequence: ranges of all symbols folded with `seq`, starting from `ZERO`
Range length(\seq(symbols)) = (ZERO | seq(it, length(s)) | s <- symbols);
170+
171+
// Conditional: range of the underlying symbol; the conditions are ignored
Range length(\conditional(symbol, _)) = length(symbol);
172+
173+
// TODO: Remove this workaround when issue #2007 is fixed:
174+
// - https://github.com/usethesource/rascal/issues/2007
175+
// Explicitly typed `nothing()`; used by the `\iter` and `\iter-seps` clauses
// of `length` as the value in their `[max = ...]` field updates.
private Maybe[int] issue2007 = nothing();

rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import lang::rascal::grammar::Util;
1616
import lang::rascal::grammar::analyze::Delimiters;
1717
import lang::rascal::grammar::analyze::Dependencies;
1818
import lang::rascal::grammar::analyze::Newlines;
19+
import lang::rascal::grammar::analyze::Symbols;
1920
import lang::textmate::ConversionConstants;
2021
import lang::textmate::ConversionUnit;
2122
import lang::textmate::Grammar;
@@ -215,13 +216,18 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
215216
// Simple case: each unit does have an `end` inner delimiter
216217
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {
217218
218-
// Compute a list of terminals that need to be consumed between
219+
// Compute a list of segments that need to be consumed between
219220
// the `begin` delimiter and the `end` delimiters. Each of these
220-
// terminals will be converted to a match pattern.
221-
list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
222-
terminals = [s | s <- terminals, s notin begins && s notin ends];
221+
// segments will be converted to a match pattern.
222+
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
223+
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};
224+
225+
list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
226+
terminals = [s | s <- terminals, [] != s.symbols];
223227
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
224228
terminals = dup(terminals);
229+
terminals = sortByMinimumLength(terminals); // Small symbols first
230+
terminals = reverse(terminals); // Large symbols first
225231
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
226232
227233
TmRule r = toTmRule(
@@ -288,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) {
288294
// precision than a unit-driven approach; I suspect it might.
289295
}
290296
297+
// Returns `seg` with its first symbol dropped if that symbol is in `begins`
// and `seg` is initial, and with its last symbol dropped if that symbol is in
// `ends` and `seg` is final. Only the `symbols` field is updated.
private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) {
298+
list[Symbol] symbols = seg.symbols;
299+
// `_ <- symbols` guards against indexing into an empty list
if (seg.initial, _ <- symbols, symbols[0] in begins) {
300+
symbols = symbols[1..];
301+
}
302+
// Checked against the (possibly already shortened) list, so it may have
// become empty after the previous step
if (seg.final, _ <- symbols, symbols[-1] in ends) {
303+
symbols = symbols[..-1];
304+
}
305+
306+
return seg[symbols = symbols];
307+
}
308+
291309
// TODO: This function could be moved to a separate, generic module
292310
// Reverses `l`, applies `dup`, and reverses the result again.
// NOTE(review): presumably `dup` keeps the first occurrence of each element,
// so that this keeps the last occurrence of each element of `l` -- confirm.
private list[&T] dupLast(list[&T] l)
293311
= reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?

rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/PicoWithCategories.rsc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,11 @@ syntax Expression
4444

4545
lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword;
4646
lexical Natural = [0-9]+ !>> [0-9];
47-
lexical String = "\"" ![\"]* "\"";
47+
lexical String = "\"" Char* "\"";
48+
49+
lexical Char
50+
= ![\\\"]
51+
| "\\" [\\\"];
4852

4953
keyword Keyword
5054
= "begin"
@@ -70,7 +74,7 @@ lexical WhitespaceAndComment
7074
Grammar rsc = preprocess(grammar(#Program));
7175

7276
list[ConversionUnit] units = [
73-
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
77+
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
7478
unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
7579
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
7680
unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),

0 commit comments

Comments
 (0)