Skip to content

Identify newline separated segments #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 8 additions & 12 deletions rascal-textmate-core/src/main/rascal/lang/rascal/grammar/Util.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,6 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
    Collects the terminal symbols of production `p`. When `recur` is `true`
    (the default), the terminals of every production found via `lookup` for the
    symbols of `p` are collected as well.
}

set[Symbol] getTerminals(Grammar g, Production p, bool recur = true) {
    // Symbols of `p` itself that are not non-terminals
    set[Symbol] direct = {s | s <- p.symbols, !isNonTerminalType(s)};
    if (!recur) {
        return direct;
    }

    // Terminals of each production looked up for any symbol of `p`; the nested
    // calls use the default value of `recur`
    set[Symbol] nested = {*getTerminals(g, child) | s <- p.symbols, child <- lookup(g, s)};
    return direct + nested;
}

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
Expand Down Expand Up @@ -84,21 +75,26 @@ Symbol expand(\iter-star-seps(symbol, separators))
Removes the label from symbol `s`, if any
}

Symbol delabel(label(_, Symbol s)) = s;
default Symbol delabel(Symbol s) = s;
Symbol delabel(\label(_, Symbol s)) = delabel(s);
default Symbol delabel(Symbol s) = s;

@synopsis{
Removes operators `?` and `*` from symbol `s`, if any
}

Symbol destar(label(name, symbol))
Symbol destar(\label(name, symbol))
= label(name, destar(symbol));

// `?`: drop the operator and continue with the operand
Symbol destar(\opt(Symbol operand)) {
    return destar(operand);
}

// `*` (without/with separators): replace by the corresponding `+` form
Symbol destar(\iter-star(Symbol operand)) {
    return \iter(destar(operand));
}
Symbol destar(\iter-star-seps(Symbol operand, list[Symbol] seps)) {
    return \iter-seps(destar(operand), seps);
}

// Sequences/alternatives with exactly one element: continue with that element
Symbol destar(\seq([Symbol only])) {
    return \seq([destar(only)]);
}
Symbol destar(\alt({Symbol only})) {
    return \alt({destar(only)});
}

// Any other symbol is returned unchanged
default Symbol destar(Symbol s) {
    return s;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,153 @@ module lang::rascal::grammar::analyze::Newlines
import Grammar;
import ParseTree;
import String;
import util::Maybe;

import lang::rascal::grammar::Util;
import util::MaybeUtil;

@synopsis{
Representation of a *newline-free* segment of symbols. A segment is
*initial* when it occurs first in a production/list of symbols; it is
*final* when it occurs last.
}

data Segment = segment(
// The symbols that make up the segment
list[Symbol] symbols,
// Whether the segment occurs first (keyword field; `false` unless set)
bool initial = false,
// Whether the segment occurs last (keyword field; `false` unless set)
bool final = false);

@synopsis{
    Computes the (newline-free) segments of a production, or of a list of
    symbols, in grammar `g`. Symbols that have a newline separate the segments
    and belong to none of them; non-terminals are analyzed recursively. For
    example, the segments of `[lit("foo"), lit("bar"), lit("\n"), lit("baz")]`
    are:
      - `[lit("foo"), lit("bar")]`;
      - `[lit("baz")]`.
}

set[Segment] getSegments(Grammar g, Production p) {
    map[Production, Maybe[set[Segment]]] byProduction = getSegmentsByProduction(g);
    return unmaybe(byProduction[p]);
}

set[Segment] getSegments(Grammar g, list[Symbol] symbols)
    = unmaybe(getSegmentsWithEnvironment(g, symbols, getSegmentsByProduction(g)));

@memo
private map[Production, Maybe[set[Segment]]] getSegmentsByProduction(Grammar g) {
    // Start with every production that occurs in `g` mapped to `nothing()`
    map[Production, Maybe[set[Segment]]] segmentsOf = (p : nothing() | /p: prod(_, _, _) := g);

    // Repeat until the map no longer changes; only entries that are still
    // `nothing()` are (re)computed, using the current map as environment
    solve (segmentsOf) {
        for (p <- segmentsOf) {
            if (segmentsOf[p] == nothing()) {
                segmentsOf[p] = getSegmentsWithEnvironment(g, p.symbols, segmentsOf);
            }
        }
    }

    return segmentsOf;
}

// Computes the segments of `symbols` in grammar `g`. For non-terminals that
// occur in `symbols`, the segments of their productions are not computed here
// but looked up in `env`; a production whose entry in `env` is not (yet) a
// `just` contributes `nothing()` to the union that forms the result.
private Maybe[set[Segment]] getSegmentsWithEnvironment(
Grammar g, list[Symbol] symbols,
map[Production, Maybe[set[Segment]]] env) {

// General idea: Recursively traverse `symbols` from left to right, while
// keeping track of a "running segment" (initially empty). Each time a
// symbol that has a newline is encountered, finish/collect the running
// segment, and start a new one for the remainder of `symbols`.

// Base case: No symbols remaining
// A running segment without symbols yields no segment at all; otherwise it is
// collected with its `final` field set to the `final` keyword argument.
Maybe[set[Segment]] get(Segment running, [], bool final = true) {
return just(_ <- running.symbols ? {running[final = final]} : {});
}

// Recursive case: At least one symbol remaining
Maybe[set[Segment]] get(Segment running, [Symbol head, *Symbol tail]) {
// Every `Symbol` value found by a deep match on the head
set[Symbol] nested = {s | /Symbol s := head};

// The running segment as collected so far (via the base case); it is marked
// final only if no symbols follow the head
Maybe[set[Segment]] finished = get(running, [], final = tail == []);

// If the head contains a non-terminal, then: (1) finish the running
// segment; (2) lookup the segments of the non-terminals in the
// environment, if any; (3) compute the segments of the tail. Return the
// union of 1-3.
if (any(s <- nested, isNonTerminalType(s))) {
list[Maybe[set[Segment]]] sets = [];

// (1)
sets += finished;

// (2)
sets += for (s <- nested, isNonTerminalType(s), p <- lookup(g, s)) {

// A looked-up segment remains initial only if it is initial in its own
// production and the running segment here is initial and still empty; it
// remains final only if it is final in its own production and no symbols
// follow the head.
bool isInitial(Segment seg)
= seg.initial && running.initial && running.symbols == [];
bool isFinal(Segment seg)
= seg.final && tail == [];
Segment update(Segment seg)
= seg[initial = isInitial(seg)][final = isFinal(seg)];

append just(segs) := env[p] ? just({update(seg) | seg <- segs}) : nothing();
}

// (3)
// The tail starts with a fresh, empty running segment (not marked initial)
sets += get(segment([]), tail);

// Return union
// NOTE(review): `union` is not defined in this module (presumably from
// `util::MaybeUtil`); confirm how it treats `nothing()` operands.
return (sets[0] | union(it, \set) | \set <- sets[1..]);
}

// If the head doesn't contain a non-terminal, but it has a newline,
// then: (1) finish the running segment; (2) compute the segments of the
// tail. Return the union of 1-2. Note: the head, as it has a newline,
// is ignored and won't be part of any segment.
else if (any(s <- nested, hasNewline(g, s))) {
return union(finished, get(segment([]), tail));
}

// If the head doesn't contain a non-terminal, and if it doesn't have a
// newline, then add the head to the running segment and proceed with
// the tail.
else {
Segment old = running;
Segment new = old[symbols = old.symbols + head];
return get(new, tail);
}
}

// Traversal starts with an empty running segment that is marked initial
return get(segment([], initial = true), symbols);
}

@synopsis{
    Checks if symbol `s` has a newline character, i.e., if any production
    looked up in grammar `g` for `s` (without its label) has one
}

bool hasNewline(Grammar g, Symbol s)
    = any(p <- lookup(g, delabel(s)), hasNewline(g, p));

@synopsis{
Checks if a production has a newline character
}

bool hasNewline(Grammar g, prod(_, symbols, _)) {
set[Symbol] nonTerminals = {s | /Symbol s := symbols, isNonTerminalType(s)};
return any(/r: range(_, _) := symbols, hasNewline(r)) ||
any(s <- nonTerminals, Production p <- lookup(g, s), hasNewline(g, p));
bool hasNewline(Grammar g, Production p) {
return hasNewlineByProduction(g)[p];
}

@memo
private map[Production, bool] hasNewlineByProduction(Grammar g) {
    // Initially, no production that occurs in `g` is known to have a newline
    map[Production, bool] known = (p: false | /p: prod(_, _, _) := g);

    // Repeat until the map no longer changes; only productions that are still
    // `false` are reconsidered
    solve (known) {
        for (p <- known, !known[p]) {
            // Directly: some character range in the symbols of `p` has a newline
            bool found = known[p] || any(/r: range(_, _) := p.symbols, hasNewline(r));

            // Indirectly: some production of a non-terminal in the symbols of
            // `p` is already known to have a newline (checked only if needed)
            if (!found) {
                set[Symbol] nonTerminals = {s | /Symbol s := p.symbols, isNonTerminalType(s)};
                found = any(s <- nonTerminals, Production child <- lookup(g, s), known[child]);
            }

            known[p] = found;
        }
    }

    return known;
}

@synopsis{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ module lang::rascal::grammar::analyze::Symbols

import Grammar;
import ParseTree;
import String;
import util::Math;
import util::Maybe;

import lang::rascal::grammar::Util;
import util::MaybeUtil;

@synopsis{
Representation of a traversal direction along a list of symbols
Expand Down Expand Up @@ -55,9 +58,9 @@ private map[Symbol, Maybe[set[Symbol]]] firstBySymbol(Grammar g, bool(Symbol) pr

Maybe[set[Symbol]] firstOf([])
= just({});
Maybe[set[Symbol]] firstOf([h, *t])
Maybe[set[Symbol]] firstOf([Symbol h, *Symbol t])
= \set: just({\empty(), *_}) := ret[delabel(h)]
? union(\set, firstOf(t))
? util::MaybeUtil::union(\set, firstOf(t))
: ret[delabel(h)];

solve (ret) {
Expand Down Expand Up @@ -112,19 +115,61 @@ private map[Symbol, Maybe[set[Symbol]]] followBySymbol(Grammar g, bool(Symbol) p
return ret;
}

// Unwraps a `Maybe` of a set of symbols; `nothing()` yields the empty set
private set[Symbol] unmaybe(just(set[Symbol] symbols)) {
    return symbols;
}
private set[Symbol] unmaybe(nothing()) {
    return {};
}

// Unions two `Maybe` sets of symbols; the result is `nothing()` unless both
// arguments are a `just`
private Maybe[set[Symbol]] union(just(set[Symbol] lhs), just(set[Symbol] rhs)) {
    return just(lhs + rhs);
}
private default Maybe[set[Symbol]] union(Maybe[set[Symbol]] _, Maybe[set[Symbol]] _) {
    return nothing();
}

@synopsis{
Checks if symbol `s` is a terminal
}

bool isTerminal(Symbol s)
= !isNonTerminalType(s);
= !isNonTerminalType(s);

@synopsis{
    Sorts list of terminals `symbols` by minimum length (in ascending order),
    where the minimum length of a symbol is the `min` component of the range
    computed by `length`
}

list[Symbol] sortByMinimumLength(list[Symbol] symbols) {
    // Strict "less than" on the minimum length of the text of each symbol
    bool less(Symbol s1, Symbol s2) = length(s1).min < length(s2).min;
    return sort(symbols, less);
}

@synopsis{
    A pair of the minimum length and the maximum length of the text that a
    symbol produces. A `max` of `nothing()` means that the length of the text
    is statically unbounded.
}

alias Range = tuple[int min, Maybe[int] max];

// Range of text of length zero
private Range ZERO = <0, just(0)>;

// Range of `r1` followed by `r2`: minimums and maximums are added
private Range seq(Range r1, Range r2) {
    int lower = r1.min + r2.min;
    Maybe[int] upper = add(r1.max, r2.max);
    return <lower, upper>;
}

// Range of a choice between `r1` and `r2`: smallest minimum, largest maximum
private Range alt(Range r1, Range r2) {
    int lower = min(r1.min, r2.min);
    Maybe[int] upper = max(r1.max, r2.max);
    return <lower, upper>;
}

// Sum of two maximums; `nothing()` unless both are a `just`
private Maybe[int] add(just(int lhs), just(int rhs)) {
    return just(lhs + rhs);
}
private default Maybe[int] add(Maybe[int] _, Maybe[int] _) {
    return nothing();
}

// Largest of two maximums; `nothing()` unless both are a `just`
private Maybe[int] max(just(int lhs), just(int rhs)) {
    return just(max(lhs, rhs));
}
private default Maybe[int] max(Maybe[int] _, Maybe[int] _) {
    return nothing();
}

@synopsis{
    Computes, as a range, the length of the text produced by a terminal symbol
}

// Literals: minimum and maximum are both the size of the literal's string
Range length(\lit(str string)) {
    int n = size(string);
    return <n, just(n)>;
}
Range length(\cilit(str string)) {
    int n = size(string);
    return <n, just(n)>;
}

// Character classes: always length one
Range length(\char-class(_)) {
    return <1, just(1)>;
}

Range length(\empty()) {
    return ZERO;
}

// `?`: as the operand, but with minimum zero
Range length(\opt(Symbol symbol)) {
    return length(symbol)[min = 0];
}

// `+` (without/with separators): as the operand, but without maximum
Range length(\iter(Symbol symbol)) {
    return length(symbol)[max = issue2007];
}
Range length(\iter-seps(Symbol symbol, _)) {
    return length(symbol)[max = issue2007];
}

// `*` (without/with separators): minimum zero; the maximum is kept only if
// the maximum of the operand is zero, and is absent otherwise
Range length(\iter-star(Symbol symbol)) {
    Maybe[int] upper = length(symbol).max;
    return <0, just(0) := upper ? upper : nothing()>;
}
Range length(\iter-star-seps(Symbol symbol, _)) {
    Maybe[int] upper = length(symbol).max;
    return <0, just(0) := upper ? upper : nothing()>;
}

// Alternatives: combine the ranges of all alternatives with `alt`; no
// alternatives at all yields `ZERO`
Range length(\alt(set[Symbol] alternatives)) {
    if ({Symbol first, *Symbol rest} := alternatives) {
        return (length(first) | alt(it, length(s)) | s <- rest);
    }
    return ZERO;
}

// Sequences: combine the ranges of all symbols with `seq`, starting at `ZERO`
Range length(\seq(list[Symbol] symbols)) {
    return (ZERO | seq(it, length(s)) | s <- symbols);
}

// Conditionals: the conditions are ignored
Range length(\conditional(Symbol symbol, _)) {
    return length(symbol);
}

// TODO: Remove this workaround when issue #2007 is fixed:
// - https://github.com/usethesource/rascal/issues/2007
private Maybe[int] issue2007 = nothing();
26 changes: 22 additions & 4 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import lang::rascal::grammar::Util;
import lang::rascal::grammar::analyze::Delimiters;
import lang::rascal::grammar::analyze::Dependencies;
import lang::rascal::grammar::analyze::Newlines;
import lang::rascal::grammar::analyze::Symbols;
import lang::textmate::ConversionConstants;
import lang::textmate::ConversionUnit;
import lang::textmate::Grammar;
Expand Down Expand Up @@ -215,13 +216,18 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {

// Compute a list of terminals that need to be consumed between
// Compute a list of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// terminals will be converted to a match pattern.
list[Symbol] terminals = [*getTerminals(rsc, u.prod) | u <- group];
terminals = [s | s <- terminals, s notin begins && s notin ends];
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};

list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)

TmRule r = toTmRule(
Expand Down Expand Up @@ -288,6 +294,18 @@ private list[ConversionUnit] addOuterRules(list[ConversionUnit] units) {
// precision than a unit-driven approach; I suspect it might.
}

// Removes the first symbol of `seg` if `seg` is initial and that symbol is in
// `begins`; subsequently removes the last remaining symbol if `seg` is final
// and that symbol is in `ends`
private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends) {
    list[Symbol] remaining = seg.symbols;

    if (seg.initial, [Symbol opening, *Symbol rest] := remaining, opening in begins) {
        remaining = rest;
    }
    if (seg.final, [*Symbol rest, Symbol closing] := remaining, closing in ends) {
        remaining = rest;
    }

    return seg[symbols = remaining];
}

// TODO: This function could be moved to a separate, generic module
// Removes duplicate elements from `l` by applying `dup` to the reverse of `l`
// and reversing the result again
private list[&T] dupLast(list[&T] l) {
    // TODO: Optimize/avoid `reverse`-ing?
    return reverse(
        dup(
            reverse(l)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ syntax Expression

lexical Id = ([a-z][a-z0-9]*) !>> [a-z0-9] \ Keyword;
lexical Natural = [0-9]+ !>> [0-9];
lexical String = "\"" ![\"]* "\"";
lexical String = "\"" Char* "\"";

lexical Char
= ![\\\"]
| "\\" [\\\"];

keyword Keyword
= "begin"
Expand All @@ -70,7 +74,7 @@ lexical WhitespaceAndComment
Grammar rsc = preprocess(grammar(#Program));

list[ConversionUnit] units = [
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":=")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit("-"),lit(","),lit(")"),lit("("),lit("+"),lit("||"),lit(":="),lit("\\")})],{}), false, <nothing(),nothing()>, <nothing(),nothing()>),
unit(rsc, prod(label("natural",sort("Type")),[lit("natural")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
unit(rsc, prod(label("nil",sort("Type")),[lit("nil-type")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <just(lit("nil-type")),just(lit("nil-type"))>),
unit(rsc, prod(label("string",sort("Type")),[lit("string")],{\tag("category"("storage.type"))}), false, <just(lit(":")),just(lit(";"))>, <nothing(),nothing()>),
Expand Down
Loading