From ddd2dc0a0464fa5830b0129bdec69200aec74c34 Mon Sep 17 00:00:00 2001 From: Jon Shea Date: Tue, 4 Mar 2025 11:03:25 -0500 Subject: [PATCH] Support for string escapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds support for string escapes, as proposed in https://github.com/tree-sitter/tree-sitter-scala/issues/207 . I also slightly changed the definition of `interpolated_string` by qualifying the open quotes with `token.immediate(…)`. Prior to this change the rule would incorrectly match `foo ""` as an `interpolated_string_expression`. I ran these changes against all of the `.scala` files in https://github.com/scala/scala3 and https://github.com/scala/scala. The files in `scala/scala3` that newly have errors are: * `tests/neg/fEscapes.scala` * `tests/neg/unicodeEscapes-interpolations.scala` * `tests/pos/multiLineOps.scala` * `tests/run/i14164.scala` The first two are tests containing examples of invalid escape sequences that are expected to fail. `multiLineOps.scala` contains a line `send_! "!"` that now parses with an error. Previously this parsed as an `interpolated_string_expression`, which is also entirely incorrect. So this error is a result of adding `token.immediate('"')` to the definition of `interpolated_string`, and I do not think the change is a regression. Similarly, `i14164.scala` contains a multi-line expression that was previously parsed incorrectly as an `interpolated_string_expression`, and now parses more correctly, though with an error. The files in `scala/scala` that newly have errors are similar: two test files with intentionally broken escape sequences, and the same `multiLineOps.scala`. 
--- grammar.js | 110 +++++++++++---- src/scanner.c | 178 ++++++++++++++++++++----- test/corpus/literals.txt | 281 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 505 insertions(+), 64 deletions(-) diff --git a/grammar.js b/grammar.js index 1b1184cf..67bb0b4e 100644 --- a/grammar.js +++ b/grammar.js @@ -32,19 +32,24 @@ module.exports = grammar({ externals: $ => [ $._automatic_semicolon, $._indent, + $._outdent, + $._simple_string_start, + $._simple_string_middle, + $._simple_multiline_string_start, $._interpolated_string_middle, - $._interpolated_string_end, $._interpolated_multiline_string_middle, - $._interpolated_multiline_string_end, - $._outdent, - $._simple_multiline_string, - $._simple_string, + $._raw_string_start, + $._raw_string_middle, + $._raw_string_multiline_middle, + $._single_line_string_end, + $._multiline_string_end, "else", "catch", "finally", "extends", "derives", "with", + $.error_sentinel, ], inline: $ => [ @@ -209,7 +214,7 @@ module.exports = grammar({ "package", field("name", $.package_identifier), // This is slightly more permissive than the EBNF in that it allows any - // kind of delcaration inside of the package blocks. As we're more + // kind of declaration inside of the package blocks. As we're more // concerned with the structure rather than the validity of the program // we'll allow it. field("body", optional($.template_body)), @@ -677,7 +682,7 @@ module.exports = grammar({ // In theory structural_type should just be added to simple_type, // but doing so increases the state of template_body to 4000 $._structural_type, - // This adds _simple_type, but not the above intentionall/y. + // This adds _simple_type, but not the above intentionally. 
seq($._simple_type, field("arguments", $.arguments)), seq($._annotated_type, field("arguments", $.arguments)), seq($.compound_type, field("arguments", $.arguments)), @@ -1540,14 +1545,14 @@ module.exports = grammar({ /** * Regex patterns created to avoid matching // comments and /* comment starts. - * This could technically match illeagal tokens such as val ?// = 1 + * This could technically match illegal tokens such as val ?// = 1 */ operator_identifier: $ => token( choice( // opchar minus colon, equal, at // Technically speaking, Sm (Math symbols https://www.compart.com/en/unicode/category/Sm) - // should be allowed as a single-characeter opchar, however, it includes `=`, + // should be allowed as a single-character opchar, however, it includes `=`, // so we should to avoid that to prevent bad parsing of `=` as infix term or type. /[\-!#%&*+\/\\<>?\u005e\u007c~\u00ac\u00b1\u00d7\u00f7\u2190-\u2194\p{So}]/, seq( @@ -1616,7 +1621,7 @@ module.exports = grammar({ choice( seq( "\\", - choice(/[^xu]/, /uu?[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/), + choice(/[^xu]/, /[uU]+[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/), ), /[^\\'\n]/, ), @@ -1625,14 +1630,13 @@ module.exports = grammar({ ), ), - interpolated_string_expression: $ => - seq(field("interpolator", $.identifier), $.interpolated_string), - - _interpolated_string_start: $ => '"', - - _interpolated_multiline_string_start: $ => '"""', + interpolated_string_expression: $ => + choice( + seq(field("interpolator", alias($._raw_string_start, $.identifier)), alias($._raw_string, $.interpolated_string)), + seq(field("interpolator", $.identifier), $.interpolated_string), + ), - _dollar_escape: $ => seq("$", choice("$", '"')), + _dollar_escape: $ => alias(token(seq("$", choice("$", '"'))), $.escape_sequence), _aliased_interpolation_identifier: $ => alias($._interpolation_identifier, $.identifier), @@ -1643,28 +1647,88 @@ module.exports = grammar({ interpolated_string: $ => choice( seq( - $._interpolated_string_start, + 
token.immediate('"'), repeat( seq( $._interpolated_string_middle, - choice($._dollar_escape, $.interpolation), + choice($._dollar_escape, $.interpolation, $.escape_sequence), ), ), - $._interpolated_string_end, + $._single_line_string_end, ), seq( - $._interpolated_multiline_string_start, + token.immediate('"""'), repeat( seq( $._interpolated_multiline_string_middle, + // Multiline strings ignore escape sequences choice($._dollar_escape, $.interpolation), ), ), - $._interpolated_multiline_string_end, + $._multiline_string_end, + ), + ), + + // We need to handle single-line raw strings separately from interpolated strings, + // because raw strings are not parsed for escape sequences. For example, raw strings + // are often used for regular expressions, which contain backslashes that would + // be invalid if parsed as escape sequences. We do not special case multiline + // raw strings, because multiline strings do not parse escape sequences anyway. + // Scala handles multiline raw strings identically to other multiline interpolated, + // so we could parse them as interpolated strings, but I think the code is cleaner + // if we maintain the distinction. + _raw_string: $ => + choice( + seq( + $._simple_string_start, + seq( + repeat( + seq( + $._raw_string_middle, + choice($._dollar_escape, $.interpolation), + ), + ), + $._single_line_string_end, + ), + ), + seq( + $._simple_multiline_string_start, + repeat( + seq( + $._raw_string_multiline_middle, + choice($._dollar_escape, $.interpolation), + ) + ), + $._multiline_string_end, ), ), - string: $ => choice($._simple_string, $._simple_multiline_string), + escape_sequence: _ => token.immediate(seq( + '\\', + choice( + /[tbnrf"'\\]/, + // The Java spec allows any number of u's and U's at the start of a unicode escape. + /[uU]+[0-9a-fA-F]{4}/, + // Octals are not allowed in Scala 3, but are allowed in Scala 2. 
tree-sitter + // does not have a mechanism for distinguishing between different versions of a + // language, so I think it makes sense to allow them. Maybe in the future we + // should move them to a `deprecated` syntax node? + /[0-3]?[0-7]{1,2}/, + ), + )), + + string: $ => choice( + seq( + $._simple_string_start, + repeat(seq($._simple_string_middle, $.escape_sequence)), + $._single_line_string_end, + ), + seq( + $._simple_multiline_string_start, + /// Multiline strings ignore escape sequences + $._multiline_string_end, + ), + ), _semicolon: $ => choice(";", $._automatic_semicolon), diff --git a/src/scanner.c b/src/scanner.c index a95e5f42..4e653ef9 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -15,19 +15,46 @@ enum TokenType { AUTOMATIC_SEMICOLON, INDENT, + OUTDENT, + SIMPLE_STRING_START, + SIMPLE_STRING_MIDDLE, + SIMPLE_MULTILINE_STRING_START, INTERPOLATED_STRING_MIDDLE, - INTERPOLATED_STRING_END, INTERPOLATED_MULTILINE_STRING_MIDDLE, - INTERPOLATED_MULTILINE_STRING_END, - OUTDENT, - SIMPLE_MULTILINE_STRING, - SIMPLE_STRING, + RAW_STRING_START, + RAW_STRING_MIDDLE, + RAW_STRING_MULTILINE_MIDDLE, + SINGLE_LINE_STRING_END, + MULTILINE_STRING_END, ELSE, CATCH, FINALLY, EXTENDS, DERIVES, WITH, + ERROR_SENTINEL +}; + +const char* token_name[] = { + "AUTOMATIC_SEMICOLON", + "INDENT", + "OUTDENT", + "SIMPLE_STRING_START", + "SIMPLE_STRING_MIDDLE", + "SIMPLE_MULTILINE_STRING_START", + "INTERPOLATED_STRING_MIDDLE", + "INTERPOLATED_MULTILINE_STRING_MIDDLE", + "RAW_STRING_MIDDLE", + "RAW_STRING_MULTILINE_MIDDLE", + "SINGLE_LINE_STRING_END", + "MULTILINE_STRING_END", + "ELSE", + "CATCH", + "FINALLY", + "EXTENDS", + "DERIVES", + "WITH", + "ERROR_SENTINEL" }; typedef struct { @@ -107,43 +134,71 @@ static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } -static bool scan_string_content(TSLexer *lexer, bool is_multiline, bool has_interpolation) { +// We enumerate 3 types of 
strings that we need to handle differently: +// 1. Simple strings, `"..."` or `"""..."""` +// 2. Interpolated strings, `s"..."` or `f"..."` or `foo"..."` or foo"""...""". +// 3. Raw strings, `raw"..."` +typedef enum { + STRING_MODE_SIMPLE, + STRING_MODE_INTERPOLATED, + STRING_MODE_RAW +} StringMode; + +static bool scan_string_content(TSLexer *lexer, bool is_multiline, StringMode string_mode) { + LOG("scan_string_content(%d, %d, %c)\n", is_multiline, string_mode, lexer->lookahead); unsigned closing_quote_count = 0; for (;;) { if (lexer->lookahead == '"') { advance(lexer); closing_quote_count++; if (!is_multiline) { - lexer->result_symbol = has_interpolation ? INTERPOLATED_STRING_END : SIMPLE_STRING; + lexer->result_symbol = SINGLE_LINE_STRING_END; + lexer->mark_end(lexer); return true; } if (closing_quote_count >= 3 && lexer->lookahead != '"') { - lexer->result_symbol = has_interpolation ? INTERPOLATED_MULTILINE_STRING_END : SIMPLE_MULTILINE_STRING; - return true; - } - } else if (lexer->lookahead == '$') { - if (is_multiline && has_interpolation) { - lexer->result_symbol = INTERPOLATED_MULTILINE_STRING_MIDDLE; + lexer->result_symbol = MULTILINE_STRING_END; + lexer->mark_end(lexer); return true; } - if (has_interpolation) { - lexer->result_symbol = INTERPOLATED_STRING_MIDDLE; - return true; + } else if (lexer->lookahead == '$' && string_mode != STRING_MODE_SIMPLE) { + switch (string_mode) { + case STRING_MODE_INTERPOLATED: + lexer->result_symbol = is_multiline ? INTERPOLATED_MULTILINE_STRING_MIDDLE : INTERPOLATED_STRING_MIDDLE; + break; + case STRING_MODE_RAW: + lexer->result_symbol = is_multiline ? 
RAW_STRING_MULTILINE_MIDDLE : RAW_STRING_MIDDLE; + break; + default: + assert(false); } - advance(lexer); + lexer->mark_end(lexer); + return true; } else { closing_quote_count = 0; if (lexer->lookahead == '\\') { - advance(lexer); - if (!lexer->eof(lexer)) { - advance(lexer); - } - } else if (lexer->lookahead == '\n') { - if (is_multiline) { + // Multiline strings ignore escape sequences + if (is_multiline || string_mode == STRING_MODE_RAW) { + // FIXME: In raw string mode, we have to jump over escaped quotes. advance(lexer); + // In single-line raw strings, `\"` is not translated to `"`, but it also does + // not close the string. Likewise, `\\` is not translated to `\`, but it does + // stop the second `\` from stopping a double-quote from closing the string. + if (!is_multiline && string_mode == STRING_MODE_RAW && + (lexer->lookahead == '"' || lexer->lookahead == '\\')) { + advance(lexer); + } } else { - return false; + lexer->result_symbol = string_mode == STRING_MODE_SIMPLE ? SIMPLE_STRING_MIDDLE : INTERPOLATED_STRING_MIDDLE; + lexer->mark_end(lexer); + return true; } + // During error recovery and dynamic precedence resolution, the external + // scanner will be invoked with all valid_symbols set to true, which means + // we will be asked to scan a string token when we are not actually in a + // string context. Here we detect these cases and return false. + } else if (lexer->lookahead == '\n' && !is_multiline) { + return false; } else if (lexer->eof(lexer)) { return false; } else { @@ -185,6 +240,24 @@ static inline void debug_indents(Scanner *scanner) { bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + #ifdef DEBUG + { + if (valid_symbols[ERROR_SENTINEL]) { + LOG("entering tree_sitter_scala_external_scanner_scan. 
ERROR_SENTINEL is valid\n"); + } else { + char debug_str[1024] = "entering tree_sitter_scala_external_scanner_scan valid symbols: "; + for (unsigned i = 0; i < ERROR_SENTINEL; i++) { + if (valid_symbols[i]) { + strcat(debug_str, token_name[i]); + strcat(debug_str, ", "); + } + } + strcat(debug_str, "\n"); + LOG("%s", debug_str); + } + } + #endif + Scanner *scanner = (Scanner *)payload; int16_t prev = scanner->indents.size > 0 ? *array_back(&scanner->indents) : -1; int16_t newline_count = 0; @@ -249,7 +322,7 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, } // This saves the indentation_size and newline_count so it can be used - // in subsequent calls for multiple outdent or autosemicolon. + // in subsequent calls for multiple outdent or auto-semicolon. if (valid_symbols[OUTDENT] && (lexer->lookahead == 0 || ( @@ -388,30 +461,69 @@ bool tree_sitter_scala_external_scanner_scan(void *payload, TSLexer *lexer, skip(lexer); } - if (valid_symbols[SIMPLE_STRING] && lexer->lookahead == '"') { + if (valid_symbols[SIMPLE_STRING_START] && lexer->lookahead == '"') { advance(lexer); + lexer->mark_end(lexer); - bool is_multiline = false; if (lexer->lookahead == '"') { advance(lexer); if (lexer->lookahead == '"') { advance(lexer); - is_multiline = true; - } else { - lexer->result_symbol = SIMPLE_STRING; + lexer->result_symbol = SIMPLE_MULTILINE_STRING_START; + lexer->mark_end(lexer); return true; } } - return scan_string_content(lexer, is_multiline, false); + lexer->result_symbol = SIMPLE_STRING_START; + return true; + } + + // We need two tokens of lookahead to determine if we are parsing a raw string, + // the `raw` and the `"`, which is why we need to do it in the external scanner. 
+ if (valid_symbols[RAW_STRING_START] && lexer->lookahead == 'r') { + advance(lexer); + if (lexer->lookahead == 'a') { + advance(lexer); + if (lexer->lookahead == 'w') { + advance(lexer); + if (lexer->lookahead == '"') { + lexer->mark_end(lexer); + lexer->result_symbol = RAW_STRING_START; + return true; + } + } + } + } + + if (valid_symbols[SIMPLE_STRING_MIDDLE]) { + return scan_string_content(lexer, false, STRING_MODE_SIMPLE); } if (valid_symbols[INTERPOLATED_STRING_MIDDLE]) { - return scan_string_content(lexer, false, true); + return scan_string_content(lexer, false, STRING_MODE_INTERPOLATED); } + if (valid_symbols[RAW_STRING_MIDDLE]) { + return scan_string_content(lexer, false, STRING_MODE_RAW); + } + + if (valid_symbols[RAW_STRING_MULTILINE_MIDDLE]) { + return scan_string_content(lexer, true, STRING_MODE_RAW); + } + if (valid_symbols[INTERPOLATED_MULTILINE_STRING_MIDDLE]) { - return scan_string_content(lexer, true, true); + return scan_string_content(lexer, true, STRING_MODE_INTERPOLATED); + } + + // We still need to handle the simple multiline string case, but there is + // no `MULTILINE_STRING_MIDDLE` token, and `MULTILINE_STRING_END` is used + // by all three of simple raw, and interpolated multiline strings. So this + // check needs to come after the `INTERPOLATED_MULTILINE_STRING_MIDDLE` and + // `RAW_STRING_MULTILINE_MIDDLE` check, so that we can be sure we are in a + // simple multiline string context. 
+ if (valid_symbols[MULTILINE_STRING_END]) { + return scan_string_content(lexer, true, STRING_MODE_SIMPLE); } return false; diff --git a/test/corpus/literals.txt b/test/corpus/literals.txt index 8b2d3666..efd308bf 100644 --- a/test/corpus/literals.txt +++ b/test/corpus/literals.txt @@ -2,19 +2,56 @@ Simple strings ================================================================================ +val emptyString = "" + val oneLineString = "I'm just on one line" +val stringWithCommentLikeContent1 = "// not a comment" + +val stringWithCommentLikeContent2 = "/* not a comment */" + +val stringWithEscapeSequence = "first line\nsecond line" + val multiLineString = """ a $thisIsntInterpolated ${thisEither} + no escape codes in multiline strings \uD83D\uDE00 \n """ -val multiLineString2 = """"""" +val emptyMultilineStringf = """""" + +val stringOfOneDoubleQuote = """"""" + +val multiLineString3 = """\{@inheritDoc\p{Zs}*\}""" + +val blackslashDoesNotEscapeClosingQuote = """\""" -------------------------------------------------------------------------------- (compilation_unit + (val_definition + (identifier) + (string)) + (val_definition + (identifier) + (string)) + (val_definition + (identifier) + (string)) + (val_definition + (identifier) + (string)) + (val_definition + (identifier) + (string + (escape_sequence))) + (val_definition + (identifier) + (string)) + (val_definition + (identifier) + (string)) (val_definition (identifier) (string)) @@ -25,15 +62,60 @@ val multiLineString2 = """"""" (identifier) (string))) +================================================================================ +Escape sequences in strings +================================================================================ + +val singleEscapeCode = "\n" + +val singleCharacterEscapes = "\n\r\t\b\f\'\"\\" + +val unicodeEscape = "\uD83D\UDE00" + +val repeatedUs = "\uuuuD83D\UUDE00" + +-------------------------------------------------------------------------------- + +(compilation_unit + 
(val_definition + (identifier) + (string + (escape_sequence))) + (val_definition + (identifier) + (string + (escape_sequence) + (escape_sequence) + (escape_sequence) + (escape_sequence) + (escape_sequence) + (escape_sequence) + (escape_sequence) + (escape_sequence))) + (val_definition + (identifier) + (string + (escape_sequence) + (escape_sequence))) + (val_definition + (identifier) + (string + (escape_sequence) + (escape_sequence)))) + ================================================================================ Interpolated strings ================================================================================ +val empty = s"" + +val emptyMultiline = s"""""" + val string1 = s"a $b ${c}" val string2 = f"hi $name%s" -val string3 = raw"Not a new line \n${ha}" +val string3 = raw"Not a really a new line \n${ha}." val string4 = s""" works even in multiline strings, ${name} @@ -47,9 +129,29 @@ val string7 = s"$$ $a" val string8 = s"$"$a" +val string9 = s"$"$a\uD83D\UDE00\n" + +val multiline = raw""" + $$ + ${interp} + \n + \x + \" +\""" + -------------------------------------------------------------------------------- (compilation_unit + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) (val_definition (identifier) (interpolated_string_expression @@ -99,16 +201,25 @@ val string8 = s"$"$a" (identifier) (interpolated_string_expression (identifier) - (interpolated_string - (interpolation - (identifier)) - (interpolation - (identifier))))) + (interpolated_string + (interpolation + (identifier)) + (interpolation + (identifier))))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence) + (interpolation + (identifier))))) (val_definition (identifier) (interpolated_string_expression (identifier) (interpolated_string + (escape_sequence) 
(interpolation (identifier))))) (val_definition @@ -116,8 +227,162 @@ val string8 = s"$"$a" (interpolated_string_expression (identifier) (interpolated_string + (escape_sequence) + (interpolation + (identifier)) + (escape_sequence) + (escape_sequence) + (escape_sequence)))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence) + (interpolation + (block + (identifier))))))) + +================================================================================ +Raw strings +================================================================================ + +val emptyRaw = raw"" + +val emptyMultilineRaw = raw"""""" + +val invalidEscapeCodesAllowedAndValidEscapesIgnored = raw"\n\t\x\w\g\k" + +val ErasedFunctionN = raw"ErasedFunction(\d+)".r + +val escapedAndInterpolated = raw"Not a really a new line \n${ha}." + +val blackslashQuoteDoesNotCloseString = raw"\"" + +val doubleSlashQuoteDoesCloseString = raw"\\" + +val dollarEscapeInSingleLine = raw"$$" + +val slashDoesNotEscapeDollarSign = raw"\$$" + +val multiline = raw""" + $$ + ${interp} + \n + \x + \" +\""" + +val ensureIdentifierNamedRawStillWorks = someFunction(raw) + +val raw = raw(raw) + +-------------------------------------------------------------------------------- + +(compilation_unit + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (field_expression + (interpolated_string_expression + (identifier) + (interpolated_string)) + (identifier))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (interpolation + (block + (identifier)))))) + (val_definition + (identifier) + 
(interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence)))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence)))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence) + (interpolation + (block + (identifier)))))) + (val_definition + (identifier) + (call_expression + (identifier) + (arguments + (identifier)))) + (val_definition + (identifier) + (call_expression + (identifier) + (arguments + (identifier))))) + +================================================================================ +Raw and interpolated strings have equivalent parse trees +================================================================================ + +val raw = raw"Foo $$ ${bar}" + +val raw = s"Foo $$ ${bar}" + +-------------------------------------------------------------------------------- + +(compilation_unit + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence) (interpolation - (identifier)))))) + (block + (identifier)))))) + (val_definition + (identifier) + (interpolated_string_expression + (identifier) + (interpolated_string + (escape_sequence) + (interpolation + (block + (identifier))))))) ================================================================================