-
Notifications
You must be signed in to change notification settings - Fork 59
Support for string escapes #459
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
const PREC = { | ||
Check notice on line 1 in grammar.js
|
||
comment: 1, | ||
using_directive: 2, | ||
control: 1, | ||
|
@@ -32,19 +32,24 @@ | |
externals: $ => [ | ||
$._automatic_semicolon, | ||
$._indent, | ||
$._outdent, | ||
$._simple_string_start, | ||
$._simple_string_middle, | ||
$._simple_multiline_string_start, | ||
$._interpolated_string_middle, | ||
$._interpolated_string_end, | ||
$._interpolated_multiline_string_middle, | ||
$._interpolated_multiline_string_end, | ||
$._outdent, | ||
$._simple_multiline_string, | ||
$._simple_string, | ||
$._raw_string_start, | ||
$._raw_string_middle, | ||
$._raw_string_multiline_middle, | ||
$._single_line_string_end, | ||
$._multiline_string_end, | ||
"else", | ||
"catch", | ||
"finally", | ||
"extends", | ||
"derives", | ||
"with", | ||
$.error_sentinel, | ||
], | ||
|
||
inline: $ => [ | ||
|
@@ -209,7 +214,7 @@ | |
"package", | ||
field("name", $.package_identifier), | ||
// This is slightly more permissive than the EBNF in that it allows any | ||
// kind of delcaration inside of the package blocks. As we're more | ||
// kind of declaration inside of the package blocks. As we're more | ||
// concerned with the structure rather than the validity of the program | ||
// we'll allow it. | ||
field("body", optional($.template_body)), | ||
|
@@ -677,7 +682,7 @@ | |
// In theory structural_type should just be added to simple_type, | ||
// but doing so increases the state of template_body to 4000 | ||
$._structural_type, | ||
// This adds _simple_type, but not the above intentionall/y. | ||
// This adds _simple_type, but not the above intentionally. | ||
seq($._simple_type, field("arguments", $.arguments)), | ||
seq($._annotated_type, field("arguments", $.arguments)), | ||
seq($.compound_type, field("arguments", $.arguments)), | ||
|
@@ -936,7 +941,7 @@ | |
), | ||
), | ||
|
||
tuple_type: $ => seq("(", trailingCommaSep1($._type), ")"), | ||
|
||
named_tuple_type: $ => seq("(", trailingCommaSep1($.name_and_type), ")"), | ||
|
||
|
@@ -1540,14 +1545,14 @@ | |
|
||
/** | ||
* Regex patterns created to avoid matching // comments and /* comment starts. | ||
* This could technically match illeagal tokens such as val ?// = 1 | ||
* This could technically match illegal tokens such as val ?// = 1 | ||
*/ | ||
operator_identifier: $ => | ||
token( | ||
choice( | ||
// opchar minus colon, equal, at | ||
// Technically speaking, Sm (Math symbols https://www.compart.com/en/unicode/category/Sm) | ||
// should be allowed as a single-characeter opchar, however, it includes `=`, | ||
// should be allowed as a single-character opchar, however, it includes `=`, | ||
// so we should to avoid that to prevent bad parsing of `=` as infix term or type. | ||
/[\-!#%&*+\/\\<>?\u005e\u007c~\u00ac\u00b1\u00d7\u00f7\u2190-\u2194\p{So}]/, | ||
seq( | ||
|
@@ -1616,7 +1621,7 @@ | |
choice( | ||
seq( | ||
"\\", | ||
choice(/[^xu]/, /uu?[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/), | ||
choice(/[^xu]/, /[uU]+[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have a comment noting this elsewhere, but Java weirdly allows any number of |
||
), | ||
/[^\\'\n]/, | ||
), | ||
|
@@ -1625,14 +1630,13 @@ | |
), | ||
), | ||
|
||
interpolated_string_expression: $ => | ||
seq(field("interpolator", $.identifier), $.interpolated_string), | ||
|
||
_interpolated_string_start: $ => '"', | ||
|
||
_interpolated_multiline_string_start: $ => '"""', | ||
interpolated_string_expression: $ => | ||
choice( | ||
seq(field("interpolator", alias($._raw_string_start, $.identifier)), alias($._raw_string, $.interpolated_string)), | ||
seq(field("interpolator", $.identifier), $.interpolated_string), | ||
), | ||
|
||
_dollar_escape: $ => seq("$", choice("$", '"')), | ||
_dollar_escape: $ => alias(token(seq("$", choice("$", '"'))), $.escape_sequence), | ||
|
||
_aliased_interpolation_identifier: $ => | ||
alias($._interpolation_identifier, $.identifier), | ||
|
@@ -1643,28 +1647,88 @@ | |
interpolated_string: $ => | ||
choice( | ||
seq( | ||
$._interpolated_string_start, | ||
token.immediate('"'), | ||
repeat( | ||
seq( | ||
$._interpolated_string_middle, | ||
choice($._dollar_escape, $.interpolation), | ||
choice($._dollar_escape, $.interpolation, $.escape_sequence), | ||
), | ||
), | ||
$._interpolated_string_end, | ||
$._single_line_string_end, | ||
), | ||
seq( | ||
$._interpolated_multiline_string_start, | ||
token.immediate('"""'), | ||
repeat( | ||
seq( | ||
$._interpolated_multiline_string_middle, | ||
// Multiline strings ignore escape sequences | ||
choice($._dollar_escape, $.interpolation), | ||
), | ||
), | ||
$._interpolated_multiline_string_end, | ||
$._multiline_string_end, | ||
), | ||
), | ||
|
||
// We need to handle single-line raw strings separately from interpolated strings, | ||
// because raw strings are not parsed for escape sequences. For example, raw strings | ||
// are often used for regular expressions, which contain backslashes that would | ||
// be invalid if parsed as escape sequences. We do not special case multiline | ||
// raw strings, because multiline strings do not parse escape sequences anyway. | ||
// Scala handles multiline raw strings identically to other multiline interpolated, | ||
// so we could parse them as interpolated strings, but I think the code is cleaner | ||
// if we maintain the distinction. | ||
_raw_string: $ => | ||
choice( | ||
seq( | ||
$._simple_string_start, | ||
seq( | ||
repeat( | ||
seq( | ||
$._raw_string_middle, | ||
choice($._dollar_escape, $.interpolation), | ||
), | ||
), | ||
$._single_line_string_end, | ||
), | ||
), | ||
seq( | ||
$._simple_multiline_string_start, | ||
repeat( | ||
seq( | ||
$._raw_string_multiline_middle, | ||
choice($._dollar_escape, $.interpolation), | ||
) | ||
), | ||
$._multiline_string_end, | ||
), | ||
), | ||
|
||
string: $ => choice($._simple_string, $._simple_multiline_string), | ||
escape_sequence: _ => token.immediate(seq( | ||
'\\', | ||
choice( | ||
/[tbnrf"'\\]/, | ||
// The Java spec allows any number of u's and U's at the start of a unicode escape. | ||
/[uU]+[0-9a-fA-F]{4}/, | ||
// Octals are not allowed in Scala 3, but are allowed in Scala 2. tree-sitter | ||
// does not have a mechanism for distinguishing between different versions of a | ||
// language, so I think it makes sense to allow them. Maybe in the future we | ||
// should move them to a `deprecated` syntax node? | ||
/[0-3]?[0-7]{1,2}/, | ||
), | ||
)), | ||
|
||
string: $ => choice( | ||
seq( | ||
$._simple_string_start, | ||
repeat(seq($._simple_string_middle, $.escape_sequence)), | ||
$._single_line_string_end, | ||
), | ||
seq( | ||
$._simple_multiline_string_start, | ||
/// Multiline strings ignore escape sequences | ||
$._multiline_string_end, | ||
), | ||
), | ||
|
||
_semicolon: $ => choice(";", $._automatic_semicolon), | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The docs suggest having an
error_sentinel
token type. We do not use it, but I found it helpful in debugging.