Skip to content

Commit fa25299

Browse files
committed
[GR-29310] Fix extra whitespace in squiggly heredoc with escaped newline + clarify lexer
PullRequest: truffleruby/2409
2 parents f89c33a + 86666c3 commit fa25299

File tree

5 files changed

+259
-126
lines changed

5 files changed

+259
-126
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Bug fixes:
1212
* Fix the label of the first location reported by `Thread#backtrace_locations` (#2229).
1313
* Fix `Thread.handle_interrupt` to defer non-pure interrupts until the end of the `handle_interrupt` block (#2219).
1414
* Clear and restore errinfo on entry and normal return from methods in C extensions (#2227).
15+
* Fix extra whitespace in squiggly heredoc with escaped newline (#2238, @wildmaples and @norswap).
1516

1617
Compatibility:
1718

spec/tags/language/heredoc_tags.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
fails:Heredoc string prints a warning if quoted HEREDOC identifier is ending not on same line
2-
fails:Heredoc string allows HEREDOC with <<~'identifier', no interpolation, with backslash

src/main/java/org/truffleruby/parser/lexer/HeredocTerm.java

Lines changed: 61 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -53,19 +53,21 @@
5353
* ',bar)\n' = lastLine
5454
* </pre>
5555
*/
56-
public class HeredocTerm extends StrTerm {
57-
// Marker delimiting heredoc boundary
56+
public final class HeredocTerm extends StrTerm {
57+
/** End marker delimiting heredoc boundary. */
5858
private final Rope nd_lit;
5959

60-
// Expand variables, Indentation of final marker
60+
/** Indicates whether string interpolation (expansion) should be performed, and the indentation of the end marker. */
6161
private final int flags;
6262

63-
protected final int nth;
63+
/** End position of the end marker on the line where it is declared. */
64+
final int nth;
6465

65-
protected final int line;
66+
/** Line index of the line where the end marker is declared (1-based). */
67+
final int line;
6668

67-
// Portion of line right after beginning marker
68-
protected final Rope lastLine;
69+
/** Portion of the line where the end marker is declared, from right after the marker until the end of the line. */
70+
final Rope lastLine;
6971

7072
public HeredocTerm(Rope marker, int func, int nth, int line, Rope lastLine) {
7173
this.nd_lit = marker;
@@ -85,44 +87,45 @@ protected int error(RubyLexer lexer, Rope eos) {
8587
return -1;
8688
}
8789

88-
protected int restore(RubyLexer lexer) {
90+
private int restore(RubyLexer lexer) {
8991
lexer.heredoc_restore(this);
90-
lexer.setStrTerm(new StringTerm(flags | STR_FUNC_TERM, 0, 0, line)); // weird way to terminate heredoc.
91-
92+
// this will cause the next call to RubyLexer#yylex() to emit the RubyParser.tSTRING_END token
93+
lexer.setStrTerm(new StringTerm(flags | STR_FUNC_TERM, 0, 0, line));
9294
return EOF;
9395
}
9496

9597
@Override
9698
public int parseString(RubyLexer lexer) {
9799
RopeBuilder str = null;
98-
Rope eos = nd_lit;
99100
boolean indent = (flags & STR_FUNC_INDENT) != 0;
100101
int c = lexer.nextc();
101102

102103
if (c == EOF) {
103-
return error(lexer, eos);
104+
return error(lexer, nd_lit);
104105
}
105106

106-
// Found end marker for this heredoc
107-
if (lexer.was_bol() && lexer.whole_match_p(nd_lit, indent)) {
108-
lexer.heredoc_restore(this);
107+
// Found end marker for this heredoc, at the start of a line
108+
if (lexer.was_bol() && lexer.whole_match_p(this.nd_lit, indent)) {
109+
lexer.heredoc_restore(this); // will also skip over the end marker
109110
lexer.setStrTerm(null);
110111
lexer.setState(EXPR_END);
111112
return RubyParser.tSTRING_END;
112113
}
113114

114115
if ((flags & STR_FUNC_EXPAND) == 0) {
115-
do {
116-
Rope lbuf = lexer.lex_lastline;
117-
int p = 0;
116+
// heredocs without string interpolation
117+
118+
do { // iterate on lines, while end marker not found
119+
final Rope lbuf = lexer.lexb;
118120
int pend = lexer.lex_pend;
119-
if (pend > p) {
121+
122+
// Remove trailing newline, it will be appended later in normalized form (single \n).
123+
if (pend > 0) {
120124
switch (lexer.p(pend - 1)) {
121125
case '\n':
122126
pend--;
123-
if (pend == p || lexer.p(pend - 1) == '\r') {
124-
pend++;
125-
break;
127+
if (pend > 0 && lexer.p(pend - 1) == '\r') {
128+
pend--;
126129
}
127130
break;
128131
case '\r':
@@ -131,77 +134,99 @@ public int parseString(RubyLexer lexer) {
131134
}
132135
}
133136

137+
// if we are dealing with a squiggly heredoc
134138
if (lexer.getHeredocIndent() > 0) {
135-
for (int i = 0; p + i < pend && lexer.update_heredoc_indent(lexer.p(p + i)); i++) {
139+
// update the indent for the current line
140+
for (int i = 0; i < pend && lexer.update_heredoc_indent(lexer.p(i)); i++) {
136141
}
142+
// reset heredoc_line_indent to 0 (was -1 after we matched the first non-whitespace character)
137143
lexer.setHeredocLineIndent(0);
138144
}
139145

140146
if (str != null) {
141-
str.append(lbuf.getBytes(), p, pend - p);
147+
str.append(lbuf.getBytes(), 0, pend);
142148
} else {
143-
final RopeBuilder builder = RopeBuilder.createRopeBuilder(lbuf.getBytes(), p, pend - p);
149+
// lazy initialization of string builder
150+
final RopeBuilder builder = RopeBuilder.createRopeBuilder(lbuf.getBytes(), 0, pend);
144151
builder.setEncoding(lbuf.getEncoding());
145152
str = builder;
146153
}
147154

148-
if (pend < lexer.lex_pend) {
149-
str.append('\n');
150-
}
155+
// append the newline that we removed earlier
156+
str.append('\n');
151157
lexer.lex_goto_eol();
152158

153159
if (lexer.getHeredocIndent() > 0) {
160+
// for squiggly (indented) heredocs, generate one string content token per line
161+
// this will be dedented in the parser through lexer.heredoc_dedent
154162
lexer.setValue(lexer.createStr(str, 0));
155163
return RubyParser.tSTRING_CONTENT;
156164
}
157165
// MRI null checks str in this case but it is unconditionally non-null?
158166
if (lexer.nextc() == -1) {
159-
return error(lexer, eos);
167+
return error(lexer, nd_lit);
160168
}
161-
} while (!lexer.whole_match_p(eos, indent));
169+
} while (!lexer.whole_match_p(nd_lit, indent));
162170
} else {
171+
// heredoc with string interpolation
172+
163173
RopeBuilder tok = new RopeBuilder();
164174
tok.setEncoding(lexer.getEncoding());
175+
165176
if (c == '#') {
177+
// interpolated variable or block begin
178+
// This returns tSTRING_DVAR (if it finds $, @ or @@), tSTRING_DBEG (if it finds '{'), or 0 (none of
179+
// these things were found).
166180
int token = lexer.peekVariableName(RubyParser.tSTRING_DVAR, RubyParser.tSTRING_DBEG);
167-
168181
if (token != 0) {
182+
// Emit the token - note that the parser will unset RubyLexer#lex_strTerm while the variable or
183+
// block is being parsed and restore it when it is done, allowing the rest of the heredoc to be
184+
// processed.
169185
return token;
170186
}
171-
172187
tok.append('#');
173188
}
174189

175190
// MRI has extra pointer which makes our code look a little bit more strange in comparison
176191
do {
177192
lexer.pushback(c);
178193

179-
Encoding enc[] = new Encoding[1];
194+
Encoding[] enc = new Encoding[1];
180195
enc[0] = lexer.getEncoding();
181196

197+
// Parse the next string segment into the buffer, as a regular string (with expansion).
198+
// The segment might terminate because of a newline, line continuation (\\) or because of
199+
// an interpolation (#{...}, #@foo, #$foo, etc).
182200
if ((c = new StringTerm(flags, '\0', '\n', lexer.ruby_sourceline)
183201
.parseStringIntoBuffer(lexer, tok, enc)) == EOF) {
184202
if (lexer.eofp) {
185-
return error(lexer, eos);
203+
return error(lexer, nd_lit);
186204
}
187205
return restore(lexer);
188206
}
207+
189208
if (c != '\n') {
209+
// emit the string segment
190210
lexer.setValue(lexer.createStr(tok, 0));
191211
return RubyParser.tSTRING_CONTENT;
192212
}
213+
214+
// append the terminating newline
193215
tok.append(lexer.nextc());
194216

195217
if (lexer.getHeredocIndent() > 0) {
218+
// for squiggly (indented) heredocs, generate one string content token per line
219+
// this will be dedented in the parser through lexer.heredoc_dedent
196220
lexer.lex_goto_eol();
197221
lexer.setValue(lexer.createStr(tok, 0));
198222
return RubyParser.tSTRING_CONTENT;
199223
}
200224

201225
if ((c = lexer.nextc()) == EOF) {
202-
return error(lexer, eos);
226+
return error(lexer, nd_lit);
203227
}
204-
} while (!lexer.whole_match_p(eos, indent));
228+
// NOTE: The end marker is not processed here, but in the next call to HeredocTerm#parseString
229+
} while (!lexer.whole_match_p(nd_lit, indent));
205230
str = tok;
206231
}
207232

0 commit comments

Comments
 (0)