Skip to content

Commit df25d01

Browse files
committed
[GR-32712] Get source and encoding from regex user object in RubyRegexp constructor
PullRequest: truffleruby/2839
2 parents e198f61 + e5de644 commit df25d01

File tree

6 files changed

+74
-111
lines changed

6 files changed

+74
-111
lines changed

src/main/java/org/truffleruby/core/regexp/ClassicRegexp.java

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import static org.truffleruby.core.string.StringUtils.EMPTY_STRING_ARRAY;
4242

4343
import java.nio.charset.StandardCharsets;
44+
import java.util.Arrays;
4445
import java.util.Iterator;
4546

4647
import org.graalvm.collections.Pair;
@@ -602,48 +603,55 @@ public static RopeBuilder preprocess(Rope str, RubyEncoding enc, RubyEncoding[]
602603
return to;
603604
}
604605

605-
private static void preprocessLight(RubyContext context, RopeWithEncoding str, RubyEncoding enc,
606-
RubyEncoding[] fixedEnc,
607-
RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
606+
private static void preprocessLight(RopeWithEncoding str, RubyEncoding enc, RubyEncoding[] fixedEnc)
607+
throws DeferredRaiseException {
608608
if (enc.jcoding.isAsciiCompatible()) {
609609
fixedEnc[0] = null;
610610
} else {
611611
fixedEnc[0] = enc;
612612
}
613613

614-
boolean hasProperty = unescapeNonAscii(null, str.getRope(), enc, fixedEnc, mode);
614+
boolean hasProperty = unescapeNonAscii(null, str.getRope(), enc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
615615
if (hasProperty && fixedEnc[0] == null) {
616616
fixedEnc[0] = enc;
617617
}
618618
}
619619

620620
public static RopeWithEncoding preprocessDRegexp(RubyContext context, RopeWithEncoding[] strings,
621-
RegexpOptions options)
622-
throws DeferredRaiseException {
621+
RegexpOptions options) throws DeferredRaiseException {
623622
assert strings.length > 0;
624623

625-
RopeBuilder string = RopeOperations.toRopeBuilderCopy(strings[0].getRope());
624+
RopeBuilder builder = RopeOperations.toRopeBuilderCopy(strings[0].getRope());
626625

627626
RubyEncoding regexpEnc = processDRegexpElement(context, options, null, strings[0]);
628627

629628
for (int i = 1; i < strings.length; i++) {
630629
RopeWithEncoding str = strings[i];
631630
regexpEnc = processDRegexpElement(context, options, regexpEnc, str);
632-
string.append(str.getRope());
631+
builder.append(str.getRope());
632+
}
633+
634+
if (options.isEncodingNone()) {
635+
if (!all7Bit(builder.getBytes())) {
636+
regexpEnc = Encodings.BINARY;
637+
} else {
638+
regexpEnc = Encodings.US_ASCII;
639+
}
633640
}
634641

635642
if (regexpEnc != null) {
636-
string.setEncoding(regexpEnc.jcoding);
643+
builder.setEncoding(regexpEnc.jcoding);
637644
} else {
638645
regexpEnc = strings[0].getEncoding();
639646
}
640-
return new RopeWithEncoding(RopeOperations.ropeFromRopeBuilder(string), regexpEnc);
647+
648+
Rope rope = RopeOperations.ropeFromRopeBuilder(builder);
649+
return new RopeWithEncoding(rope, regexpEnc);
641650
}
642651

643652
@TruffleBoundary
644653
private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOptions options,
645-
RubyEncoding regexpEnc,
646-
RopeWithEncoding str) throws DeferredRaiseException {
654+
RubyEncoding regexpEnc, RopeWithEncoding str) throws DeferredRaiseException {
647655
RubyEncoding strEnc = str.getEncoding();
648656

649657
if (options.isEncodingNone() && strEnc != Encodings.BINARY) {
@@ -661,7 +669,7 @@ private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOpt
661669
// used. Since the preprocessing error-checking can be done without
662670
// creating a new rope builder, I added a "light" path.
663671
final RubyEncoding[] fixedEnc = new RubyEncoding[]{ null };
664-
ClassicRegexp.preprocessLight(context, str, strEnc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
672+
ClassicRegexp.preprocessLight(str, strEnc, fixedEnc);
665673

666674
if (fixedEnc[0] != null) {
667675
if (regexpEnc != null && regexpEnc != fixedEnc[0]) {
@@ -678,6 +686,39 @@ private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOpt
678686
return regexpEnc;
679687
}
680688

689+
private static boolean all7Bit(byte[] bytes) {
690+
for (int n = 0; n < bytes.length; n++) {
691+
if (bytes[n] < 0) {
692+
return false;
693+
}
694+
695+
if (bytes[n] == '\\' && n + 1 < bytes.length && bytes[n + 1] == 'x') {
696+
final String num;
697+
final boolean isSecondHex = n + 3 < bytes.length && Character.digit(bytes[n + 3], 16) != -1;
698+
if (isSecondHex) {
699+
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 4), StandardCharsets.UTF_8);
700+
} else {
701+
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 3), StandardCharsets.UTF_8);
702+
}
703+
704+
int b = Integer.parseInt(num, 16);
705+
706+
if (b > 0x7F) {
707+
return false;
708+
}
709+
710+
if (isSecondHex) {
711+
n += 3;
712+
} else {
713+
n += 2;
714+
}
715+
716+
}
717+
}
718+
719+
return true;
720+
}
721+
681722
private static final int QUOTED_V = 11;
682723

683724
/** rb_reg_quote */

src/main/java/org/truffleruby/core/regexp/InterpolatedRegexpNode.java

Lines changed: 8 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,10 @@
99
*/
1010
package org.truffleruby.core.regexp;
1111

12-
import java.nio.charset.StandardCharsets;
13-
import java.util.Arrays;
14-
15-
import org.jcodings.specific.ASCIIEncoding;
16-
import org.jcodings.specific.USASCIIEncoding;
1712
import org.joni.Regex;
1813
import org.truffleruby.core.cast.ToSNode;
1914
import org.truffleruby.core.regexp.InterpolatedRegexpNodeFactory.RegexpBuilderNodeGen;
20-
import org.truffleruby.core.rope.Rope;
2115
import org.truffleruby.core.rope.RopeNodes;
22-
import org.truffleruby.core.rope.RopeOperations;
2316
import org.truffleruby.core.rope.RopeWithEncoding;
2417
import org.truffleruby.language.NotOptimizedWarningNode;
2518
import org.truffleruby.language.RubyContextNode;
@@ -109,72 +102,20 @@ protected boolean ropesWithEncodingsMatch(RopeWithEncoding[] a, RopeWithEncoding
109102
protected RubyRegexp createRegexp(RopeWithEncoding[] strings) {
110103
final RegexpOptions options = (RegexpOptions) this.options.clone();
111104
final RopeWithEncoding preprocessed;
112-
final Regex regexp1;
105+
final Regex regex;
113106
try {
114107
preprocessed = ClassicRegexp.preprocessDRegexp(getContext(), strings, options);
115-
regexp1 = TruffleRegexpNodes
116-
.compile(
117-
getLanguage(),
118-
null,
119-
preprocessed,
120-
options,
121-
this);
108+
regex = TruffleRegexpNodes.compile(
109+
getLanguage(),
110+
null,
111+
preprocessed,
112+
options,
113+
this);
122114
} catch (DeferredRaiseException dre) {
123115
throw dre.getException(getContext());
124116
}
125117

126-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
127-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
128-
// constructing the final regexp.
129-
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regexp1.getUserObject();
130-
131-
Rope source = ropeWithEncoding.getRope();
132-
if (options.isEncodingNone()) {
133-
if (!all7Bit(preprocessed.getRope().getBytes())) {
134-
source = RopeOperations.withEncoding(source, ASCIIEncoding.INSTANCE);
135-
} else {
136-
source = RopeOperations.withEncoding(source, USASCIIEncoding.INSTANCE);
137-
}
138-
}
139-
140-
return new RubyRegexp(
141-
regexp1,
142-
source,
143-
ropeWithEncoding.getEncoding(),
144-
options);
145-
}
146-
147-
private static boolean all7Bit(byte[] bytes) {
148-
for (int n = 0; n < bytes.length; n++) {
149-
if (bytes[n] < 0) {
150-
return false;
151-
}
152-
153-
if (bytes[n] == '\\' && n + 1 < bytes.length && bytes[n + 1] == 'x') {
154-
final String num;
155-
final boolean isSecondHex = n + 3 < bytes.length && Character.digit(bytes[n + 3], 16) != -1;
156-
if (isSecondHex) {
157-
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 4), StandardCharsets.UTF_8);
158-
} else {
159-
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 3), StandardCharsets.UTF_8);
160-
}
161-
162-
int b = Integer.parseInt(num, 16);
163-
164-
if (b > 0x7F) {
165-
return false;
166-
}
167-
168-
if (isSecondHex) {
169-
n += 3;
170-
} else {
171-
n += 2;
172-
}
173-
174-
}
175-
}
176-
177-
return true;
118+
return new RubyRegexp(regex, options);
178119
}
179120
}
180121
}

src/main/java/org/truffleruby/core/regexp/RegexpNodes.java

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,7 @@ public static RubyRegexp create(RubyLanguage language,
6565
new RopeWithEncoding(setSource, setSourceEncoding),
6666
regexpOptions,
6767
currentNode);
68-
69-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
70-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
71-
// constructing the final regexp.
72-
final RopeWithEncoding sourceWithEncoding = (RopeWithEncoding) regex.getUserObject();
73-
return new RubyRegexp(
74-
regex,
75-
sourceWithEncoding.getRope(),
76-
sourceWithEncoding.getEncoding(),
77-
regexpOptions);
68+
return new RubyRegexp(regex, regexpOptions);
7869
}
7970

8071
@CoreMethod(names = "hash")

src/main/java/org/truffleruby/core/regexp/RubyRegexp.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.truffleruby.core.kernel.KernelNodes;
2424
import org.truffleruby.core.klass.RubyClass;
2525
import org.truffleruby.core.rope.Rope;
26+
import org.truffleruby.core.rope.RopeWithEncoding;
2627
import org.truffleruby.language.ImmutableRubyObject;
2728
import org.truffleruby.language.dispatch.DispatchNode;
2829

@@ -36,15 +37,15 @@ public class RubyRegexp extends ImmutableRubyObject implements TruffleObject {
3637
public final EncodingCache cachedEncodings;
3738
public final TRegexCache tregexCache;
3839

39-
public RubyRegexp(
40-
Regex regex,
41-
Rope source,
42-
RubyEncoding encoding,
43-
RegexpOptions options) {
44-
assert (source == null && encoding == null) || source.encoding == encoding.jcoding;
40+
public RubyRegexp(Regex regex, RegexpOptions options) {
41+
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
42+
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
43+
// constructing the final regexp.
4544
this.regex = regex;
46-
this.source = source;
47-
this.encoding = encoding;
45+
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regex.getUserObject();
46+
this.source = ropeWithEncoding.getRope();
47+
this.encoding = ropeWithEncoding.getEncoding();
48+
assert source.encoding == encoding.jcoding;
4849
this.options = options;
4950
this.cachedEncodings = new EncodingCache();
5051
this.tregexCache = new TRegexCache();

src/main/java/org/truffleruby/core/regexp/TruffleRegexpNodes.java

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -254,13 +254,7 @@ public RubyRegexp createRegexp(Rope pattern, RubyEncoding encoding) throws Defer
254254
new RopeWithEncoding(pattern, encoding),
255255
regexpOptions,
256256
this);
257-
258-
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regex.getUserObject();
259-
return new RubyRegexp(
260-
regex,
261-
ropeWithEncoding.getRope(),
262-
ropeWithEncoding.getEncoding(),
263-
regexpOptions);
257+
return new RubyRegexp(regex, regexpOptions);
264258
}
265259
}
266260

src/main/java/org/truffleruby/parser/BodyTranslator.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2672,12 +2672,7 @@ public RubyNode visitRegexpNode(RegexpParseNode node) {
26722672
throw dre.getException(RubyLanguage.getCurrentContext());
26732673
}
26742674

2675-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
2676-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
2677-
// constructing the final regexp.
2678-
final RopeWithEncoding updatedRope = (RopeWithEncoding) regex.getUserObject();
2679-
final RubyRegexp regexp = new RubyRegexp(regex, updatedRope.getRope(), updatedRope.getEncoding(), options);
2680-
2675+
final RubyRegexp regexp = new RubyRegexp(regex, options);
26812676
final ObjectLiteralNode literalNode = new ObjectLiteralNode(regexp);
26822677
literalNode.unsafeSetSourceSection(node.getPosition());
26832678
return addNewlineIfNeeded(node, literalNode);

0 commit comments

Comments
 (0)