Skip to content

Commit e5de644

Browse files
bjfisheregon
authored and committed
Get source and encoding from regex user object in RubyRegexp constructor
1 parent aa0a082 commit e5de644

File tree

6 files changed

+74
-111
lines changed

6 files changed

+74
-111
lines changed

src/main/java/org/truffleruby/core/regexp/ClassicRegexp.java

Lines changed: 54 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
import static org.truffleruby.core.string.StringUtils.EMPTY_STRING_ARRAY;
4242

4343
import java.nio.charset.StandardCharsets;
44+
import java.util.Arrays;
4445
import java.util.Iterator;
4546

4647
import org.graalvm.collections.Pair;
@@ -603,48 +604,55 @@ public static RopeBuilder preprocess(Rope str, RubyEncoding enc, RubyEncoding[]
603604
return to;
604605
}
605606

606-
private static void preprocessLight(RubyContext context, RopeWithEncoding str, RubyEncoding enc,
607-
RubyEncoding[] fixedEnc,
608-
RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
607+
private static void preprocessLight(RopeWithEncoding str, RubyEncoding enc, RubyEncoding[] fixedEnc)
608+
throws DeferredRaiseException {
609609
if (enc.jcoding.isAsciiCompatible()) {
610610
fixedEnc[0] = null;
611611
} else {
612612
fixedEnc[0] = enc;
613613
}
614614

615-
boolean hasProperty = unescapeNonAscii(null, str.getRope(), enc, fixedEnc, mode);
615+
boolean hasProperty = unescapeNonAscii(null, str.getRope(), enc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
616616
if (hasProperty && fixedEnc[0] == null) {
617617
fixedEnc[0] = enc;
618618
}
619619
}
620620

621621
public static RopeWithEncoding preprocessDRegexp(RubyContext context, RopeWithEncoding[] strings,
622-
RegexpOptions options)
623-
throws DeferredRaiseException {
622+
RegexpOptions options) throws DeferredRaiseException {
624623
assert strings.length > 0;
625624

626-
RopeBuilder string = RopeOperations.toRopeBuilderCopy(strings[0].getRope());
625+
RopeBuilder builder = RopeOperations.toRopeBuilderCopy(strings[0].getRope());
627626

628627
RubyEncoding regexpEnc = processDRegexpElement(context, options, null, strings[0]);
629628

630629
for (int i = 1; i < strings.length; i++) {
631630
RopeWithEncoding str = strings[i];
632631
regexpEnc = processDRegexpElement(context, options, regexpEnc, str);
633-
string.append(str.getRope());
632+
builder.append(str.getRope());
633+
}
634+
635+
if (options.isEncodingNone()) {
636+
if (!all7Bit(builder.getBytes())) {
637+
regexpEnc = Encodings.BINARY;
638+
} else {
639+
regexpEnc = Encodings.US_ASCII;
640+
}
634641
}
635642

636643
if (regexpEnc != null) {
637-
string.setEncoding(regexpEnc.jcoding);
644+
builder.setEncoding(regexpEnc.jcoding);
638645
} else {
639646
regexpEnc = strings[0].getEncoding();
640647
}
641-
return new RopeWithEncoding(RopeOperations.ropeFromRopeBuilder(string), regexpEnc);
648+
649+
Rope rope = RopeOperations.ropeFromRopeBuilder(builder);
650+
return new RopeWithEncoding(rope, regexpEnc);
642651
}
643652

644653
@TruffleBoundary
645654
private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOptions options,
646-
RubyEncoding regexpEnc,
647-
RopeWithEncoding str) throws DeferredRaiseException {
655+
RubyEncoding regexpEnc, RopeWithEncoding str) throws DeferredRaiseException {
648656
RubyEncoding strEnc = str.getEncoding();
649657

650658
if (options.isEncodingNone() && strEnc != Encodings.BINARY) {
@@ -662,7 +670,7 @@ private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOpt
662670
// used. Since the preprocessing error-checking can be done without
663671
// creating a new rope builder, I added a "light" path.
664672
final RubyEncoding[] fixedEnc = new RubyEncoding[]{ null };
665-
ClassicRegexp.preprocessLight(context, str, strEnc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
673+
ClassicRegexp.preprocessLight(str, strEnc, fixedEnc);
666674

667675
if (fixedEnc[0] != null) {
668676
if (regexpEnc != null && regexpEnc != fixedEnc[0]) {
@@ -679,6 +687,39 @@ private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOpt
679687
return regexpEnc;
680688
}
681689

690+
private static boolean all7Bit(byte[] bytes) {
691+
for (int n = 0; n < bytes.length; n++) {
692+
if (bytes[n] < 0) {
693+
return false;
694+
}
695+
696+
if (bytes[n] == '\\' && n + 1 < bytes.length && bytes[n + 1] == 'x') {
697+
final String num;
698+
final boolean isSecondHex = n + 3 < bytes.length && Character.digit(bytes[n + 3], 16) != -1;
699+
if (isSecondHex) {
700+
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 4), StandardCharsets.UTF_8);
701+
} else {
702+
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 3), StandardCharsets.UTF_8);
703+
}
704+
705+
int b = Integer.parseInt(num, 16);
706+
707+
if (b > 0x7F) {
708+
return false;
709+
}
710+
711+
if (isSecondHex) {
712+
n += 3;
713+
} else {
714+
n += 2;
715+
}
716+
717+
}
718+
}
719+
720+
return true;
721+
}
722+
682723
private static final int QUOTED_V = 11;
683724

684725
/** rb_reg_quote */

src/main/java/org/truffleruby/core/regexp/InterpolatedRegexpNode.java

Lines changed: 8 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -9,17 +9,10 @@
99
*/
1010
package org.truffleruby.core.regexp;
1111

12-
import java.nio.charset.StandardCharsets;
13-
import java.util.Arrays;
14-
15-
import org.jcodings.specific.ASCIIEncoding;
16-
import org.jcodings.specific.USASCIIEncoding;
1712
import org.joni.Regex;
1813
import org.truffleruby.core.cast.ToSNode;
1914
import org.truffleruby.core.regexp.InterpolatedRegexpNodeFactory.RegexpBuilderNodeGen;
20-
import org.truffleruby.core.rope.Rope;
2115
import org.truffleruby.core.rope.RopeNodes;
22-
import org.truffleruby.core.rope.RopeOperations;
2316
import org.truffleruby.core.rope.RopeWithEncoding;
2417
import org.truffleruby.language.NotOptimizedWarningNode;
2518
import org.truffleruby.language.RubyContextNode;
@@ -109,72 +102,20 @@ protected boolean ropesWithEncodingsMatch(RopeWithEncoding[] a, RopeWithEncoding
109102
protected RubyRegexp createRegexp(RopeWithEncoding[] strings) {
110103
final RegexpOptions options = (RegexpOptions) this.options.clone();
111104
final RopeWithEncoding preprocessed;
112-
final Regex regexp1;
105+
final Regex regex;
113106
try {
114107
preprocessed = ClassicRegexp.preprocessDRegexp(getContext(), strings, options);
115-
regexp1 = TruffleRegexpNodes
116-
.compile(
117-
getLanguage(),
118-
null,
119-
preprocessed,
120-
options,
121-
this);
108+
regex = TruffleRegexpNodes.compile(
109+
getLanguage(),
110+
null,
111+
preprocessed,
112+
options,
113+
this);
122114
} catch (DeferredRaiseException dre) {
123115
throw dre.getException(getContext());
124116
}
125117

126-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
127-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
128-
// constructing the final regexp.
129-
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regexp1.getUserObject();
130-
131-
Rope source = ropeWithEncoding.getRope();
132-
if (options.isEncodingNone()) {
133-
if (!all7Bit(preprocessed.getRope().getBytes())) {
134-
source = RopeOperations.withEncoding(source, ASCIIEncoding.INSTANCE);
135-
} else {
136-
source = RopeOperations.withEncoding(source, USASCIIEncoding.INSTANCE);
137-
}
138-
}
139-
140-
return new RubyRegexp(
141-
regexp1,
142-
source,
143-
ropeWithEncoding.getEncoding(),
144-
options);
145-
}
146-
147-
private static boolean all7Bit(byte[] bytes) {
148-
for (int n = 0; n < bytes.length; n++) {
149-
if (bytes[n] < 0) {
150-
return false;
151-
}
152-
153-
if (bytes[n] == '\\' && n + 1 < bytes.length && bytes[n + 1] == 'x') {
154-
final String num;
155-
final boolean isSecondHex = n + 3 < bytes.length && Character.digit(bytes[n + 3], 16) != -1;
156-
if (isSecondHex) {
157-
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 4), StandardCharsets.UTF_8);
158-
} else {
159-
num = new String(Arrays.copyOfRange(bytes, n + 2, n + 3), StandardCharsets.UTF_8);
160-
}
161-
162-
int b = Integer.parseInt(num, 16);
163-
164-
if (b > 0x7F) {
165-
return false;
166-
}
167-
168-
if (isSecondHex) {
169-
n += 3;
170-
} else {
171-
n += 2;
172-
}
173-
174-
}
175-
}
176-
177-
return true;
118+
return new RubyRegexp(regex, options);
178119
}
179120
}
180121
}

src/main/java/org/truffleruby/core/regexp/RegexpNodes.java

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -65,16 +65,7 @@ public static RubyRegexp create(RubyLanguage language,
6565
new RopeWithEncoding(setSource, setSourceEncoding),
6666
regexpOptions,
6767
currentNode);
68-
69-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
70-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
71-
// constructing the final regexp.
72-
final RopeWithEncoding sourceWithEncoding = (RopeWithEncoding) regex.getUserObject();
73-
return new RubyRegexp(
74-
regex,
75-
sourceWithEncoding.getRope(),
76-
sourceWithEncoding.getEncoding(),
77-
regexpOptions);
68+
return new RubyRegexp(regex, regexpOptions);
7869
}
7970

8071
@CoreMethod(names = "hash")

src/main/java/org/truffleruby/core/regexp/RubyRegexp.java

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
import org.truffleruby.core.kernel.KernelNodes;
2424
import org.truffleruby.core.klass.RubyClass;
2525
import org.truffleruby.core.rope.Rope;
26+
import org.truffleruby.core.rope.RopeWithEncoding;
2627
import org.truffleruby.language.ImmutableRubyObject;
2728
import org.truffleruby.language.dispatch.DispatchNode;
2829

@@ -36,15 +37,15 @@ public class RubyRegexp extends ImmutableRubyObject implements TruffleObject {
3637
public final EncodingCache cachedEncodings;
3738
public final TRegexCache tregexCache;
3839

39-
public RubyRegexp(
40-
Regex regex,
41-
Rope source,
42-
RubyEncoding encoding,
43-
RegexpOptions options) {
44-
assert (source == null && encoding == null) || source.encoding == encoding.jcoding;
40+
public RubyRegexp(Regex regex, RegexpOptions options) {
41+
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
42+
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
43+
// constructing the final regexp.
4544
this.regex = regex;
46-
this.source = source;
47-
this.encoding = encoding;
45+
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regex.getUserObject();
46+
this.source = ropeWithEncoding.getRope();
47+
this.encoding = ropeWithEncoding.getEncoding();
48+
assert source.encoding == encoding.jcoding;
4849
this.options = options;
4950
this.cachedEncodings = new EncodingCache();
5051
this.tregexCache = new TRegexCache();

src/main/java/org/truffleruby/core/regexp/TruffleRegexpNodes.java

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -254,13 +254,7 @@ public RubyRegexp createRegexp(Rope pattern, RubyEncoding encoding) throws Defer
254254
new RopeWithEncoding(pattern, encoding),
255255
regexpOptions,
256256
this);
257-
258-
final RopeWithEncoding ropeWithEncoding = (RopeWithEncoding) regex.getUserObject();
259-
return new RubyRegexp(
260-
regex,
261-
ropeWithEncoding.getRope(),
262-
ropeWithEncoding.getEncoding(),
263-
regexpOptions);
257+
return new RubyRegexp(regex, regexpOptions);
264258
}
265259
}
266260

src/main/java/org/truffleruby/parser/BodyTranslator.java

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2672,12 +2672,7 @@ public RubyNode visitRegexpNode(RegexpParseNode node) {
26722672
throw dre.getException(RubyLanguage.getCurrentContext());
26732673
}
26742674

2675-
// The RegexpNodes.compile operation may modify the encoding of the source rope. This modified copy is stored
2676-
// in the Regex object as the "user object". Since ropes are immutable, we need to take this updated copy when
2677-
// constructing the final regexp.
2678-
final RopeWithEncoding updatedRope = (RopeWithEncoding) regex.getUserObject();
2679-
final RubyRegexp regexp = new RubyRegexp(regex, updatedRope.getRope(), updatedRope.getEncoding(), options);
2680-
2675+
final RubyRegexp regexp = new RubyRegexp(regex, options);
26812676
final ObjectLiteralNode literalNode = new ObjectLiteralNode(regexp);
26822677
literalNode.unsafeSetSourceSection(node.getPosition());
26832678
return addNewlineIfNeeded(node, literalNode);

0 commit comments

Comments
 (0)