Skip to content

Commit 2200231

Browse files
committed
Add ruby encoding field to RubyRegexp
1 parent e1c7078 commit 2200231

File tree

12 files changed

+260
-182
lines changed

12 files changed

+260
-182
lines changed

src/main/java/org/truffleruby/core/encoding/Encodings.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.jcodings.Encoding;
1717
import org.jcodings.EncodingDB;
1818
import org.jcodings.specific.ASCIIEncoding;
19+
import org.jcodings.specific.ISO8859_1Encoding;
1920
import org.jcodings.specific.USASCIIEncoding;
2021
import org.jcodings.specific.UTF16BEEncoding;
2122
import org.jcodings.specific.UTF16LEEncoding;
@@ -43,6 +44,7 @@ public class Encodings {
4344
public static final RubyEncoding UTF16BE = BUILT_IN_ENCODINGS[UTF16BEEncoding.INSTANCE.getIndex()];
4445
public static final RubyEncoding UTF32LE = BUILT_IN_ENCODINGS[UTF32LEEncoding.INSTANCE.getIndex()];
4546
public static final RubyEncoding UTF32BE = BUILT_IN_ENCODINGS[UTF32BEEncoding.INSTANCE.getIndex()];
47+
public static final RubyEncoding ISO_8859_1 = BUILT_IN_ENCODINGS[ISO8859_1Encoding.INSTANCE.getIndex()];
4648
public static final RubyEncoding UTF16_DUMMY = BUILT_IN_ENCODINGS[EncodingDB
4749
.getEncodings()
4850
.get(RopeOperations.encodeAsciiBytes("UTF-16"))

src/main/java/org/truffleruby/core/encoding/StandardEncodings.java

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/main/java/org/truffleruby/core/regexp/ClassicRegexp.java

Lines changed: 81 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@
4747
import org.jcodings.Encoding;
4848
import org.jcodings.specific.ASCIIEncoding;
4949
import org.jcodings.specific.USASCIIEncoding;
50-
import org.jcodings.specific.UTF8Encoding;
5150
import org.joni.NameEntry;
5251
import org.joni.Option;
5352
import org.joni.Regex;
@@ -61,6 +60,7 @@
6160
import org.truffleruby.core.rope.Rope;
6261
import org.truffleruby.core.rope.RopeBuilder;
6362
import org.truffleruby.core.rope.RopeOperations;
63+
import org.truffleruby.core.rope.RopeWithEncoding;
6464
import org.truffleruby.core.string.StringSupport;
6565
import org.truffleruby.core.string.StringUtils;
6666
import org.truffleruby.language.backtrace.BacktraceFormatter;
@@ -88,14 +88,14 @@ public Encoding getEncoding() {
8888

8989
public static Regex makeRegexp(RubyContext context, RubyDeferredWarnings rubyDeferredWarnings,
9090
RopeBuilder processedSource, RegexpOptions options,
91-
Encoding enc, Rope source, Node currentNode) throws DeferredRaiseException {
91+
RubyEncoding enc, Rope source, Node currentNode) throws DeferredRaiseException {
9292
try {
9393
return new Regex(
9494
processedSource.getUnsafeBytes(),
9595
0,
9696
processedSource.getLength(),
9797
options.toJoniOptions(),
98-
enc,
98+
enc.jcoding,
9999
Syntax.RUBY,
100100
rubyDeferredWarnings == null
101101
? new RegexWarnCallback(context)
@@ -107,10 +107,17 @@ public static Regex makeRegexp(RubyContext context, RubyDeferredWarnings rubyDef
107107
}
108108
}
109109

110-
private static Regex getRegexpFromCache(RubyContext context, RopeBuilder bytes, Encoding encoding,
110+
private static Regex getRegexpFromCache(RubyContext context, RopeBuilder bytes, RubyEncoding encoding,
111111
RegexpOptions options, Rope source) throws DeferredRaiseException {
112112
if (context == null) {
113-
final Regex regex = makeRegexp(null, null, bytes, options, encoding, source, null);
113+
final Regex regex = makeRegexp(
114+
null,
115+
null,
116+
bytes,
117+
options,
118+
encoding,
119+
source,
120+
null);
114121
regex.setUserObject(bytes);
115122
return regex;
116123
}
@@ -133,37 +140,47 @@ private static Regex getRegexpFromCache(RubyContext context, RopeBuilder bytes,
133140
}
134141
}
135142

136-
public ClassicRegexp(RubyContext context, Rope str, RegexpOptions originalOptions) throws DeferredRaiseException {
143+
public ClassicRegexp(RubyContext context, Rope str, RubyEncoding enc, RegexpOptions originalOptions)
144+
throws DeferredRaiseException {
137145
this.context = context;
138146
this.options = (RegexpOptions) originalOptions.clone();
139147

140-
Encoding enc = str.getEncoding();
141-
if (enc.isDummy()) {
148+
if (enc.jcoding.isDummy()) {
142149
throw new UnsupportedOperationException("can't make regexp with dummy encoding");
143150
}
144151

145-
Encoding[] fixedEnc = new Encoding[]{ null };
152+
RubyEncoding[] fixedEnc = new RubyEncoding[]{ null };
146153
RopeBuilder unescaped = preprocess(str, enc, fixedEnc, RegexpSupport.ErrorMode.RAISE);
147-
enc = computeRegexpEncoding(options, enc, fixedEnc);
148-
149-
this.pattern = getRegexpFromCache(context, unescaped, enc, options, str);
154+
final RubyEncoding computedEnc = computeRegexpEncoding(options, enc, fixedEnc);
155+
this.pattern = getRegexpFromCache(
156+
context,
157+
unescaped,
158+
computedEnc,
159+
options,
160+
RopeOperations.withEncoding(str, computedEnc.jcoding));
150161
this.str = str;
151162
}
152163

153164
@TruffleBoundary
154165
@SuppressWarnings("fallthrough")
155-
private static boolean unescapeNonAscii(RopeBuilder to, Rope str, Encoding enc,
156-
Encoding[] encp, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
166+
private static boolean unescapeNonAscii(RopeBuilder to, Rope str, RubyEncoding enc,
167+
RubyEncoding[] encp, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
157168
boolean hasProperty = false;
158169
byte[] buf = null;
159170

160171
int p = 0;
161172
int end = str.byteLength();
162173
final byte[] bytes = str.getBytes();
163174

175+
164176
while (p < end) {
165177
final int cl = StringSupport
166-
.characterLength(enc, enc == str.getEncoding() ? str.getCodeRange() : CR_UNKNOWN, bytes, p, end);
178+
.characterLength(
179+
enc.jcoding,
180+
enc.jcoding == str.getEncoding() ? str.getCodeRange() : CR_UNKNOWN,
181+
bytes,
182+
p,
183+
end);
167184
if (cl <= 0) {
168185
raisePreprocessError(str, "invalid multibyte character", mode);
169186
}
@@ -208,7 +225,7 @@ private static boolean unescapeNonAscii(RopeBuilder to, Rope str, Encoding enc,
208225
case 'C': /* \C-X, \C-\M-X */
209226
case 'M': /* \M-X, \M-\C-X, \M-\cX */
210227
p -= 2;
211-
if (enc == USASCIIEncoding.INSTANCE) {
228+
if (enc == Encodings.US_ASCII) {
212229
if (buf == null) {
213230
buf = new byte[1];
214231
}
@@ -269,7 +286,7 @@ private static boolean unescapeNonAscii(RopeBuilder to, Rope str, Encoding enc,
269286
}
270287

271288
private static int unescapeUnicodeBmp(RopeBuilder to, byte[] bytes, int p, int end,
272-
Encoding[] encp, Rope str, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
289+
RubyEncoding[] encp, Rope str, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
273290
if (p + 4 > end) {
274291
raisePreprocessError(str, "invalid Unicode escape", mode);
275292
}
@@ -283,7 +300,7 @@ private static int unescapeUnicodeBmp(RopeBuilder to, byte[] bytes, int p, int e
283300
}
284301

285302
private static int unescapeUnicodeList(RopeBuilder to, byte[] bytes, int p, int end,
286-
Encoding[] encp, Rope str, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
303+
RubyEncoding[] encp, Rope str, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
287304
while (p < end && ASCIIEncoding.INSTANCE.isSpace(bytes[p] & 0xff)) {
288305
p++;
289306
}
@@ -314,7 +331,7 @@ private static int unescapeUnicodeList(RopeBuilder to, byte[] bytes, int p, int
314331
return p;
315332
}
316333

317-
private static void appendUtf8(RopeBuilder to, int code, Encoding[] enc, Rope str,
334+
private static void appendUtf8(RopeBuilder to, int code, RubyEncoding[] enc, Rope str,
318335
RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
319336
checkUnicodeRange(code, str, mode);
320337

@@ -328,8 +345,8 @@ private static void appendUtf8(RopeBuilder to, int code, Encoding[] enc, Rope st
328345
to.setLength(to.getLength() + utf8Decode(to.getUnsafeBytes(), to.getLength(), code));
329346
}
330347
if (enc[0] == null) {
331-
enc[0] = UTF8Encoding.INSTANCE;
332-
} else if (!(enc[0].isUTF8())) {
348+
enc[0] = Encodings.UTF_8;
349+
} else if (!(enc[0].jcoding.isUTF8())) {
333350
raisePreprocessError(str, "UTF-8 character in non UTF-8 regexp", mode);
334351
}
335352
}
@@ -381,17 +398,19 @@ private static void checkUnicodeRange(int code, Rope str, RegexpSupport.ErrorMod
381398
}
382399

383400
private static int unescapeEscapedNonAscii(RopeBuilder to, byte[] bytes, int p, int end,
384-
Encoding enc, Encoding[] encp, Rope str, RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
385-
byte[] chBuf = new byte[enc.maxLength()];
401+
RubyEncoding enc, RubyEncoding[] encp, Rope str, RegexpSupport.ErrorMode mode)
402+
throws DeferredRaiseException {
403+
byte[] chBuf = new byte[enc.jcoding.maxLength()];
386404
int chLen = 0;
387405

388406
p = readEscapedByte(chBuf, chLen++, bytes, p, end, str, mode);
389-
while (chLen < enc.maxLength() &&
390-
StringSupport.MBCLEN_NEEDMORE_P(StringSupport.characterLength(enc, CR_UNKNOWN, chBuf, 0, chLen))) {
407+
while (chLen < enc.jcoding.maxLength() &&
408+
StringSupport
409+
.MBCLEN_NEEDMORE_P(StringSupport.characterLength(enc.jcoding, CR_UNKNOWN, chBuf, 0, chLen))) {
391410
p = readEscapedByte(chBuf, chLen++, bytes, p, end, str, mode);
392411
}
393412

394-
int cl = StringSupport.characterLength(enc, CR_UNKNOWN, chBuf, 0, chLen);
413+
int cl = StringSupport.characterLength(enc.jcoding, CR_UNKNOWN, chBuf, 0, chLen);
395414
if (cl == -1) {
396415
raisePreprocessError(str, "invalid multibyte escape", mode); // MBCLEN_INVALID_P
397416
}
@@ -551,86 +570,94 @@ public static int readEscapedByte(byte[] to, int toP, byte[] bytes, int p, int e
551570
} // while
552571
}
553572

554-
public static void preprocessCheck(Rope bytes) throws DeferredRaiseException {
555-
preprocess(bytes, bytes.getEncoding(), new Encoding[]{ null }, RegexpSupport.ErrorMode.RAISE);
573+
public static void preprocessCheck(RopeWithEncoding ropeWithEncoding) throws DeferredRaiseException {
574+
preprocess(
575+
ropeWithEncoding.getRope(),
576+
ropeWithEncoding.getEncoding(),
577+
new RubyEncoding[]{ null },
578+
RegexpSupport.ErrorMode.RAISE);
556579
}
557580

558-
public static RopeBuilder preprocess(Rope str, Encoding enc, Encoding[] fixedEnc,
581+
public static RopeBuilder preprocess(Rope str, RubyEncoding enc, RubyEncoding[] fixedEnc,
559582
RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
560583
RopeBuilder to = RopeBuilder.createRopeBuilder(str.byteLength());
561584

562-
if (enc.isAsciiCompatible()) {
585+
if (enc.jcoding.isAsciiCompatible()) {
563586
fixedEnc[0] = null;
564587
} else {
565588
fixedEnc[0] = enc;
566-
to.setEncoding(enc);
589+
to.setEncoding(enc.jcoding);
567590
}
568591

569592
boolean hasProperty = unescapeNonAscii(to, str, enc, fixedEnc, mode);
570593
if (hasProperty && fixedEnc[0] == null) {
571594
fixedEnc[0] = enc;
572595
}
573596
if (fixedEnc[0] != null) {
574-
to.setEncoding(fixedEnc[0]);
597+
to.setEncoding(fixedEnc[0].jcoding);
575598
}
576599
return to;
577600
}
578601

579-
private static void preprocessLight(RubyContext context, Rope str, Encoding enc, Encoding[] fixedEnc,
602+
private static void preprocessLight(RubyContext context, RopeWithEncoding str, RubyEncoding enc,
603+
RubyEncoding[] fixedEnc,
580604
RegexpSupport.ErrorMode mode) throws DeferredRaiseException {
581-
if (enc.isAsciiCompatible()) {
605+
if (enc.jcoding.isAsciiCompatible()) {
582606
fixedEnc[0] = null;
583607
} else {
584608
fixedEnc[0] = enc;
585609
}
586610

587-
boolean hasProperty = unescapeNonAscii(null, str, enc, fixedEnc, mode);
611+
boolean hasProperty = unescapeNonAscii(null, str.getRope(), enc, fixedEnc, mode);
588612
if (hasProperty && fixedEnc[0] == null) {
589613
fixedEnc[0] = enc;
590614
}
591615
}
592616

593-
public static RopeBuilder preprocessDRegexp(RubyContext context, Rope[] strings, RegexpOptions options)
617+
public static RopeWithEncoding preprocessDRegexp(RubyContext context, RopeWithEncoding[] strings,
618+
RegexpOptions options)
594619
throws DeferredRaiseException {
595620
assert strings.length > 0;
596621

597-
RopeBuilder string = RopeOperations.toRopeBuilderCopy(strings[0]);
622+
RopeBuilder string = RopeOperations.toRopeBuilderCopy(strings[0].getRope());
598623

599-
Encoding regexpEnc = processDRegexpElement(context, options, null, strings[0]);
624+
RubyEncoding regexpEnc = processDRegexpElement(context, options, null, strings[0]);
600625

601626
for (int i = 1; i < strings.length; i++) {
602-
Rope str = strings[i];
627+
RopeWithEncoding str = strings[i];
603628
regexpEnc = processDRegexpElement(context, options, regexpEnc, str);
604-
string.append(str);
629+
string.append(str.getRope());
605630
}
606631

607632
if (regexpEnc != null) {
608-
string.setEncoding(regexpEnc);
633+
string.setEncoding(regexpEnc.jcoding);
634+
} else {
635+
regexpEnc = strings[0].getEncoding();
609636
}
610-
611-
return string;
637+
return new RopeWithEncoding(RopeOperations.ropeFromRopeBuilder(string), regexpEnc);
612638
}
613639

614640
@TruffleBoundary
615-
private static Encoding processDRegexpElement(RubyContext context, RegexpOptions options, Encoding regexpEnc,
616-
Rope str) throws DeferredRaiseException {
617-
Encoding strEnc = str.getEncoding();
641+
private static RubyEncoding processDRegexpElement(RubyContext context, RegexpOptions options,
642+
RubyEncoding regexpEnc,
643+
RopeWithEncoding str) throws DeferredRaiseException {
644+
RubyEncoding strEnc = str.getEncoding();
618645

619-
if (options.isEncodingNone() && strEnc != ASCIIEncoding.INSTANCE) {
620-
if (str.getCodeRange() != CR_7BIT) {
646+
if (options.isEncodingNone() && strEnc != Encodings.BINARY) {
647+
if (str.getRope().getCodeRange() != CR_7BIT) {
621648
throw new RaiseException(
622649
context,
623650
context.getCoreExceptions().regexpError(
624651
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script",
625652
null));
626653
}
627-
strEnc = ASCIIEncoding.INSTANCE;
654+
strEnc = Encodings.BINARY;
628655
}
629656

630657
// This used to call preprocess, but the resulting rope builder was not
631658
// used. Since the preprocessing error-checking can be done without
632659
// creating a new rope builder, I added a "light" path.
633-
final Encoding[] fixedEnc = new Encoding[]{ null };
660+
final RubyEncoding[] fixedEnc = new RubyEncoding[]{ null };
634661
ClassicRegexp.preprocessLight(context, str, strEnc, fixedEnc, RegexpSupport.ErrorMode.PREPROCESS);
635662

636663
if (fixedEnc[0] != null) {
@@ -788,21 +815,21 @@ public static Pair<Rope, RubyEncoding> quote19(Rope bs, RubyEncoding encoding) {
788815
}
789816

790817
/** WARNING: This mutates options, so the caller should make sure it's a copy */
791-
static Encoding computeRegexpEncoding(RegexpOptions options, Encoding enc, Encoding[] fixedEnc)
818+
static RubyEncoding computeRegexpEncoding(RegexpOptions options, RubyEncoding enc, RubyEncoding[] fixedEnc)
792819
throws DeferredRaiseException {
793820
if (fixedEnc[0] != null) {
794821
if ((fixedEnc[0] != enc && options.isFixed()) ||
795-
(fixedEnc[0] != ASCIIEncoding.INSTANCE && options.isEncodingNone())) {
822+
(fixedEnc[0] != Encodings.BINARY && options.isEncodingNone())) {
796823
throw new DeferredRaiseException(context -> context
797824
.getCoreExceptions()
798825
.regexpError("incompatible character encoding", null));
799826
}
800-
if (fixedEnc[0] != ASCIIEncoding.INSTANCE) {
827+
if (fixedEnc[0] != Encodings.BINARY) {
801828
options.setFixed(true);
802829
enc = fixedEnc[0];
803830
}
804831
} else if (!options.isFixed()) {
805-
enc = USASCIIEncoding.INSTANCE;
832+
enc = Encodings.US_ASCII;
806833
}
807834

808835
if (fixedEnc[0] != null) {

src/main/java/org/truffleruby/core/regexp/EncodingCache.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@
1414
import java.util.function.Function;
1515

1616
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
17-
import org.jcodings.Encoding;
1817
import org.joni.Regex;
1918
import org.truffleruby.collections.ConcurrentOperations;
19+
import org.truffleruby.core.encoding.RubyEncoding;
2020

2121
public class EncodingCache {
22-
private final Map<Encoding, Regex> encodings;
22+
private final Map<RubyEncoding, Regex> encodings;
2323

2424
@TruffleBoundary
2525
public EncodingCache() {
2626
this.encodings = new ConcurrentHashMap<>();
2727
}
2828

29-
public Regex getOrCreate(Encoding encoding, Function<Encoding, Regex> function) {
29+
public Regex getOrCreate(RubyEncoding encoding, Function<RubyEncoding, Regex> function) {
3030
return ConcurrentOperations.getOrCompute(encodings, encoding, function);
3131
}
3232
}

0 commit comments

Comments
 (0)