Skip to content

Commit d16c5b0

Browse files
committed
Allow for substring lookups in the frozen string table.
Previously, we always extracted the precise range of bytes we needed when looking up an entry in the frozen string table. If an entry already existed, we'd discard that extracted range in favor of what was already in the cache. This change defers making a copy of the string's bytes until we actually need to insert into the cache. For situations with many cache hits, this approach can be much faster.
1 parent 66ca27b commit d16c5b0

File tree

4 files changed

+97
-16
lines changed

4 files changed

+97
-16
lines changed

src/main/java/org/truffleruby/core/encoding/TStringUtils.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ public static TruffleString.Encoding jcodingToTEncoding(Encoding jcoding) {
4444
}
4545

4646
public static TruffleString fromByteArray(byte[] bytes, TruffleString.Encoding tencoding) {
47+
return fromByteArray(bytes, 0, bytes.length, tencoding);
48+
}
49+
50+
public static TruffleString fromByteArray(byte[] bytes, int offset, int length, TruffleString.Encoding tencoding) {
4751
CompilerAsserts.neverPartOfCompilation(
4852
"Use createString(TruffleString.FromByteArrayNode, byte[], RubyEncoding) instead");
49-
return TruffleString.fromByteArrayUncached(bytes, 0, bytes.length, tencoding, false);
53+
return TruffleString.fromByteArrayUncached(bytes, offset, length, tencoding, false);
5054
}
5155

5256
public static TruffleString fromByteArray(byte[] bytes, RubyEncoding rubyEncoding) {

src/main/java/org/truffleruby/core/string/FrozenStringLiterals.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,16 @@ public ImmutableRubyString getFrozenStringLiteral(TruffleString tstring, RubyEnc
4141
throw CompilerDirectives.shouldNotReachHere();
4242
}
4343

44-
return getFrozenStringLiteral(TStringUtils.getBytesOrCopy(tstring, encoding), encoding);
44+
// Ensure every ImmutableRubyString has a TruffleString that comes from the TStringCache
45+
var cachedTString = tstringCache.getTString(tstring, encoding);
46+
var tstringWithEncoding = new TStringWithEncoding(cachedTString, encoding);
47+
48+
final ImmutableRubyString string = values.get(tstringWithEncoding);
49+
if (string != null) {
50+
return string;
51+
} else {
52+
return getFrozenStringLiteral(TStringUtils.getBytesOrCopy(tstring, encoding), encoding);
53+
}
4554
}
4655

4756
@TruffleBoundary

src/main/java/org/truffleruby/core/string/TBytesKey.java

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,34 @@
1212
import java.util.Arrays;
1313
import java.util.Objects;
1414

15+
import com.oracle.truffle.api.strings.InternalByteArray;
1516
import com.oracle.truffle.api.strings.TruffleString;
17+
import org.truffleruby.core.array.ArrayUtils;
1618
import org.truffleruby.core.encoding.RubyEncoding;
19+
import org.truffleruby.core.encoding.TStringUtils;
1720

1821
public final class TBytesKey {
1922

2023
private final byte[] bytes;
24+
private final int offset;
25+
private final int length;
2126
private RubyEncoding encoding;
2227
private final int bytesHashCode;
2328

24-
public TBytesKey(byte[] bytes, RubyEncoding encoding) {
29+
public TBytesKey(byte[] bytes, int offset, int length, int bytesHashCode, RubyEncoding encoding) {
2530
this.bytes = bytes;
31+
this.offset = offset;
32+
this.length = length;
33+
this.bytesHashCode = bytesHashCode;
2634
this.encoding = encoding;
27-
this.bytesHashCode = Arrays.hashCode(bytes);
35+
}
36+
37+
public TBytesKey(byte[] bytes, RubyEncoding encoding) {
38+
this(bytes, 0, bytes.length, Arrays.hashCode(bytes), encoding);
39+
}
40+
41+
public TBytesKey(InternalByteArray byteArray, RubyEncoding encoding) {
42+
this(byteArray.getArray(), byteArray.getOffset(), byteArray.getLength(), hashCode(byteArray), encoding);
2843
}
2944

3045
@Override
@@ -37,15 +52,15 @@ public boolean equals(Object o) {
3752
if (o instanceof TBytesKey) {
3853
final TBytesKey other = (TBytesKey) o;
3954
if (encoding == null) {
40-
if (Arrays.equals(bytes, other.bytes)) {
55+
if (equalBytes(this, other)) {
4156
// For getMatchedEncoding()
4257
this.encoding = Objects.requireNonNull(other.encoding);
4358
return true;
4459
} else {
4560
return false;
4661
}
4762
} else {
48-
return encoding == other.encoding && Arrays.equals(bytes, other.bytes);
63+
return encoding == other.encoding && equalBytes(this, other);
4964
}
5065
}
5166

@@ -62,4 +77,48 @@ public String toString() {
6277
return TruffleString.fromByteArrayUncached(bytes, encoding, false).toString();
6378
}
6479

80+
private static int hashCode(InternalByteArray byteArray) {
81+
return hashCode(byteArray.getArray(), byteArray.getOffset(), byteArray.getLength());
82+
}
83+
84+
// A variant of Arrays.hashCode(byte[]) that hashes only the range [offset, offset + length) of the array; it yields the same value as Arrays.hashCode on a copy of that range.
85+
private static int hashCode(byte[] bytes, int offset, int length) {
86+
if (bytes == null) {
87+
return 0;
88+
}
89+
90+
int result = 1;
91+
for (int i = offset; i < offset + length; i++) {
92+
result = 31 * result + bytes[i];
93+
}
94+
95+
return result;
96+
}
97+
98+
private boolean equalBytes(TBytesKey a, TBytesKey b) {
99+
return Arrays.equals(a.bytes, a.offset, a.offset + a.length, b.bytes, b.offset, b.offset + b.length);
100+
}
101+
102+
private boolean isPerfectFit() {
103+
return offset == 0 && length == bytes.length;
104+
}
105+
106+
public TBytesKey makeCacheable() {
107+
if (isPerfectFit()) {
108+
// TODO (nirvdrum 2023-Jun-17): We can avoid cloning the key if we know the byte array came from an immutable string.
109+
return new TBytesKey(bytes.clone(), encoding);
110+
}
111+
112+
var simplified = ArrayUtils.extractRange(this.bytes, this.offset, this.offset + this.length);
113+
return new TBytesKey(simplified, encoding);
114+
}
115+
116+
public TBytesKey withNewEncoding(RubyEncoding encoding) {
117+
return new TBytesKey(bytes, offset, length, bytesHashCode, encoding);
118+
}
119+
120+
public TruffleString toTruffleString() {
121+
return TStringUtils.fromByteArray(bytes, offset, length, encoding.tencoding);
122+
}
123+
65124
}

src/main/java/org/truffleruby/core/string/TStringCache.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,20 +69,30 @@ private void register(TruffleString tstring, RubyEncoding encoding) {
6969
}
7070
}
7171

72-
public TruffleString getTString(TruffleString string, RubyEncoding encoding) {
73-
return getTString(TStringUtils.getBytesOrCopy(string, encoding), encoding);
72+
@TruffleBoundary
73+
public TruffleString getTString(TruffleString string, RubyEncoding rubyEncoding) {
74+
assert rubyEncoding != null;
75+
76+
var byteArray = string.getInternalByteArrayUncached(rubyEncoding.tencoding);
77+
final TBytesKey key = new TBytesKey(byteArray, rubyEncoding);
78+
79+
return getTString(key);
7480
}
7581

76-
@TruffleBoundary
7782
public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
7883
assert rubyEncoding != null;
7984

80-
final TBytesKey key = new TBytesKey(bytes, rubyEncoding);
85+
return getTString(new TBytesKey(bytes, rubyEncoding));
86+
}
87+
88+
@TruffleBoundary
89+
private TruffleString getTString(TBytesKey lookupKey) {
90+
final TruffleString tstring = bytesToTString.get(lookupKey);
91+
var rubyEncoding = lookupKey.getMatchedEncoding();
8192

82-
final TruffleString tstring = bytesToTString.get(key);
8393
if (tstring != null) {
8494
++tstringsReusedCount;
85-
tstringBytesSaved += tstring.byteLength(rubyEncoding.tencoding);
95+
tstringBytesSaved += tstring.byteLength(lookupKey.getMatchedEncoding().tencoding);
8696

8797
return tstring;
8898
}
@@ -92,7 +102,7 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
92102
// reference equality optimizations. So, do another search but with a marker encoding. The only guarantee
93103
// we can make about the resulting TruffleString is that it would have the same logical byte[], but that's good enough
94104
// for our purposes.
95-
TBytesKey keyNoEncoding = new TBytesKey(bytes, null);
105+
TBytesKey keyNoEncoding = lookupKey.withNewEncoding(null);
96106
final TruffleString tstringWithSameBytesButDifferentEncoding = bytesToTString.get(keyNoEncoding);
97107

98108
final TruffleString newTString;
@@ -104,12 +114,11 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
104114
++byteArrayReusedCount;
105115
tstringBytesSaved += newTString.byteLength(rubyEncoding.tencoding);
106116
} else {
107-
newTString = TStringUtils.fromByteArray(bytes, rubyEncoding);
117+
newTString = lookupKey.toTruffleString();
108118
}
109119

110120
// Insert under a key that owns a right-sized copy of the bytes, so we do not keep the lookup's (possibly larger or mutable) backing array alive unnecessarily.
111-
final TBytesKey newKey = new TBytesKey(TStringUtils.getBytesOrCopy(newTString, rubyEncoding), rubyEncoding);
112-
return bytesToTString.addInCacheIfAbsent(newKey, newTString);
121+
return bytesToTString.addInCacheIfAbsent(lookupKey.makeCacheable(), newTString);
113122
}
114123

115124
public boolean contains(TruffleString string, RubyEncoding encoding) {

0 commit comments

Comments
 (0)