Skip to content

Commit 30fd5fb

Browse files
committed
Add a new 'regexp-instrument-match-detailed' option for collecting data about the strings used in a match.
1 parent 6e0d39f commit 30fd5fb

File tree

4 files changed

+217
-53
lines changed

4 files changed

+217
-53
lines changed

src/main/java/org/truffleruby/core/regexp/TruffleRegexpNodes.java

Lines changed: 200 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717
import java.util.Set;
1818
import java.util.concurrent.ConcurrentHashMap;
1919
import java.util.concurrent.atomic.AtomicInteger;
20+
import java.util.concurrent.atomic.AtomicLong;
21+
import java.util.function.Function;
2022
import java.util.stream.Collectors;
2123

2224
import com.oracle.truffle.api.CompilerDirectives;
@@ -88,14 +90,21 @@ public class TruffleRegexpNodes {
8890

8991
/**
 * Records one regexp match for instrumentation purposes.
 *
 * The match is counted in {@code metricsMap} under a {@code MatchInfo} key built from the regexp's
 * cache key (source, encoding, options) and {@code fromStart}. When {@code collectDetailedStats} is
 * true, the rope and encoding of the matched string are additionally recorded in
 * {@code MATCHED_REGEXP_STATS} under the same key.
 *
 * @param metricsMap per-engine counter map to increment
 * @param regexp the regexp being matched
 * @param string the string being matched against; read through the uncached RubyStringLibrary
 * @param fromStart whether the match is anchored at the start position
 * @param collectDetailedStats whether to also record per-string statistics
 */
@TruffleBoundary
9092
private static void instrumentMatch(ConcurrentHashMap<MatchInfo, AtomicInteger> metricsMap, RubyRegexp regexp,
91-
Object string, boolean fromStart) {
93+
Object string, boolean fromStart, boolean collectDetailedStats) {
9294
Rope source = regexp.source;
9395
RegexpOptions options = regexp.options;
9496
TruffleRegexpNodes.MatchInfo matchInfo = new TruffleRegexpNodes.MatchInfo(
9597
new RegexpCacheKey(source, regexp.encoding, options, Hashing.NO_SEED),
96-
fromStart,
97-
RubyStringLibrary.getUncached().getEncoding(string));
98+
fromStart);
9899
ConcurrentOperations.getOrCompute(metricsMap, matchInfo, x -> new AtomicInteger()).incrementAndGet();
100+
101+
// The stats map is the same regardless of which metricsMap was passed in, and its entry is
// created only after the counter above has been updated.
if (collectDetailedStats) {
102+
final MatchInfoStats stats = ConcurrentOperations
103+
.getOrCompute(MATCHED_REGEXP_STATS, matchInfo, x -> new MatchInfoStats());
104+
stats.record(
105+
RubyStringLibrary.getUncached().getRope(string),
106+
RubyStringLibrary.getUncached().getEncoding(string));
107+
}
99108
}
100109

101110
// rb_reg_prepare_enc ... mostly. Some of the error checks are performed by callers of this method.
@@ -361,6 +370,36 @@ protected <T> RubyArray fillinInstrumentData(Map<T, AtomicInteger> map, ArrayBui
361370
}
362371
return createArray(arrayBuilderNode.finish(state, n), n);
363372
}
373+
374+
@TruffleBoundary
375+
protected static Set<RegexpCacheKey> allCompiledRegexps() {
376+
final Set<RegexpCacheKey> ret = new HashSet<>();
377+
378+
ret.addAll(COMPILED_REGEXPS_DYNAMIC.keySet());
379+
ret.addAll(COMPILED_REGEXPS_LITERAL.keySet());
380+
381+
return ret;
382+
}
383+
384+
@TruffleBoundary
385+
protected static Set<RegexpCacheKey> allMatchedRegexps() {
386+
final Set<RegexpCacheKey> ret = new HashSet<>();
387+
388+
ret.addAll(
389+
MATCHED_REGEXPS_JONI
390+
.keySet()
391+
.stream()
392+
.map(matchInfo -> matchInfo.regexpInfo)
393+
.collect(Collectors.toSet()));
394+
ret.addAll(
395+
MATCHED_REGEXPS_TREGEX
396+
.keySet()
397+
.stream()
398+
.map(matchInfo -> matchInfo.regexpInfo)
399+
.collect(Collectors.toSet()));
400+
401+
return ret;
402+
}
364403
}
365404

366405
@CoreMethod(names = "regexp_compilation_stats_array", onSingleton = true, required = 1)
@@ -390,28 +429,14 @@ protected Object buildStatsArray(boolean joniMatches,
390429
}
391430

392431
@CoreMethod(names = "unused_regexps_array", onSingleton = true, required = 0)
393-
public abstract static class UnusedRegexpsArray extends CoreMethodArrayArgumentsNode {
432+
public abstract static class UnusedRegexpsArray extends RegexpStatsNode {
394433

434+
@TruffleBoundary
395435
@Specialization
396436
protected Object buildUnusedRegexpsArray(
397437
@Cached ArrayBuilderNode arrayBuilderNode) {
398-
final Set<RegexpCacheKey> compiledRegexps = new HashSet<>();
399-
compiledRegexps.addAll(COMPILED_REGEXPS_DYNAMIC.keySet());
400-
compiledRegexps.addAll(COMPILED_REGEXPS_LITERAL.keySet());
401-
402-
final Set<RegexpCacheKey> matchedRegexps = new HashSet<>();
403-
matchedRegexps.addAll(
404-
MATCHED_REGEXPS_JONI
405-
.keySet()
406-
.stream()
407-
.map(matchInfo -> matchInfo.regexpInfo)
408-
.collect(Collectors.toSet()));
409-
matchedRegexps.addAll(
410-
MATCHED_REGEXPS_TREGEX
411-
.keySet()
412-
.stream()
413-
.map(matchInfo -> matchInfo.regexpInfo)
414-
.collect(Collectors.toSet()));
438+
final Set<RegexpCacheKey> compiledRegexps = allCompiledRegexps();
439+
final Set<RegexpCacheKey> matchedRegexps = allMatchedRegexps();
415440

416441
final Set<RegexpCacheKey> unusedRegexps = new HashSet<>(compiledRegexps);
417442
unusedRegexps.removeAll(matchedRegexps);
@@ -429,29 +454,14 @@ protected Object buildUnusedRegexpsArray(
429454
}
430455

431456
@CoreMethod(names = "compiled_regexp_hash_array", onSingleton = true, required = 0)
432-
public abstract static class CompiledRegexpHashArray extends CoreMethodArrayArgumentsNode {
457+
public abstract static class CompiledRegexpHashArray extends RegexpStatsNode {
433458

459+
@TruffleBoundary
434460
@Specialization
435461
protected Object buildInfoArray(
436462
@Cached ArrayBuilderNode arrayBuilderNode,
437-
@CachedLibrary(limit = "3") HashStoreLibrary hashStoreLibrary) {
438-
final Set<RegexpCacheKey> compiledRegexps = new HashSet<>();
439-
compiledRegexps.addAll(COMPILED_REGEXPS_DYNAMIC.keySet());
440-
compiledRegexps.addAll(COMPILED_REGEXPS_LITERAL.keySet());
441-
442-
final Set<RegexpCacheKey> matchedRegexps = new HashSet<>();
443-
matchedRegexps.addAll(
444-
MATCHED_REGEXPS_JONI
445-
.keySet()
446-
.stream()
447-
.map(matchInfo -> matchInfo.regexpInfo)
448-
.collect(Collectors.toSet()));
449-
matchedRegexps.addAll(
450-
MATCHED_REGEXPS_TREGEX
451-
.keySet()
452-
.stream()
453-
.map(matchInfo -> matchInfo.regexpInfo)
454-
.collect(Collectors.toSet()));
463+
@CachedLibrary(limit = "1") HashStoreLibrary hashStoreLibrary) {
464+
final Set<RegexpCacheKey> matchedRegexps = allMatchedRegexps();
455465

456466
final int arraySize = COMPILED_REGEXPS_LITERAL.size() + COMPILED_REGEXPS_DYNAMIC.size();
457467
final BuilderState state = arrayBuilderNode.start(arraySize);
@@ -513,7 +523,10 @@ protected static RubyHash buildRegexInfoHash(RubyContext context, RubyLanguage l
513523
hashStoreLibrary.set(hash.store, hash, language.getSymbol("isLiteral"), isRegexpLiteral.get(), true);
514524
}
515525

516-
hashStoreLibrary.set(hash.store, hash, language.getSymbol("isUsed"), isUsed, true);
526+
if (context.getOptions().REGEXP_INSTRUMENT_MATCH) {
527+
hashStoreLibrary.set(hash.store, hash, language.getSymbol("isUsed"), isUsed, true);
528+
}
529+
517530
hashStoreLibrary.set(hash.store, hash, language.getSymbol("encoding"), regexpInfo.getEncoding(), true);
518531
hashStoreLibrary.set(
519532
hash.store,
@@ -529,8 +542,9 @@ protected static RubyHash buildRegexInfoHash(RubyContext context, RubyLanguage l
529542
}
530543

531544
@CoreMethod(names = "matched_regexp_hash_array", onSingleton = true, required = 0)
532-
public abstract static class MatchedRegexpHashArray extends CoreMethodArrayArgumentsNode {
545+
public abstract static class MatchedRegexpHashArray extends RegexpStatsNode {
533546

547+
@TruffleBoundary
534548
@Specialization
535549
protected Object buildInfoArray(
536550
@Cached ArrayBuilderNode arrayBuilderNode,
@@ -561,12 +575,18 @@ private void processGroup(ConcurrentHashMap<MatchInfo, AtomicInteger> group,
561575
.appendValue(
562576
state,
563577
offset + n,
564-
buildHash(hashStoreLibrary, isTRegexMatch, entry.getKey(), entry.getValue()));
578+
buildHash(
579+
hashStoreLibrary,
580+
arrayBuilderNode,
581+
isTRegexMatch,
582+
entry.getKey(),
583+
entry.getValue()));
565584
n++;
566585
}
567586
}
568587

569-
private RubyHash buildHash(HashStoreLibrary hashStoreLibrary, boolean isTRegexMatch, MatchInfo matchInfo,
588+
private RubyHash buildHash(HashStoreLibrary hashStoreLibrary, ArrayBuilderNode arrayBuilderNode,
589+
boolean isTRegexMatch, MatchInfo matchInfo,
570590
AtomicInteger count) {
571591
final RubyHash regexpInfoHash = CompiledRegexpHashArray.buildRegexInfoHash(
572592
getContext(),
@@ -591,10 +611,99 @@ private RubyHash buildHash(HashStoreLibrary hashStoreLibrary, boolean isTRegexMa
591611
matchInfo.matchStart,
592612
true);
593613

614+
if (getContext().getOptions().REGEXP_INSTRUMENT_MATCH_DETAILED) {
615+
hashStoreLibrary.set(
616+
matchInfoHash.store,
617+
matchInfoHash,
618+
getLanguage().getSymbol("match_stats"),
619+
buildMatchInfoStatsHash(hashStoreLibrary, arrayBuilderNode, matchInfo),
620+
true);
621+
}
622+
594623
assert hashStoreLibrary.verify(matchInfoHash.store, matchInfoHash);
595624

596625
return matchInfoHash;
597626
}
627+
628+
private RubyHash buildMatchInfoStatsHash(HashStoreLibrary hashStoreLibrary, ArrayBuilderNode arrayBuilderNode,
629+
MatchInfo matchInfo) {
630+
final MatchInfoStats stats = MATCHED_REGEXP_STATS.get(matchInfo);
631+
final RubyHash ret = HashOperations.newEmptyHash(getContext(), getLanguage());
632+
633+
buildAndSetDistributionHash(
634+
hashStoreLibrary,
635+
ret,
636+
"byte_array_populated",
637+
stats.byteArrayPopulatedFrequencies,
638+
Optional.empty(),
639+
Optional.of(count -> count.get()));
640+
641+
buildAndSetDistributionHash(
642+
hashStoreLibrary,
643+
ret,
644+
"byte_lengths",
645+
stats.byteLengthFrequencies,
646+
Optional.empty(),
647+
Optional.of(count -> count.get()));
648+
649+
buildAndSetDistributionHash(
650+
hashStoreLibrary,
651+
ret,
652+
"character_lengths",
653+
stats.characterLengthFrequencies,
654+
Optional.empty(),
655+
Optional.of(count -> count.get()));
656+
657+
buildAndSetDistributionHash(
658+
hashStoreLibrary,
659+
ret,
660+
"code_ranges",
661+
stats.codeRangeFrequencies,
662+
Optional.of(codeRange -> getLanguage().getSymbol(codeRange.toString())),
663+
Optional.of(count -> count.get()));
664+
665+
buildAndSetDistributionHash(
666+
hashStoreLibrary,
667+
ret,
668+
"encodings",
669+
stats.encodingFrequencies,
670+
Optional.empty(),
671+
Optional.of(count -> count.get()));
672+
673+
buildAndSetDistributionHash(
674+
hashStoreLibrary,
675+
ret,
676+
"rope_types",
677+
stats.ropeClassFrequencies,
678+
Optional.of(
679+
className -> StringOperations.createUTF8String(
680+
getContext(),
681+
getLanguage(),
682+
StringOperations.encodeRope(className, UTF8Encoding.INSTANCE))),
683+
Optional.of(count -> count.get()));
684+
685+
return ret;
686+
}
687+
688+
private <K, V> void buildAndSetDistributionHash(HashStoreLibrary hashStoreLibrary, RubyHash hash,
689+
String keyName, ConcurrentHashMap<K, V> distribution, Optional<Function<K, Object>> keyMapper,
690+
Optional<Function<V, Object>> valueMapper) {
691+
final RubyHash distributionHash = HashOperations.toRubyHash(
692+
getContext(),
693+
getLanguage(),
694+
hashStoreLibrary,
695+
distribution,
696+
keyMapper,
697+
valueMapper,
698+
true);
699+
700+
hashStoreLibrary.set(
701+
hash.store,
702+
hash,
703+
getLanguage().getSymbol(keyName),
704+
distributionHash,
705+
true);
706+
}
598707
}
599708

600709
@Primitive(name = "regexp_initialized?")
@@ -693,7 +802,12 @@ protected Object matchInRegionTRegex(
693802
}
694803

695804
if (getContext().getOptions().REGEXP_INSTRUMENT_MATCH) {
696-
TruffleRegexpNodes.instrumentMatch(MATCHED_REGEXPS_TREGEX, regexp, string, atStart);
805+
TruffleRegexpNodes.instrumentMatch(
806+
MATCHED_REGEXPS_TREGEX,
807+
regexp,
808+
string,
809+
atStart,
810+
getContext().getOptions().REGEXP_INSTRUMENT_MATCH_DETAILED);
697811
}
698812

699813
int fromIndex = fromPos;
@@ -841,7 +955,12 @@ protected Object executeMatch(
841955
RubyRegexp regexp, Object string, Matcher matcher, int startPos, int range, boolean onlyMatchAtStart,
842956
@Cached ConditionProfile matchesProfile) {
843957
if (getContext().getOptions().REGEXP_INSTRUMENT_MATCH) {
844-
TruffleRegexpNodes.instrumentMatch(MATCHED_REGEXPS_JONI, regexp, string, onlyMatchAtStart);
958+
TruffleRegexpNodes.instrumentMatch(
959+
MATCHED_REGEXPS_JONI,
960+
regexp,
961+
string,
962+
onlyMatchAtStart,
963+
getContext().getOptions().REGEXP_INSTRUMENT_MATCH_DETAILED);
845964
}
846965

847966
int match = runMatch(matcher, startPos, range, onlyMatchAtStart);
@@ -896,13 +1015,11 @@ static final class MatchInfo {
8961015

8971016
private final RegexpCacheKey regexpInfo;
8981017
private final boolean matchStart;
899-
private final RubyEncoding matchEncoding;
9001018

901-
MatchInfo(RegexpCacheKey regexpInfo, boolean matchStart, RubyEncoding matchEncoding) {
1019+
MatchInfo(RegexpCacheKey regexpInfo, boolean matchStart) {
9021020
assert regexpInfo != null;
9031021
this.regexpInfo = regexpInfo;
9041022
this.matchStart = matchStart;
905-
this.matchEncoding = matchEncoding;
9061023
}
9071024

9081025
@Override
@@ -921,24 +1038,54 @@ public boolean equals(Object obj) {
9211038
}
9221039

9231040
MatchInfo other = (MatchInfo) obj;
924-
return matchStart == other.matchStart && matchEncoding == other.matchEncoding &&
1041+
return matchStart == other.matchStart &&
9251042
regexpInfo.equals(other.regexpInfo);
9261043
}
9271044

9281045
@Override
9291046
public String toString() {
9301047
return String.format(
931-
"Match (%s, fromStart = %s, encoding = %s)",
1048+
"Match (%s, fromStart = %s)",
9321049
regexpInfo,
933-
matchStart,
934-
RopeOperations.decodeOrEscapeBinaryRope(matchEncoding.name.rope));
1050+
matchStart);
1051+
}
1052+
}
1053+
1054+
static final class MatchInfoStats {
1055+
1056+
private final ConcurrentHashMap<Boolean, AtomicLong> byteArrayPopulatedFrequencies = new ConcurrentHashMap<>();
1057+
private final ConcurrentHashMap<Integer, AtomicLong> byteLengthFrequencies = new ConcurrentHashMap<>();
1058+
private final ConcurrentHashMap<Integer, AtomicLong> characterLengthFrequencies = new ConcurrentHashMap<>();
1059+
private final ConcurrentHashMap<CodeRange, AtomicLong> codeRangeFrequencies = new ConcurrentHashMap<>();
1060+
private final ConcurrentHashMap<RubyEncoding, AtomicLong> encodingFrequencies = new ConcurrentHashMap<>();
1061+
private final ConcurrentHashMap<String, AtomicLong> ropeClassFrequencies = new ConcurrentHashMap<>();
1062+
1063+
private void record(Rope rope, RubyEncoding encoding) {
1064+
ConcurrentOperations
1065+
.getOrCompute(byteArrayPopulatedFrequencies, rope.getRawBytes() != null, x -> new AtomicLong())
1066+
.incrementAndGet();
1067+
ConcurrentOperations
1068+
.getOrCompute(byteLengthFrequencies, rope.byteLength(), x -> new AtomicLong())
1069+
.incrementAndGet();
1070+
ConcurrentOperations
1071+
.getOrCompute(characterLengthFrequencies, rope.characterLength(), x -> new AtomicLong())
1072+
.incrementAndGet();
1073+
ConcurrentOperations
1074+
.getOrCompute(codeRangeFrequencies, rope.getCodeRange(), x -> new AtomicLong())
1075+
.incrementAndGet();
1076+
ConcurrentOperations.getOrCompute(encodingFrequencies, encoding, x -> new AtomicLong()).incrementAndGet();
1077+
ConcurrentOperations
1078+
.getOrCompute(ropeClassFrequencies, rope.getClass().getSimpleName(), x -> new AtomicLong())
1079+
.incrementAndGet();
9351080
}
1081+
9361082
}
9371083

9381084
// Static instrumentation tables.
// COMPILED_REGEXPS_DYNAMIC / COMPILED_REGEXPS_LITERAL: counters keyed by regexp cache key
//   (presumably compilation counts for dynamic vs. literal regexps; populated outside this view).
// MATCHED_REGEXPS_JONI / MATCHED_REGEXPS_TREGEX: match counters keyed by MatchInfo, incremented
//   by instrumentMatch for whichever of the two maps the caller passes in.
// MATCHED_REGEXP_STATS: detailed per-string statistics keyed by MatchInfo; instrumentMatch fills
//   this single map no matter which counter map was passed, and only when detailed stats are on.
private static ConcurrentHashMap<RegexpCacheKey, AtomicInteger> COMPILED_REGEXPS_DYNAMIC = new ConcurrentHashMap<>();
9391085
private static ConcurrentHashMap<RegexpCacheKey, AtomicInteger> COMPILED_REGEXPS_LITERAL = new ConcurrentHashMap<>();
9401086
private static ConcurrentHashMap<MatchInfo, AtomicInteger> MATCHED_REGEXPS_JONI = new ConcurrentHashMap<>();
9411087
private static ConcurrentHashMap<MatchInfo, AtomicInteger> MATCHED_REGEXPS_TREGEX = new ConcurrentHashMap<>();
1088+
private static ConcurrentHashMap<MatchInfo, MatchInfoStats> MATCHED_REGEXP_STATS = new ConcurrentHashMap<>();
9421089

9431090
/** WARNING: computeRegexpEncoding() mutates options, so the caller should make sure it's a copy */
9441091
@TruffleBoundary

0 commit comments

Comments
 (0)