Skip to content

Commit 5593022

Browse files
committed
[GR-19220] Shift TruffleBoundary around in Truffle::RegexOperations (#2374)
PullRequest: truffleruby/2732
2 parents 8258edb + f828a77 commit 5593022

File tree

5 files changed

+120
-72
lines changed

5 files changed

+120
-72
lines changed

src/main/java/org/truffleruby/core/regexp/TruffleRegexpNodes.java

Lines changed: 84 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import java.util.concurrent.ConcurrentHashMap;
1717
import java.util.concurrent.atomic.AtomicInteger;
1818

19-
import com.oracle.truffle.api.CompilerDirectives;
2019
import com.oracle.truffle.api.library.CachedLibrary;
2120
import com.oracle.truffle.api.profiles.BranchProfile;
2221
import com.oracle.truffle.api.source.Source;
@@ -41,6 +40,7 @@
4140
import org.truffleruby.core.array.ArrayBuilderNode;
4241
import org.truffleruby.core.array.ArrayBuilderNode.BuilderState;
4342
import org.truffleruby.core.array.RubyArray;
43+
import org.truffleruby.core.encoding.EncodingNodes;
4444
import org.truffleruby.core.encoding.RubyEncoding;
4545
import org.truffleruby.core.kernel.KernelNodes.SameOrEqualNode;
4646
import org.truffleruby.core.regexp.RegexpNodes.ToSNode;
@@ -75,34 +75,76 @@
7575
@CoreModule("Truffle::RegexpOperations")
7676
public class TruffleRegexpNodes {
7777

78-
@TruffleBoundary
79-
public static Matcher createMatcher(RubyContext context, RubyRegexp regexp, Rope stringRope, byte[] stringBytes,
80-
boolean encodingConversion, int start, Node currentNode) {
81-
final Encoding enc = checkEncoding(regexp, stringRope.getEncoding(), stringRope.getCodeRange());
82-
Regex regex = regexp.regex;
78+
// rb_reg_prepare_enc ... mostly. Some of the error checks are performed by callers of this method.
79+
public abstract static class CheckEncodingNode extends RubyContextNode {
80+
81+
@Child RopeNodes.CodeRangeNode codeRangeNode = RopeNodes.CodeRangeNode.create();
82+
@Child RubyStringLibrary stringLibrary = RubyStringLibrary.getFactory().createDispatched(2);
8383

84-
if (encodingConversion && regex.getEncoding() != enc) {
85-
EncodingCache encodingCache = regexp.cachedEncodings;
86-
regex = encodingCache.getOrCreate(enc, e -> makeRegexpForEncoding(context, regexp, e, currentNode));
84+
public static CheckEncodingNode create() {
85+
return TruffleRegexpNodesFactory.CheckEncodingNodeGen.create();
8786
}
8887

89-
return regex.matcher(stringBytes, start, stringBytes.length);
90-
}
88+
public final Encoding executeCheckEncoding(RubyRegexp regexp, Object string) {
89+
return executeInternal(regexp, stringLibrary.getRope(string));
90+
}
9191

92-
@TruffleBoundary
93-
public static Encoding checkEncoding(RubyRegexp regexp, Encoding strEnc, CodeRange codeRange) {
94-
final Encoding regexEnc = regexp.regex.getEncoding();
92+
public abstract Encoding executeInternal(RubyRegexp regexp, Rope rope);
93+
94+
@Specialization(guards = {
95+
"!isSameEncoding(regexp, rope)",
96+
"isUSASCII(regexp, rope)"
97+
})
98+
protected Encoding checkEncodingAsciiOnly(RubyRegexp regexp, Rope rope) {
99+
return USASCIIEncoding.INSTANCE;
100+
}
95101

96-
if (strEnc == regexEnc) {
97-
return regexEnc;
98-
} else if (regexEnc == USASCIIEncoding.INSTANCE && codeRange == CodeRange.CR_7BIT) {
99-
return regexEnc;
100-
} else if (strEnc.isAsciiCompatible() && regexp.options.isFixed()) {
101-
return regexEnc;
102+
@Specialization(guards = {
103+
"isSameEncoding(regexp, rope)"
104+
})
105+
protected Encoding checkEncodingSameEncoding(RubyRegexp regexp, Rope rope) {
106+
return regexp.regex.getEncoding();
102107
}
103-
return strEnc;
108+
109+
@Specialization(guards = {
110+
"!isSameEncoding(regexp, rope)",
111+
"!isUSASCII(regexp, rope)",
112+
"isFixedEncoding(regexp, rope)",
113+
})
114+
protected Encoding checkEncodingFixedEncoding(RubyRegexp regexp, Rope rope) {
115+
return regexp.regex.getEncoding();
116+
}
117+
118+
@Specialization(guards = {
119+
"!isSameEncoding(regexp, rope)",
120+
"!isUSASCII(regexp, rope)",
121+
"!isFixedEncoding(regexp, rope)"
122+
})
123+
protected Encoding fallback(RubyRegexp regexp, Rope rope) {
124+
return rope.encoding;
125+
}
126+
127+
protected boolean isSameEncoding(RubyRegexp regexp, Rope rope) {
128+
return regexp.regex.getEncoding() == rope.encoding;
129+
}
130+
131+
protected boolean isUSASCII(RubyRegexp regexp, Rope rope) {
132+
return regexp.regex.getEncoding() == USASCIIEncoding.INSTANCE &&
133+
codeRangeNode.execute(rope) == CodeRange.CR_7BIT;
134+
}
135+
136+
protected boolean isFixedEncoding(RubyRegexp regexp, Rope rope) {
137+
return regexp.options.isFixed() && rope.encoding.isAsciiCompatible();
138+
}
139+
140+
}
141+
142+
@TruffleBoundary
143+
private static Matcher getMatcher(Regex regex, byte[] stringBytes, int start) {
144+
return regex.matcher(stringBytes, start, stringBytes.length);
104145
}
105146

147+
@TruffleBoundary
106148
private static Regex makeRegexpForEncoding(RubyContext context, RubyRegexp regexp, Encoding enc, Node currentNode) {
107149
final Encoding[] fixedEnc = new Encoding[]{ null };
108150
final Rope sourceRope = regexp.source;
@@ -197,26 +239,16 @@ public RubyRegexp createRegexp(Rope pattern) throws DeferredRaiseException {
197239
}
198240
}
199241

200-
@CoreMethod(names = "select_encoding", onSingleton = true, required = 3)
242+
@CoreMethod(names = "select_encoding", onSingleton = true, required = 2)
201243
public abstract static class SelectEncodingNode extends CoreMethodArrayArgumentsNode {
202244

203-
@Child RopeNodes.CodeRangeNode codeRangeNode;
204-
205245
@Specialization(guards = "libString.isRubyString(str)")
206-
protected RubyEncoding selectEncoding(RubyRegexp re, Object str, boolean encodingConversion,
246+
protected RubyEncoding selectEncoding(RubyRegexp re, Object str,
247+
@Cached EncodingNodes.GetRubyEncodingNode getRubyEncodingNode,
248+
@Cached CheckEncodingNode checkEncodingNode,
207249
@CachedLibrary(limit = "2") RubyStringLibrary libString) {
208-
Encoding encoding;
209-
if (encodingConversion) {
210-
Rope stringRope = libString.getRope(str);
211-
if (codeRangeNode == null) {
212-
CompilerDirectives.transferToInterpreterAndInvalidate();
213-
codeRangeNode = insert(RopeNodes.CodeRangeNode.create());
214-
}
215-
encoding = checkEncoding(re, stringRope.getEncoding(), codeRangeNode.execute(stringRope));
216-
} else {
217-
encoding = re.regex.getEncoding();
218-
}
219-
return getContext().getEncodingManager().getRubyEncoding(encoding);
250+
final Encoding encoding = checkEncodingNode.executeCheckEncoding(re, str);
251+
return getRubyEncodingNode.executeGetRubyEncoding(encoding);
220252
}
221253
}
222254

@@ -359,7 +391,7 @@ protected boolean initialized(RubyRegexp regexp) {
359391
}
360392
}
361393

362-
@Primitive(name = "regexp_match_in_region", lowerFixnum = { 2, 3, 6 })
394+
@Primitive(name = "regexp_match_in_region", lowerFixnum = { 2, 3, 5 })
363395
public abstract static class MatchInRegionNode extends PrimitiveArrayArgumentsNode {
364396

365397
/** Matches a regular expression against a string over the specified range of characters.
@@ -375,32 +407,27 @@ public abstract static class MatchInRegionNode extends PrimitiveArrayArgumentsNo
375407
* @param atStart Whether to only match at the beginning of the string, if false then the regexp can have any
376408
* amount of prematch.
377409
*
378-
* @param encodingConversion Whether to attempt encoding conversion of the regexp to match the string
379-
*
380410
* @param startPos The position within the string which the matcher should consider the start. Setting this to
381411
* the from position allows scanners to match starting partway through a string while still setting
382412
* atStart and thus forcing the match to be at the specific starting position. */
383413
@Specialization(guards = "libString.isRubyString(string)")
384414
protected Object matchInRegion(
385-
RubyRegexp regexp,
386-
Object string,
387-
int fromPos,
388-
int toPos,
389-
boolean atStart,
390-
boolean encodingConversion,
391-
int startPos,
415+
RubyRegexp regexp, Object string, int fromPos, int toPos, boolean atStart, int startPos,
416+
@Cached ConditionProfile encodingMismatchProfile,
392417
@Cached RopeNodes.BytesNode bytesNode,
393-
@Cached TruffleRegexpNodes.MatchNode matchNode,
418+
@Cached MatchNode matchNode,
419+
@Cached CheckEncodingNode checkEncodingNode,
394420
@CachedLibrary(limit = "2") RubyStringLibrary libString) {
395-
Rope rope = libString.getRope(string);
396-
Matcher matcher = createMatcher(
397-
getContext(),
398-
regexp,
399-
rope,
400-
bytesNode.execute(rope),
401-
encodingConversion,
402-
startPos,
403-
this);
421+
final Rope rope = libString.getRope(string);
422+
final Encoding enc = checkEncodingNode.executeCheckEncoding(regexp, string);
423+
Regex regex = regexp.regex;
424+
425+
if (encodingMismatchProfile.profile(regex.getEncoding() != enc)) {
426+
final EncodingCache encodingCache = regexp.cachedEncodings;
427+
regex = encodingCache.getOrCreate(enc, e -> makeRegexpForEncoding(getContext(), regexp, e, this));
428+
}
429+
430+
final Matcher matcher = getMatcher(regex, bytesNode.execute(rope), startPos);
404431
return matchNode.execute(regexp, string, matcher, fromPos, toPos, atStart);
405432
}
406433
}

src/main/java/org/truffleruby/options/Options.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ public class Options {
129129
public final boolean WARN_EXPERIMENTAL;
130130
/** --use-truffle-regex=false */
131131
public final boolean USE_TRUFFLE_REGEX;
132+
/** --warn-truffle-regex-fallback=false */
133+
public final boolean WARN_TRUFFLE_REGEX_FALLBACK;
132134
/** --argv-globals=false */
133135
public final boolean ARGV_GLOBALS;
134136
/** --chomp-loop=false */
@@ -246,6 +248,7 @@ public Options(Env env, OptionValues options, LanguageOptions languageOptions) {
246248
WARN_DEPRECATED = options.get(OptionsCatalog.WARN_DEPRECATED_KEY);
247249
WARN_EXPERIMENTAL = options.get(OptionsCatalog.WARN_EXPERIMENTAL_KEY);
248250
USE_TRUFFLE_REGEX = options.get(OptionsCatalog.USE_TRUFFLE_REGEX_KEY);
251+
WARN_TRUFFLE_REGEX_FALLBACK = options.get(OptionsCatalog.WARN_TRUFFLE_REGEX_FALLBACK_KEY);
249252
ARGV_GLOBALS = options.get(OptionsCatalog.ARGV_GLOBALS_KEY);
250253
CHOMP_LOOP = options.get(OptionsCatalog.CHOMP_LOOP_KEY);
251254
GETS_LOOP = options.get(OptionsCatalog.GETS_LOOP_KEY);
@@ -387,6 +390,8 @@ public Object fromDescriptor(OptionDescriptor descriptor) {
387390
return WARN_EXPERIMENTAL;
388391
case "ruby.use-truffle-regex":
389392
return USE_TRUFFLE_REGEX;
393+
case "ruby.warn-truffle-regex-fallback":
394+
return WARN_TRUFFLE_REGEX_FALLBACK;
390395
case "ruby.argv-globals":
391396
return ARGV_GLOBALS;
392397
case "ruby.chomp-loop":

src/main/ruby/truffleruby/core/truffle/regexp_operations.rb

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,15 @@ def self.search_region(re, str, start_index, end_index, forward)
3131
from = end_index
3232
to = start_index
3333
end
34-
match_in_region(re, str, from, to, false, true, 0)
34+
match_in_region(re, str, from, to, false, 0)
3535
end
3636

3737
# This path is used by some string and scanner methods and allows
3838
# for at_start to be specified on the matcher. FIXME it might be
3939
# possible to refactor search region to offer the ability to
4040
# specify at start, we should investigate this at some point.
4141
def self.match_onwards(re, str, from, at_start)
42-
md = match_in_region(re, str, from, str.bytesize, at_start, true, from)
42+
md = match_in_region(re, str, from, str.bytesize, at_start, from)
4343
Primitive.matchdata_fixup_positions(md, from) if md
4444
md
4545
end
@@ -64,6 +64,7 @@ def self.match_from(re, str, pos)
6464
Truffle::Boot.delay do
6565
COMPARE_ENGINES = Truffle::Boot.get_option('compare-regex-engines')
6666
USE_TRUFFLE_REGEX = Truffle::Boot.get_option('use-truffle-regex')
67+
WARN_TRUFFLE_REGEX_FALLBACK = Truffle::Boot.get_option('warn-truffle-regex-fallback')
6768

6869
if Truffle::Boot.get_option('regexp-instrument-creation') or Truffle::Boot.get_option('regexp-instrument-match')
6970
at_exit do
@@ -72,50 +73,53 @@ def self.match_from(re, str, pos)
7273
end
7374
end
7475

75-
def self.match_in_region(re, str, from, to, at_start, encoding_conversion, start)
76+
def self.match_in_region(re, str, from, to, at_start, start)
7677
if COMPARE_ENGINES
77-
match_in_region_compare_engines(re, str, from, to, at_start, encoding_conversion, start)
78+
match_in_region_compare_engines(re, str, from, to, at_start, start)
7879
elsif USE_TRUFFLE_REGEX
79-
match_in_region_tregex(re, str, from, to, at_start, encoding_conversion, start)
80+
match_in_region_tregex(re, str, from, to, at_start, start)
8081
else
81-
Primitive.regexp_match_in_region(re, str, from, to, at_start, encoding_conversion, start)
82+
Primitive.regexp_match_in_region(re, str, from, to, at_start, start)
8283
end
8384
end
8485

85-
def self.match_in_region_compare_engines(re, str, from, to, at_start, encoding_conversion, start)
86+
def self.match_in_region_compare_engines(re, str, from, to, at_start, start)
8687
begin
87-
md1 = match_in_region_tregex(re, str, from, to, at_start, encoding_conversion, start)
88+
md1 = match_in_region_tregex(re, str, from, to, at_start, start)
8889
rescue => e
8990
md1 = e
9091
end
9192
begin
92-
md2 = Primitive.regexp_match_in_region(re, str, from, to, at_start, encoding_conversion, start)
93+
md2 = Primitive.regexp_match_in_region(re, str, from, to, at_start, start)
9394
rescue => e
9495
md2 = e
9596
end
9697
if self.results_match?(md1, md2)
9798
return self.return_match_data(md1)
9899
else
99-
$stderr.puts "match_in_region(/#{re}/, \"#{str}\"@#{str.encoding}, #{from}, #{to}, #{at_start}, #{encoding_conversion}, #{start}) gate"
100-
self.print_match_data(md1)
100+
$stderr.puts "match_in_region(#{re.inspect}, #{str.inspect}@#{str.encoding}, #{from}, #{to}, #{at_start}, #{start}) gave"
101+
print_match_data(md1)
101102
$stderr.puts 'but we expected'
102-
self.print_match_data(md2)
103+
print_match_data(md2)
103104
return self.return_match_data(md2)
104105
end
105106
end
106107

107-
def self.match_in_region_tregex(re, str, from, to, at_start, encoding_conversion, start)
108+
def self.match_in_region_tregex(re, str, from, to, at_start, start)
108109
bail_out = to < from || to != str.bytesize || start != 0 || from < 0
109110
if !bail_out
110-
compiled_regex = tregex_compile(re, at_start, select_encoding(re, str, encoding_conversion))
111+
compiled_regex = tregex_compile(re, at_start, select_encoding(re, str))
111112
bail_out = compiled_regex.nil?
112113
end
113114
if !bail_out
114115
str_bytes = StringOperations.raw_bytes(str)
115116
regex_result = compiled_regex.execBytes(str_bytes, from)
116117
end
117118
if bail_out
118-
return Primitive.regexp_match_in_region(re, str, from, to, at_start, encoding_conversion, start)
119+
if WARN_TRUFFLE_REGEX_FALLBACK
120+
warn "match_in_region_tregex(#{re.inspect}, #{str.inspect}@#{str.encoding}, #{from}, #{to}, #{at_start}, #{encoding_conversion}, #{start}) can't be run as a Truffle regexp and fell back to Joni", uplevel: 1
121+
end
122+
return Primitive.regexp_match_in_region(re, str, from, to, at_start, start)
119123
elsif regex_result.isMatch
120124
starts = []
121125
ends = []

src/options.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ EXPERT:
144144

145145
# Controlling the regular expression engines
146146
USE_TRUFFLE_REGEX: [use-truffle-regex, boolean, false, 'Use the Truffle regular expression engine for Regexp objects']
147+
WARN_TRUFFLE_REGEX_FALLBACK: [warn-truffle-regex-fallback, boolean, false, 'Warn when Truffle Regex could not be used for a Regexp and instead Joni is used']
147148

148149
INTERNAL: # Options for debugging the TruffleRuby implementation
149150
EXPERIMENTAL:

src/shared/java/org/truffleruby/shared/options/OptionsCatalog.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ public class OptionsCatalog {
8383
public static final OptionKey<Boolean> WARN_DEPRECATED_KEY = new OptionKey<>(false);
8484
public static final OptionKey<Boolean> WARN_EXPERIMENTAL_KEY = new OptionKey<>(true);
8585
public static final OptionKey<Boolean> USE_TRUFFLE_REGEX_KEY = new OptionKey<>(false);
86+
public static final OptionKey<Boolean> WARN_TRUFFLE_REGEX_FALLBACK_KEY = new OptionKey<>(false);
8687
public static final OptionKey<Boolean> ARGV_GLOBALS_KEY = new OptionKey<>(false);
8788
public static final OptionKey<Boolean> CHOMP_LOOP_KEY = new OptionKey<>(false);
8889
public static final OptionKey<Boolean> GETS_LOOP_KEY = new OptionKey<>(false);
@@ -596,6 +597,13 @@ public class OptionsCatalog {
596597
.stability(OptionStability.EXPERIMENTAL)
597598
.build();
598599

600+
public static final OptionDescriptor WARN_TRUFFLE_REGEX_FALLBACK = OptionDescriptor
601+
.newBuilder(WARN_TRUFFLE_REGEX_FALLBACK_KEY, "ruby.warn-truffle-regex-fallback")
602+
.help("Warn when Truffle Regex could not be used for a Regexp and instead Joni is used")
603+
.category(OptionCategory.EXPERT)
604+
.stability(OptionStability.EXPERIMENTAL)
605+
.build();
606+
599607
public static final OptionDescriptor ARGV_GLOBALS = OptionDescriptor
600608
.newBuilder(ARGV_GLOBALS_KEY, "ruby.argv-globals")
601609
.help("Parse options in script argv into global variables (configured by the -s Ruby option)")
@@ -1221,6 +1229,8 @@ public static OptionDescriptor fromName(String name) {
12211229
return WARN_EXPERIMENTAL;
12221230
case "ruby.use-truffle-regex":
12231231
return USE_TRUFFLE_REGEX;
1232+
case "ruby.warn-truffle-regex-fallback":
1233+
return WARN_TRUFFLE_REGEX_FALLBACK;
12241234
case "ruby.argv-globals":
12251235
return ARGV_GLOBALS;
12261236
case "ruby.chomp-loop":
@@ -1433,6 +1443,7 @@ public static OptionDescriptor[] allDescriptors() {
14331443
WARN_DEPRECATED,
14341444
WARN_EXPERIMENTAL,
14351445
USE_TRUFFLE_REGEX,
1446+
WARN_TRUFFLE_REGEX_FALLBACK,
14361447
ARGV_GLOBALS,
14371448
CHOMP_LOOP,
14381449
GETS_LOOP,

0 commit comments

Comments
 (0)