Skip to content

Commit 49ae17c

Browse files
Fix an issue where Regexp.union was improperly negotiating the result encoding.
As part of this fix we were able to remove the non-standard `Regexp.convert`. Co-authored-by: Simon LeVasseur <simon.levasseur@shopify.com>
1 parent 63ff781 commit 49ae17c

File tree

3 files changed

+49
-31
lines changed

3 files changed

+49
-31
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ Bug fixes:
77

88
* Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon).
99
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
10+
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).
1011

1112
Compatibility:
1213

spec/ruby/core/regexp/union_spec.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,24 @@
4343
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
4444
end
4545

46+
it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
47+
us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
48+
49+
Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
50+
Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
51+
Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
52+
Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
53+
Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
54+
Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
55+
end
56+
57+
it "return US-ASCII if all patterns are ASCII-only" do
58+
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
59+
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
60+
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
61+
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
62+
end
63+
4664
it "returns a Regexp with UTF-8 if one part is UTF-8" do
4765
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
4866
end

src/main/ruby/truffleruby/core/regexp.rb

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,23 @@ def self.try_convert(obj)
5555
Truffle::Type.try_convert obj, Regexp, :to_regexp
5656
end
5757

58-
def self.convert(pattern)
59-
return pattern if Primitive.is_a?(pattern, Regexp)
60-
if Primitive.is_a?(pattern, Array)
61-
union(*pattern)
62-
else
63-
Regexp.quote(pattern.to_s)
64-
end
65-
end
58+
def self.negotiate_union_encoding(*patterns)
59+
res = nil
60+
61+
patterns.each do |pattern|
62+
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)
6663

67-
def self.compatible?(*patterns)
68-
encodings = patterns.map { |r| convert(r).encoding }
69-
last_enc = encodings.pop
70-
encodings.each do |encoding|
71-
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
72-
last_enc = encoding
64+
enc = converted.encoding
65+
66+
if Primitive.nil?(res)
67+
res = enc
68+
else
69+
res = Primitive.encoding_compatible?(enc, res)
70+
raise ArgumentError, "incompatible encodings: #{enc} and #{res}" unless res
71+
end
7372
end
73+
74+
res
7475
end
7576

7677
def self.last_match(index = nil)
@@ -96,37 +97,35 @@ def self.last_match(index = nil)
9697
def self.union(*patterns)
9798
case patterns.size
9899
when 0
99-
return %r/(?!)/
100+
%r/(?!)/
100101
when 1
101102
pattern = patterns.first
102103
case pattern
103104
when Array
104-
return union(*pattern)
105+
union(*pattern)
105106
else
106107
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
107108
if Primitive.nil? converted
108-
return Regexp.new(Regexp.quote(pattern))
109+
Regexp.new(Regexp.quote(pattern))
109110
else
110-
return converted
111+
converted
111112
end
112113
end
113114
else
114-
compatible?(*patterns)
115-
enc = convert(patterns.first).encoding
116-
end
115+
patterns = patterns.map do |pat|
116+
if Primitive.is_a?(pat, Regexp)
117+
pat
118+
else
119+
StringValue(pat)
120+
end
121+
end
117122

118-
sep = '|'.encode(enc)
119-
str = ''.encode(enc)
123+
enc = negotiate_union_encoding(*patterns)
124+
sep = '|'.encode(enc)
125+
str = ''.encode(enc)
120126

121-
patterns = patterns.map do |pat|
122-
if Primitive.is_a?(pat, Regexp)
123-
pat
124-
else
125-
StringValue(pat)
126-
end
127+
Truffle::RegexpOperations.union(str, sep, *patterns)
127128
end
128-
129-
Truffle::RegexpOperations.union(str, sep, *patterns)
130129
end
131130
Truffle::Graal.always_split(method(:union))
132131

0 commit comments

Comments
 (0)