Skip to content

Commit 152a876

Browse files
committed
[GR-19220] Fix an issue where Regexp.union was improperly negotiating the result encoding (#3296)
PullRequest: truffleruby/4034 (cherry picked from commit 6642143)
1 parent d74667c commit 152a876

File tree

3 files changed

+72
-45
lines changed

3 files changed

+72
-45
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Bug fixes:
44

55
* Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon).
66
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
7+
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).
78

89
# 23.1.0
910

spec/ruby/core/regexp/union_spec.rb

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,27 @@
4343
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
4444
end
4545

46+
it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
47+
us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
48+
us_ascii_implicit.encoding.should == Encoding::US_ASCII
49+
us_ascii_explicit.encoding.should == Encoding::US_ASCII
50+
binary.encoding.should == Encoding::BINARY
51+
52+
Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
53+
Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
54+
Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
55+
Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
56+
Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
57+
Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
58+
end
59+
60+
it "return US-ASCII if all patterns are ASCII-only" do
61+
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
62+
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
63+
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
64+
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
65+
end
66+
4667
it "returns a Regexp with UTF-8 if one part is UTF-8" do
4768
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
4869
end
@@ -54,83 +75,83 @@
5475
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
5576
-> {
5677
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
57-
}.should raise_error(ArgumentError)
78+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
5879
end
5980

6081
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
6182
-> {
6283
Regexp.union(Regexp.new("a".encode("UTF-16LE")),
6384
Regexp.new("b".encode("UTF-16BE")))
64-
}.should raise_error(ArgumentError)
85+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
6586
end
6687

6788
it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
6889
-> {
6990
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
7091
Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
71-
}.should raise_error(ArgumentError)
92+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
7293
end
7394

7495
it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
7596
-> {
7697
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
7798
"\u00A9".encode("ISO-8859-1"))
78-
}.should raise_error(ArgumentError)
99+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
79100
end
80101

81102
it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
82103
-> {
83104
Regexp.union("\u00A9".encode("ISO-8859-1"),
84105
Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
85-
}.should raise_error(ArgumentError)
106+
}.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
86107
end
87108

88109
it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
89110
-> {
90111
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
91-
}.should raise_error(ArgumentError)
112+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
92113
end
93114

94115
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
95116
-> {
96117
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
97-
}.should raise_error(ArgumentError)
118+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
98119
end
99120

100121
it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
101122
-> {
102123
Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
103-
}.should raise_error(ArgumentError)
124+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
104125
end
105126

106127
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
107128
-> {
108129
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
109-
}.should raise_error(ArgumentError)
130+
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
110131
end
111132

112133
it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
113134
-> {
114135
Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
115-
}.should raise_error(ArgumentError)
136+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
116137
end
117138

118139
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
119140
-> {
120141
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
121-
}.should raise_error(ArgumentError)
142+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
122143
end
123144

124145
it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
125146
-> {
126147
Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
127-
}.should raise_error(ArgumentError)
148+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
128149
end
129150

130151
it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
131152
-> {
132153
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
133-
}.should raise_error(ArgumentError)
154+
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
134155
end
135156

136157
it "uses to_str to convert arguments (if not Regexp)" do
@@ -154,6 +175,8 @@
154175
not_supported_on :opal do
155176
Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
156177
end
157-
->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
178+
-> {
179+
Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
180+
}.should raise_error(TypeError, 'no implicit conversion of Array into String')
158181
end
159182
end

src/main/ruby/truffleruby/core/regexp.rb

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,27 @@ def self.try_convert(obj)
5555
Truffle::Type.try_convert obj, Regexp, :to_regexp
5656
end
5757

58-
def self.convert(pattern)
59-
return pattern if Primitive.is_a?(pattern, Regexp)
60-
if Primitive.is_a?(pattern, Array)
61-
union(*pattern)
62-
else
63-
Regexp.quote(pattern.to_s)
64-
end
65-
end
58+
def self.negotiate_union_encoding(*patterns)
59+
compatible_enc = nil
60+
61+
patterns.each do |pattern|
62+
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)
63+
64+
enc = converted.encoding
65+
66+
if Primitive.nil?(compatible_enc)
67+
compatible_enc = enc
68+
else
69+
if test = Primitive.encoding_compatible?(enc, compatible_enc)
70+
compatible_enc = test
71+
else
72+
raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
73+
end
6674

67-
def self.compatible?(*patterns)
68-
encodings = patterns.map { |r| convert(r).encoding }
69-
last_enc = encodings.pop
70-
encodings.each do |encoding|
71-
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
72-
last_enc = encoding
75+
end
7376
end
77+
78+
compatible_enc
7479
end
7580

7681
def self.last_match(index = nil)
@@ -96,37 +101,35 @@ def self.last_match(index = nil)
96101
def self.union(*patterns)
97102
case patterns.size
98103
when 0
99-
return %r/(?!)/
104+
%r/(?!)/
100105
when 1
101106
pattern = patterns.first
102107
case pattern
103108
when Array
104-
return union(*pattern)
109+
union(*pattern)
105110
else
106111
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
107112
if Primitive.nil? converted
108-
return Regexp.new(Regexp.quote(pattern))
113+
Regexp.new(Regexp.quote(pattern))
109114
else
110-
return converted
115+
converted
111116
end
112117
end
113118
else
114-
compatible?(*patterns)
115-
enc = convert(patterns.first).encoding
116-
end
119+
patterns = patterns.map do |pat|
120+
if Primitive.is_a?(pat, Regexp)
121+
pat
122+
else
123+
StringValue(pat)
124+
end
125+
end
117126

118-
sep = '|'.encode(enc)
119-
str = ''.encode(enc)
127+
enc = negotiate_union_encoding(*patterns)
128+
sep = '|'.encode(enc)
129+
str = ''.encode(enc)
120130

121-
patterns = patterns.map do |pat|
122-
if Primitive.is_a?(pat, Regexp)
123-
pat
124-
else
125-
StringValue(pat)
126-
end
131+
Truffle::RegexpOperations.union(str, sep, *patterns)
127132
end
128-
129-
Truffle::RegexpOperations.union(str, sep, *patterns)
130133
end
131134
Truffle::Graal.always_split(method(:union))
132135

0 commit comments

Comments
 (0)