Skip to content

Commit 8f0cc2e

Browse files
committed
[GR-19220] Fix IO#gets/#each_line/etc methods with multi-byte separator
PullRequest: truffleruby/3739
2 parents fbbc027 + a1040f0 commit 8f0cc2e

File tree

5 files changed

+116
-42
lines changed

5 files changed

+116
-42
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ Bug fixes:
4545
* Fix processing of proc rest arguments located at the beginning if there are no actual arguments (#2921, @andrykonchin).
4646
* Fix `Monitor#exit` to raise `ThreadError` when monitor not owned by the current thread (#2922, @andrykonchin).
4747
* Fix `MatchData#[]` to support negative `length` argument (#2929, @andrykonchin).
48+
* Fix `IO` line reading calls when using a multi-byte delimiter (`IO#{each,gets,readline,readlines,etc.}) (#2961, @vinistock, @nirvdrum).
4849

4950
Compatibility:
5051

spec/ruby/core/io/gets_spec.rb

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,33 @@
113113
$..should == @count += 1
114114
end
115115
end
116+
117+
describe "that consists of multiple bytes" do
118+
it "should match the separator even if the buffer is filled over successive reads" do
119+
IO.pipe do |read, write|
120+
121+
# Write part of the string with the separator split between two write calls. We want
122+
# the read to intertwine such that when the read starts the full data isn't yet
123+
# available in the buffer.
124+
write.write("Aquí está la línea tres\r\n")
125+
126+
t = Thread.new do
127+
# Continue reading until the separator is encountered or the pipe is closed.
128+
read.gets("\r\n\r\n")
129+
end
130+
131+
# Write the other half of the separator, which should cause the `gets` call to now
132+
# match. Explicitly close the pipe for good measure so a bug in `gets` doesn't block forever.
133+
Thread.pass until t.stop?
134+
135+
write.write("\r\nelse\r\n\r\n")
136+
write.close
137+
138+
t.value.bytes.should == "Aquí está la línea tres\r\n\r\n".bytes
139+
read.read(8).bytes.should == "else\r\n\r\n".bytes
140+
end
141+
end
142+
end
116143
end
117144

118145
describe "when passed chomp" do

src/main/java/org/truffleruby/core/string/StringNodes.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@
9898
import org.truffleruby.builtins.CoreMethodArrayArgumentsNode;
9999
import org.truffleruby.builtins.CoreMethodNode;
100100
import org.truffleruby.annotations.CoreModule;
101-
import org.truffleruby.builtins.NonStandard;
102101
import org.truffleruby.annotations.Primitive;
103102
import org.truffleruby.builtins.PrimitiveArrayArgumentsNode;
104103
import org.truffleruby.builtins.PrimitiveNode;
@@ -4328,9 +4327,8 @@ protected Object stringSubstringGeneric(Object string, int codePointOffset, int
43284327

43294328
}
43304329

4331-
@NonStandard
4332-
@CoreMethod(names = "from_bytearray", onSingleton = true, required = 4, lowerFixnum = { 2, 3 })
4333-
public abstract static class StringFromByteArrayPrimitiveNode extends CoreMethodArrayArgumentsNode {
4330+
@Primitive(name = "string_from_bytearray", lowerFixnum = { 1, 2 })
4331+
public abstract static class StringFromByteArrayPrimitiveNode extends PrimitiveArrayArgumentsNode {
43344332

43354333
@Specialization
43364334
protected RubyString stringFromByteArray(

src/main/java/org/truffleruby/core/support/ByteArrayNodes.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
import com.oracle.truffle.api.strings.TruffleString;
1616
import org.truffleruby.annotations.CoreMethod;
1717
import org.truffleruby.annotations.CoreModule;
18+
import org.truffleruby.annotations.Primitive;
1819
import org.truffleruby.builtins.CoreMethodArrayArgumentsNode;
20+
import org.truffleruby.builtins.PrimitiveArrayArgumentsNode;
1921
import org.truffleruby.core.encoding.TStringUtils;
2022
import org.truffleruby.core.klass.RubyClass;
2123
import org.truffleruby.core.string.RubyString;
@@ -151,8 +153,8 @@ protected Object fillFromPointer(
151153

152154
}
153155

154-
@CoreMethod(names = "locate", required = 3, lowerFixnum = { 2, 3 })
155-
public abstract static class LocateNode extends CoreMethodArrayArgumentsNode {
156+
@Primitive(name = "bytearray_locate", lowerFixnum = { 2, 3 })
157+
public abstract static class LocateNode extends PrimitiveArrayArgumentsNode {
156158

157159
@Specialization(
158160
guards = "isSingleBytePattern(patternTString, patternEncoding)", limit = "1")

src/main/ruby/truffleruby/core/io.rb

Lines changed: 82 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def discard(skip)
202202
# Returns the number of bytes to fetch from the buffer up-to-
203203
# and-including +pattern+. Returns +nil+ if pattern is not found.
204204
def find(pattern, discard = nil)
205-
if count = @storage.locate(pattern, @start, @used)
205+
if count = Primitive.bytearray_locate(@storage, pattern, @start, @used)
206206
count - @start
207207
end
208208
end
@@ -225,7 +225,7 @@ def shift(count = nil, encoding = Encoding::BINARY)
225225
total = size
226226
total = count if count and count < total
227227

228-
str = String.from_bytearray @storage, @start, total, encoding
228+
str = Primitive.string_from_bytearray(@storage, @start, total, encoding)
229229
@start += total
230230

231231
str
@@ -1135,6 +1135,8 @@ def each(&block)
11351135
# method A, D
11361136
def read_to_separator
11371137
str = +''
1138+
last_scan_end = 0
1139+
separator_byte_size = @separator.bytesize
11381140

11391141
until @buffer.exhausted?
11401142
available = @buffer.fill_from @io, @skip
@@ -1143,26 +1145,80 @@ def read_to_separator
11431145
if count = @buffer.find(@separator)
11441146
s = @buffer.shift(count)
11451147

1146-
unless str.empty?
1147-
s.prepend(str)
1148-
str.clear
1148+
# We need to be careful matching against multi-byte separators since the
1149+
# `str` value is being built up progressively.
1150+
#
1151+
# If the separator is only a single byte wide and we found it in the buffer
1152+
# then `count` must be the correct position because there's no way for a single
1153+
# byte read to be split up; it doesn't matter what `str` contains.
1154+
#
1155+
# If the separator is multiple bytes, then we look at `str`. If it's empty,
1156+
# then the buffer trivially must contain the separator. If not, however, we
1157+
# must scan the entire `str` after appending the buffer to see if the separator
1158+
# pattern appears earlier than the position detected by `count`.
1159+
if str.empty? || separator_byte_size == 1
1160+
unless str.empty?
1161+
s.prepend(str)
1162+
str.clear
1163+
end
1164+
1165+
yield prepare_read_string(s)
1166+
1167+
next
1168+
else
1169+
str << s
11491170
end
1171+
else
1172+
str << @buffer.shift
1173+
end
11501174

1151-
s = IO.read_encode(@io, s)
1175+
if separator_byte_size > 1 && last_scan_end < str.bytesize
1176+
# Since the separator could be split over multiple passes of this loop,
1177+
# it's possible for the separator to never appear completely in `@buffer`,
1178+
# but may appear in `str` after successive passes. If we found an unambiguous
1179+
# match in the buffer, we wouldn't be in this branch. Since we are, we need
1180+
# to check if the separator appears in the total read string.
1181+
#
1182+
# Rather than scan the entirety of `str` every time, we track how far we've
1183+
# previously scanned. Since the separator bytes can span reads, we need to
1184+
# step back `@separator.bytesize - 1` bytes to ensure we don't skip over the
1185+
# separator bytes. It's `@separator.bytesize - 1` because if the entire
1186+
# separator was already in `str` we would have found it on a previous pass of
1187+
# the loop. Since we also need to subtract one to account for zero-based offsets,
1188+
# we can include that in our offset and just substract the separator byte size.
1189+
# On the very first scan we don't need to account for any partial scans of the
1190+
# separator.
1191+
1192+
search_offset = last_scan_end - separator_byte_size
1193+
search_offset = 0 if search_offset < 0
1194+
1195+
found_byte_index = Primitive.string_byte_index(str, @separator.b, Encoding::BINARY, search_offset)
1196+
1197+
if found_byte_index
1198+
offset = found_byte_index + @separator.bytesize
1199+
1200+
# If we've read more bytes than we need to satisfy the current request, we
1201+
# need to put the remainder back into the buffer so that subsequent reads
1202+
# will have the correct bytes.
1203+
if offset < str.bytesize
1204+
@buffer.put_back(str.byteslice(offset, str.bytesize - offset))
1205+
end
1206+
1207+
res = prepare_read_string(str.byteslice(0, offset))
11521208

1153-
s.chomp!(@separator) if @chomp
1154-
$. = @io.__send__(:increment_lineno)
1155-
@buffer.discard @skip if @skip
1209+
str.clear
1210+
last_scan_end = 0
11561211

1157-
yield s
1158-
else
1159-
str << @buffer.shift
1212+
yield res
1213+
else
1214+
last_scan_end = str.bytesize
1215+
end
11601216
end
11611217
end
11621218

11631219
str << @buffer.shift
11641220
str.chomp!(@separator) if @chomp
1165-
yield_string(str) { |y| yield y }
1221+
yield prepare_read_string(str) unless str.empty?
11661222
end
11671223

11681224
# method B, E
@@ -1180,27 +1236,16 @@ def read_to_separator_with_limit
11801236
bytes = Primitive.min(count, wanted)
11811237
str << @buffer.shift(bytes)
11821238

1183-
str = IO.read_encode(@io, str)
1184-
1185-
str.chomp!(@separator) if @chomp
1186-
$. = @io.__send__(:increment_lineno)
1187-
@buffer.discard @skip if @skip
1188-
1189-
yield str
1239+
yield prepare_read_string(str)
11901240

11911241
str = +''
11921242
wanted = limit
11931243
else
11941244
if wanted < available
11951245
str << @buffer.shift(wanted)
1196-
11971246
str = @buffer.read_to_char_boundary(@io, str)
11981247

1199-
str.chomp!(@separator) if @chomp
1200-
$. = @io.__send__(:increment_lineno)
1201-
@buffer.discard @skip if @skip
1202-
1203-
yield str
1248+
yield prepare_read_string(str)
12041249

12051250
str = +''
12061251
wanted = limit
@@ -1211,8 +1256,7 @@ def read_to_separator_with_limit
12111256
end
12121257
end
12131258

1214-
str.chomp!(@separator) if @chomp
1215-
yield_string(str) { |s| yield s }
1259+
yield prepare_read_string(str) unless str.empty?
12161260
end
12171261

12181262
# Method G
@@ -1228,7 +1272,7 @@ def read_all
12281272
end
12291273

12301274
str.chomp!(DEFAULT_RECORD_SEPARATOR) if @chomp
1231-
yield_string(str) { |s| yield s }
1275+
yield prepare_read_string(str) unless str.empty?
12321276
end
12331277

12341278
# Method H
@@ -1254,15 +1298,17 @@ def read_to_limit
12541298
end
12551299
end
12561300

1257-
yield_string(str) { |s| yield s }
1301+
yield prepare_read_string(str) unless str.empty?
12581302
end
12591303

1260-
def yield_string(str)
1261-
unless str.empty?
1262-
str = IO.read_encode(@io, str)
1263-
$. = @io.__send__(:increment_lineno)
1264-
yield str
1265-
end
1304+
def prepare_read_string(str)
1305+
s = IO.read_encode(@io, str)
1306+
1307+
s.chomp!(@separator) if @chomp
1308+
$. = @io.__send__(:increment_lineno)
1309+
@buffer.discard @skip if @skip
1310+
1311+
s
12661312
end
12671313
end
12681314

0 commit comments

Comments
 (0)