[GR-19220] Fix IO#gets/#each_line/etc methods with multi-byte separator

andrykonchin · andrykonchin · commit 8f0cc2edc7b5 · 2023-04-03T22:08:36.000Z
PullRequest: truffleruby/3739
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -45,6 +45,7 @@ Bug fixes:
 * Fix processing of proc rest arguments located at the beginning if there are no actual arguments (#2921, @andrykonchin).
 * Fix `Monitor#exit` to raise `ThreadError` when monitor not owned by the current thread (#2922, @andrykonchin).
 * Fix `MatchData#[]` to support negative `length` argument (#2929, @andrykonchin).
+* Fix `IO` line reading calls when using a multi-byte delimiter (`IO#{each,gets,readline,readlines,etc.}`) (#2961, @vinistock, @nirvdrum).
 
 Compatibility:
 
diff --git a/spec/ruby/core/io/gets_spec.rb b/spec/ruby/core/io/gets_spec.rb
@@ -113,6 +113,33 @@
         $..should == @count += 1
       end
     end
+
+    describe "that consists of multiple bytes" do
+      it "should match the separator even if the buffer is filled over successive reads" do
+        IO.pipe do |read, write|
+
+          # Write part of the string with the separator split between two write calls. We want
+          # the read to intertwine such that when the read starts the full data isn't yet
+          # available in the buffer.
+          write.write("Aquí está la línea tres\r\n")
+
+          t = Thread.new do
+            # Continue reading until the separator is encountered or the pipe is closed.
+            read.gets("\r\n\r\n")
+          end
+
+          # Write the other half of the separator, which should cause the `gets` call to now
+          # match. Explicitly close the pipe for good measure so a bug in `gets` doesn't block forever.
+          Thread.pass until t.stop?
+
+          write.write("\r\nelse\r\n\r\n")
+          write.close
+
+          t.value.bytes.should == "Aquí está la línea tres\r\n\r\n".bytes
+          read.read(8).bytes.should == "else\r\n\r\n".bytes
+        end
+      end
+    end
   end
 
   describe "when passed chomp" do
diff --git a/src/main/java/org/truffleruby/core/string/StringNodes.java b/src/main/java/org/truffleruby/core/string/StringNodes.java
@@ -98,7 +98,6 @@
 import org.truffleruby.builtins.CoreMethodArrayArgumentsNode;
 import org.truffleruby.builtins.CoreMethodNode;
 import org.truffleruby.annotations.CoreModule;
-import org.truffleruby.builtins.NonStandard;
 import org.truffleruby.annotations.Primitive;
 import org.truffleruby.builtins.PrimitiveArrayArgumentsNode;
 import org.truffleruby.builtins.PrimitiveNode;
@@ -4328,9 +4327,8 @@ protected Object stringSubstringGeneric(Object string, int codePointOffset, int
 
     }
 
-    @NonStandard
-    @CoreMethod(names = "from_bytearray", onSingleton = true, required = 4, lowerFixnum = { 2, 3 })
-    public abstract static class StringFromByteArrayPrimitiveNode extends CoreMethodArrayArgumentsNode {
+    @Primitive(name = "string_from_bytearray", lowerFixnum = { 1, 2 })
+    public abstract static class StringFromByteArrayPrimitiveNode extends PrimitiveArrayArgumentsNode {
 
         @Specialization
         protected RubyString stringFromByteArray(
diff --git a/src/main/java/org/truffleruby/core/support/ByteArrayNodes.java b/src/main/java/org/truffleruby/core/support/ByteArrayNodes.java
@@ -15,7 +15,9 @@
 import com.oracle.truffle.api.strings.TruffleString;
 import org.truffleruby.annotations.CoreMethod;
 import org.truffleruby.annotations.CoreModule;
+import org.truffleruby.annotations.Primitive;
 import org.truffleruby.builtins.CoreMethodArrayArgumentsNode;
+import org.truffleruby.builtins.PrimitiveArrayArgumentsNode;
 import org.truffleruby.core.encoding.TStringUtils;
 import org.truffleruby.core.klass.RubyClass;
 import org.truffleruby.core.string.RubyString;
@@ -151,8 +153,8 @@ protected Object fillFromPointer(
 
     }
 
-    @CoreMethod(names = "locate", required = 3, lowerFixnum = { 2, 3 })
-    public abstract static class LocateNode extends CoreMethodArrayArgumentsNode {
+    @Primitive(name = "bytearray_locate", lowerFixnum = { 2, 3 })
+    public abstract static class LocateNode extends PrimitiveArrayArgumentsNode {
 
         @Specialization(
                 guards = "isSingleBytePattern(patternTString, patternEncoding)", limit = "1")
diff --git a/src/main/ruby/truffleruby/core/io.rb b/src/main/ruby/truffleruby/core/io.rb
@@ -202,7 +202,7 @@ def discard(skip)
     # Returns the number of bytes to fetch from the buffer up-to-
     # and-including +pattern+. Returns +nil+ if pattern is not found.
     def find(pattern, discard = nil)
-      if count = @storage.locate(pattern, @start, @used)
+      if count = Primitive.bytearray_locate(@storage, pattern, @start, @used)
         count - @start
       end
     end
@@ -225,7 +225,7 @@ def shift(count = nil, encoding = Encoding::BINARY)
         total = size
         total = count if count and count < total
 
-        str = String.from_bytearray @storage, @start, total, encoding
+        str = Primitive.string_from_bytearray(@storage, @start, total, encoding)
         @start += total
 
         str
@@ -1135,6 +1135,8 @@ def each(&block)
     # method A, D
     def read_to_separator
       str = +''
+      last_scan_end = 0
+      separator_byte_size = @separator.bytesize
 
       until @buffer.exhausted?
         available = @buffer.fill_from @io, @skip
@@ -1143,26 +1145,80 @@ def read_to_separator
         if count = @buffer.find(@separator)
           s = @buffer.shift(count)
 
-          unless str.empty?
-            s.prepend(str)
-            str.clear
+          # We need to be careful matching against multi-byte separators since the
+          # `str` value is being built up progressively.
+          #
+          # If the separator is only a single byte wide and we found it in the buffer
+          # then `count` must be the correct position because there's no way for a single
+          # byte read to be split up; it doesn't matter what `str` contains.
+          #
+          # If the separator is multiple bytes, then we look at `str`. If it's empty,
+          # then the buffer trivially must contain the separator. If not, however, we
+          # must scan the entire `str` after appending the buffer to see if the separator
+          # pattern appears earlier than the position detected by `count`.
+          if str.empty? || separator_byte_size == 1
+            unless str.empty?
+              s.prepend(str)
+              str.clear
+            end
+
+            yield prepare_read_string(s)
+
+            next
+          else
+            str << s
           end
+        else
+          str << @buffer.shift
+        end
 
-          s = IO.read_encode(@io, s)
+        if separator_byte_size > 1 && last_scan_end < str.bytesize
+          # Since the separator could be split over multiple passes of this loop,
+          # it's possible for the separator to never appear completely in `@buffer`,
+          # but may appear in `str` after successive passes. If we found an unambiguous
+          # match in the buffer, we wouldn't be in this branch. Since we are, we need
+          # to check if the separator appears in the total read string.
+          #
+          # Rather than scan the entirety of `str` every time, we track how far we've
+          # previously scanned. Since the separator bytes can span reads, we need to
+          # step back `@separator.bytesize - 1` bytes to ensure we don't skip over the
+          # separator bytes. It's `@separator.bytesize - 1` because if the entire
+          # separator was already in `str` we would have found it on a previous pass of
+          # the loop. Since we also need to subtract one to account for zero-based offsets,
+          # we can include that in our offset and just subtract the separator byte size.
+          # On the very first scan we don't need to account for any partial scans of the
+          # separator.
+
+          search_offset = last_scan_end - separator_byte_size
+          search_offset = 0 if search_offset < 0
+
+          found_byte_index = Primitive.string_byte_index(str, @separator.b, Encoding::BINARY, search_offset)
+
+          if found_byte_index
+            offset = found_byte_index + @separator.bytesize
+
+            # If we've read more bytes than we need to satisfy the current request, we
+            # need to put the remainder back into the buffer so that subsequent reads
+            # will have the correct bytes.
+            if offset < str.bytesize
+              @buffer.put_back(str.byteslice(offset, str.bytesize - offset))
+            end
+
+            res = prepare_read_string(str.byteslice(0, offset))
 
-          s.chomp!(@separator) if @chomp
-          $. = @io.__send__(:increment_lineno)
-          @buffer.discard @skip if @skip
+            str.clear
+            last_scan_end = 0
 
-          yield s
-        else
-          str << @buffer.shift
+            yield res
+          else
+            last_scan_end = str.bytesize
+          end
         end
       end
 
       str << @buffer.shift
       str.chomp!(@separator) if @chomp
-      yield_string(str) { |y| yield y }
+      yield prepare_read_string(str) unless str.empty?
     end
 
     # method B, E
@@ -1180,27 +1236,16 @@ def read_to_separator_with_limit
           bytes = Primitive.min(count, wanted)
           str << @buffer.shift(bytes)
 
-          str = IO.read_encode(@io, str)
-
-          str.chomp!(@separator) if @chomp
-          $. = @io.__send__(:increment_lineno)
-          @buffer.discard @skip if @skip
-
-          yield str
+          yield prepare_read_string(str)
 
           str = +''
           wanted = limit
         else
           if wanted < available
             str << @buffer.shift(wanted)
-
             str = @buffer.read_to_char_boundary(@io, str)
 
-            str.chomp!(@separator) if @chomp
-            $. = @io.__send__(:increment_lineno)
-            @buffer.discard @skip if @skip
-
-            yield str
+            yield prepare_read_string(str)
 
             str = +''
             wanted = limit
@@ -1211,8 +1256,7 @@ def read_to_separator_with_limit
         end
       end
 
-      str.chomp!(@separator) if @chomp
-      yield_string(str) { |s| yield s }
+      yield prepare_read_string(str) unless str.empty?
     end
 
     # Method G
@@ -1228,7 +1272,7 @@ def read_all
       end
 
       str.chomp!(DEFAULT_RECORD_SEPARATOR) if @chomp
-      yield_string(str) { |s| yield s }
+      yield prepare_read_string(str) unless str.empty?
     end
 
     # Method H
@@ -1254,15 +1298,17 @@ def read_to_limit
         end
       end
 
-      yield_string(str) { |s| yield s }
+      yield prepare_read_string(str) unless str.empty?
     end
 
-    def yield_string(str)
-      unless str.empty?
-        str = IO.read_encode(@io, str)
-        $. = @io.__send__(:increment_lineno)
-        yield str
-      end
+    def prepare_read_string(str)
+      s = IO.read_encode(@io, str)
+
+      s.chomp!(@separator) if @chomp
+      $. = @io.__send__(:increment_lineno)
+      @buffer.discard @skip if @skip
+
+      s
     end
   end