Skip to content

Commit 86de331

Browse files
committed
Fix a bug that causes a wrong parse result with :skip_lines
Fix GH-296. This is caused by a bug in nested InputsScanner#keep_start/keep_back: keep_back may restore duplicated data when an internal scanner accepts multiple keep_starts. :skip_lines may cause this situation when a line includes "\n" and the row separator is "\r\n". Reported by Ryo Tsukamoto. Thanks!!!
1 parent 3ae9194 commit 86de331

File tree

3 files changed

+56
-15
lines changed

3 files changed

+56
-15
lines changed

lib/csv/parser.rb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,15 @@ def keep_back
220220
end
221221
# trace(__method__, :repos, start, buffer)
222222
@scanner.pos = start
223+
last_scanner, last_start, last_buffer = @keeps.last
224+
# Drop the last buffer when the last buffer is the same data
225+
# in the last keep. If we keep it, we have duplicated data
226+
# by the next keep_back.
227+
if last_scanner == @scanner and
228+
last_buffer and
229+
last_buffer == last_scanner.string.byteslice(last_start, start)
230+
@keeps.last[2] = nil
231+
end
223232
end
224233
read_chunk if @scanner.eos?
225234
end

test/csv/parse/test_inputs_scanner.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,20 @@
33
class TestCSVParseInputsScanner < Test::Unit::TestCase
44
include CSVHelper
55

6+
def test_scan_keep_nested_back
7+
input = CSV::Parser::UnoptimizedStringIO.new("abcdef")
8+
scanner = CSV::Parser::InputsScanner.new([input],
9+
Encoding::UTF_8,
10+
nil)
11+
scanner.keep_start
12+
assert_equal("abc", scanner.scan_all(/[a-c]+/))
13+
scanner.keep_start
14+
assert_equal("def", scanner.scan_all(/[d-f]+/))
15+
scanner.keep_back
16+
scanner.keep_back
17+
assert_equal("abcdef", scanner.scan_all(/[a-f]+/))
18+
end
19+
620
def test_scan_keep_over_chunks_nested_back
721
input = CSV::Parser::UnoptimizedStringIO.new("abcdefghijklmnl")
822
scanner = CSV::Parser::InputsScanner.new([input],

test/csv/parse/test_skip_lines.rb

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,17 @@ def test_default
1111
assert_nil(csv.skip_lines)
1212
end
1313

14+
def parse(data, **options)
15+
# We use Tempfile here to use CSV::Parser::InputsScanner.
16+
Tempfile.open(["csv-", ".csv"]) do |file|
17+
file.print(data)
18+
file.close
19+
CSV.open(file, **options) do |csv|
20+
csv.read
21+
end
22+
end
23+
end
24+
1425
def test_regexp
1526
csv = <<-CSV
1627
1
@@ -22,7 +33,7 @@ def test_regexp
2233
["1"],
2334
["4"],
2435
],
25-
CSV.parse(csv, :skip_lines => /\A\s*#/))
36+
parse(csv, :skip_lines => /\A\s*#/))
2637
end
2738

2839
def test_regexp_quoted
@@ -37,7 +48,7 @@ def test_regexp_quoted
3748
["#3"],
3849
["4"],
3950
],
40-
CSV.parse(csv, :skip_lines => /\A\s*#/))
51+
parse(csv, :skip_lines => /\A\s*#/))
4152
end
4253

4354
def test_string
@@ -51,7 +62,7 @@ def test_string
5162
["1"],
5263
["4"],
5364
],
54-
CSV.parse(csv, :skip_lines => "."))
65+
parse(csv, :skip_lines => "."))
5566
end
5667

5768
class RegexStub
@@ -88,7 +99,7 @@ def test_matchable
8899
["1"],
89100
["3"],
90101
],
91-
CSV.parse(csv, :skip_lines => Matchable.new(/\A#/)))
102+
parse(csv, :skip_lines => Matchable.new(/\A#/)))
92103
end
93104

94105
def test_multibyte_data
@@ -98,29 +109,36 @@ def test_multibyte_data
98109
value = "\u3042\u3044\u3046"
99110
with_chunk_size("5") do
100111
assert_equal([[value], [value]],
101-
CSV.parse("#{value}\n#{value}\n",
102-
:skip_lines => /\A#/))
112+
parse("#{value}\n#{value}\n",
113+
:skip_lines => /\A#/))
103114
end
104115
end
105116

106117
def test_empty_line_and_liberal_parsing
107118
assert_equal([["a", "b"]],
108-
CSV.parse("a,b\n",
109-
:liberal_parsing => true,
110-
:skip_lines => /^$/))
119+
parse("a,b\n",
120+
:liberal_parsing => true,
121+
:skip_lines => /^$/))
111122
end
112123

113124
def test_crlf
114125
assert_equal([["a", "b"]],
115-
CSV.parse("a,b\r\n,\r\n",
116-
:skip_lines => /^,+$/))
126+
parse("a,b\r\n,\r\n",
127+
:skip_lines => /^,+$/))
117128
end
118129

119130
def test_crlf_strip_no_last_crlf
120131
assert_equal([["a"], ["b"]],
121-
CSV.parse("a\r\nb",
122-
row_sep: "\r\n",
123-
skip_lines: /^ *$/,
124-
strip: true))
132+
parse("a\r\nb",
133+
row_sep: "\r\n",
134+
skip_lines: /^ *$/,
135+
strip: true))
136+
end
137+
138+
def test_crlf_quoted_lf
139+
assert_equal([["\n", ""]],
140+
parse("\"\n\",\"\"\r\n",
141+
row_sep: "\r\n",
142+
skip_lines: /not matched/))
125143
end
126144
end

0 commit comments

Comments
 (0)