From c71fd12f82ad3e5e87b59c6adede0cdf113c622c Mon Sep 17 00:00:00 2001 From: John W Higgins Date: Tue, 1 Jul 2025 19:45:59 -0700 Subject: [PATCH 1/4] Increase speed of Table#to_csv when encoding is provided If we set the encoding when we call Table#to_csv we do not need to go row by row to determine the encoding to use. This allows the use of CSV.generate_lines as a faster exporter. --- lib/csv/table.rb | 11 ++++++++--- test/csv/test_table.rb | 9 +++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/csv/table.rb b/lib/csv/table.rb index fb19f545..66a2d596 100644 --- a/lib/csv/table.rb +++ b/lib/csv/table.rb @@ -1006,10 +1006,15 @@ def to_csv(write_headers: true, limit: nil, **options) limit ||= @table.size limit = @table.size + 1 + limit if limit < 0 limit = 0 if limit < 0 - @table.first(limit).each do |row| - array.push(row.fields.to_csv(**options)) unless row.header_row? - end + if options[:encoding] + rows = @table.first(limit).select { |row| !row.header_row? } + array.push(CSV.generate_lines(rows, **options)) + else + @table.first(limit).each do |row| + array.push(row.fields.to_csv(**options)) unless row.header_row?
+ end + end array.join("") end alias_method :to_s, :to_csv diff --git a/test/csv/test_table.rb b/test/csv/test_table.rb index e8ab7404..d63b32a3 100644 --- a/test/csv/test_table.rb +++ b/test/csv/test_table.rb @@ -373,6 +373,15 @@ def test_to_csv_limit_negative_over CSV end + def test_to_csv_encoding + rows = [ CSV::Row.new(%w{A}, ["\x00\xac".force_encoding("ASCII-8BIT")]), + CSV::Row.new(%w{A}, ["\x00\xac"]) ] + table = CSV::Table.new(rows) + + assert_equal('UTF-8', table.to_csv(encoding: 'UTF-8').encoding.to_s) + assert_raises(Encoding::CompatibilityError) {table.to_csv} + end + def test_append # verify that we can chain the call assert_equal(@table, @table << [10, 11, 12]) From b2b6e8094f1a575c7236a2647c0112a8e5b388eb Mon Sep 17 00:00:00 2001 From: John W Higgins Date: Wed, 2 Jul 2025 20:07:45 -0700 Subject: [PATCH 2/4] Add benchmark for Table#to_csv improvement --- benchmark/table.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 benchmark/table.yaml diff --git a/benchmark/table.yaml b/benchmark/table.yaml new file mode 100644 index 00000000..9a67c8cd --- /dev/null +++ b/benchmark/table.yaml @@ -0,0 +1,27 @@ +loop_count: 100 +contexts: + - gems: + csv: 3.3.0 + - name: "master" + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "csv" +prelude: |- + n_columns = Integer(ENV.fetch("N_COLUMNS", "5"), 10) + n_rows = Integer(ENV.fetch("N_ROWS", "100"), 10) + fields = ["AAAAA"] * n_columns + headers = n_columns.times.collect do |i| + "header#{i}" + end + row = CSV::Row.new(headers, fields) + rows = [row] * n_rows + table = CSV::Table.new(rows) + rows = [row] * n_rows * 10 + large_table = CSV::Table.new(rows) +benchmark: + "to_csv: no encoding": |- + table.to_csv + "to_csv: encoding": |- + table.to_csv(encoding: 'UTF-8') + "to_csv: encoding - 10 x rows": |- + large_table.to_csv(encoding: 'UTF-8') From cf0cec0c553b8fa5568ea5c7b7c3ee4537a8e4f1 Mon Sep 17 00:00:00 2001 From: John W Higgins Date: Fri, 4 Jul 2025 
09:06:46 -0700 Subject: [PATCH 3/4] Move CSV row encoding calculation to its own method --- lib/csv.rb | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/csv.rb b/lib/csv.rb index aef96ac9..ddc0b9ed 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -1506,17 +1506,7 @@ def generate_line(row, **options) if options[:encoding] str.force_encoding(options[:encoding]) else - fallback_encoding = nil - output_encoding = nil - row.each do |field| - next unless field.is_a?(String) - fallback_encoding ||= field.encoding - next if field.ascii_only? - output_encoding = field.encoding - break - end - output_encoding ||= fallback_encoding - if output_encoding + if output_encoding = row_encoding(row) str.force_encoding(output_encoding) end end @@ -1960,6 +1950,20 @@ def table(path, **options) private_constant :ON_WINDOWS private + + def row_encoding(row) + fallback_encoding = nil + output_encoding = nil + row.each do |field| + next unless field.is_a?(String) + fallback_encoding ||= field.encoding + next if field.ascii_only? + output_encoding = field.encoding + break + end + output_encoding || fallback_encoding + end + def may_enable_bom_detection_automatically(filename_or_io, mode, options, From 1df04bda7e4dba65838c10a87f4f6a98bce6cc2e Mon Sep 17 00:00:00 2001 From: John W Higgins Date: Fri, 4 Jul 2025 15:40:38 -0700 Subject: [PATCH 4/4] Update test based on @kou suggestion --- test/csv/test_table.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/csv/test_table.rb b/test/csv/test_table.rb index d63b32a3..7744a34b 100644 --- a/test/csv/test_table.rb +++ b/test/csv/test_table.rb @@ -378,7 +378,7 @@ def test_to_csv_encoding CSV::Row.new(%w{A}, ["\x00\xac"]) ] table = CSV::Table.new(rows) - assert_equal('UTF-8', table.to_csv(encoding: 'UTF-8').encoding.to_s) + assert_equal(Encoding::UTF_8, table.to_csv(encoding: 'UTF-8').encoding) assert_raises(Encoding::CompatibilityError) {table.to_csv} end