Skip to content

Commit 26ce300

Browse files
authored
Version 1.14.3 (#303)
* improved C-performance, and small Ruby parser performance tweak * 1.14.3.pre2 w. improved performance * performance improvements * Version 1.14.3
1 parent 9e9e7da commit 26ce300

File tree

6 files changed

+186
-46
lines changed

6 files changed

+186
-46
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11

22
# SmarterCSV 1.x Change Log
33

4+
## 1.14.3 (2025-05-04)
5+
* Improved C-extension parsing logic:
6+
- Added fast path for unquoted fields to avoid unnecessary quote checks.
7+
- Added inline whitespace stripping inside the C parser.
8+
* Performance
9+
- Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
10+
- Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
11+
412
## 1.14.2 (2025-04-10)
513
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
614
* new option: `header_converter` allows you to programmatically modify the headers

ext/smarter_csv/extconf.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
1010
end
1111

12-
CONFIG["optflags"] = "-O3"
12+
# CONFIG["optflags"] = "-O3 -march=native -flto"
13+
CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
14+
CONFIG["debugflags"] = ""
1315

1416
create_makefile('smarter_csv/smarter_csv')

ext/smarter_csv/smarter_csv.c

Lines changed: 159 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "ruby/encoding.h"
33
#include <stdio.h>
44
#include <stdbool.h>
5+
#include <string.h>
56

67
#ifndef bool
78
#define bool int
@@ -12,8 +13,25 @@
1213
VALUE SmarterCSV = Qnil;
1314
VALUE eMalformedCSVError = Qnil;
1415
VALUE Parser = Qnil;
16+
VALUE Qempty_string = Qnil; // shared frozen empty string
17+
18+
/*
 * Returns a new Ruby string built from the `len` bytes starting at `str`, in
 * which every pair of adjacent `quote_char` bytes is collapsed into a single
 * `quote_char`. All other bytes are copied unchanged. The result is tagged
 * with `encoding`.
 *
 * str        - start of the raw field bytes (need not be NUL-terminated)
 * len        - number of bytes to read from `str`
 * quote_char - the single-byte quote character whose doubled form is unescaped
 * encoding   - encoding to associate with the returned string
 */
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
  /* The output can never be longer than the input, so a Ruby string buffer of
   * capacity `len` is filled in place. Writing straight into a Ruby-owned
   * buffer avoids a separate scratch allocation (and the copy out of it), and
   * means there is no manually managed memory to leak if an allocation raises. */
  VALUE out = rb_str_buf_new(len);
  char *dst = RSTRING_PTR(out);
  long written = 0;

  for (long i = 0; i < len; i++) {
    char c = str[i];
    if (c == quote_char && i + 1 < len && str[i + 1] == quote_char) {
      i++; /* doubled quote: emit one quote and skip its twin */
    }
    dst[written++] = c;
  }

  rb_str_set_len(out, written);    /* publish the real length (<= len) */
  rb_enc_associate(out, encoding); /* rb_str_buf_new does not take an encoding */
  return out;
}
1533

16-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
34+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
1735
if (RB_TYPE_P(line, T_NIL) == 1) {
1836
return rb_ary_new();
1937
}
@@ -22,85 +40,191 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
2240
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
2341
}
2442

25-
rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
26-
char *startP = RSTRING_PTR(line); /* may not be null terminated */
43+
rb_encoding *encoding = rb_enc_get(line);
44+
char *startP = RSTRING_PTR(line);
2745
long line_len = RSTRING_LEN(line);
28-
char *endP = startP + line_len; /* points behind the string */
46+
char *endP = startP + line_len;
2947
char *p = startP;
3048

3149
char *col_sepP = RSTRING_PTR(col_sep);
3250
long col_sep_len = RSTRING_LEN(col_sep);
3351

3452
char *quoteP = RSTRING_PTR(quote_char);
35-
long quote_count = 0;
36-
37-
bool col_sep_found = true;
53+
char quote_char_val = quoteP[0];
54+
size_t quote_len = strlen(quoteP);
3855

3956
VALUE elements = rb_ary_new();
4057
VALUE field;
41-
long i;
4258

43-
/* Variables for escaped quote handling */
59+
long element_count = 0;
60+
int max_fields = -1;
61+
if (max_size != Qnil) {
62+
max_fields = NUM2INT(max_size);
63+
if (max_fields < 0) {
64+
return rb_ary_new();
65+
}
66+
}
67+
68+
bool has_quotes = RTEST(has_quotes_val);
69+
bool strip_ws = RTEST(strip_ws_val);
70+
71+
// === FAST PATH: No quotes and single-character separator ===
72+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
73+
char sep = *col_sepP;
74+
char *sep_pos = NULL;
75+
76+
while ((sep_pos = memchr(p, sep, endP - p))) {
77+
if ((max_fields >= 0) && (element_count >= max_fields)) {
78+
break;
79+
}
80+
81+
long field_len = sep_pos - startP;
82+
char *raw_field = startP;
83+
char *trim_start = raw_field;
84+
char *trim_end = raw_field + field_len - 1;
85+
86+
if (strip_ws) {
87+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
88+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
89+
}
90+
91+
long trimmed_len = trim_end - trim_start + 1;
92+
93+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
94+
rb_ary_push(elements, field);
95+
element_count++;
96+
97+
p = sep_pos + 1;
98+
startP = p;
99+
}
100+
101+
if ((max_fields < 0) || (element_count < max_fields)) {
102+
long field_len = endP - startP;
103+
char *raw_field = startP;
104+
char *trim_start = raw_field;
105+
char *trim_end = raw_field + field_len - 1;
106+
107+
if (strip_ws) {
108+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
109+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
110+
}
111+
112+
long trimmed_len = trim_end - trim_start + 1;
113+
114+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
115+
rb_ary_push(elements, field);
116+
}
117+
118+
return elements;
119+
}
120+
121+
// === SLOW PATH: Quoted fields or multi-char separator ===
122+
long i;
44123
long backslash_count = 0;
45124
bool in_quotes = false;
125+
bool col_sep_found = true;
46126

47127
while (p < endP) {
48-
/* does the remaining string start with col_sep ? */
49128
col_sep_found = true;
50-
for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
51-
col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
129+
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
130+
if (*(p + i) != *(col_sepP + i)) {
131+
col_sep_found = false;
132+
break;
133+
}
52134
}
53-
/* if col_sep was found and we're not inside quotes */
135+
54136
if (col_sep_found && !in_quotes) {
55-
/* if max_size != nil && elements.size >= header_size */
56-
if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
137+
if ((max_fields >= 0) && (element_count >= max_fields)) {
57138
break;
58-
} else {
59-
/* push that field with original encoding onto the results */
60-
field = rb_enc_str_new(startP, p - startP, encoding);
61-
rb_ary_push(elements, field);
139+
}
140+
141+
long field_len = p - startP;
142+
char *raw_field = startP;
62143

63-
p += col_sep_len;
64-
startP = p;
65-
backslash_count = 0; // Reset backslash count at the start of a new field
144+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
145+
if (quoted) {
146+
raw_field++;
147+
field_len -= 2;
148+
}
149+
150+
char *trim_start = raw_field;
151+
char *trim_end = raw_field + field_len - 1;
152+
153+
if (strip_ws) {
154+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
155+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
156+
}
157+
158+
long trimmed_len = trim_end - trim_start + 1;
159+
160+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
161+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
162+
} else {
163+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
66164
}
165+
166+
rb_ary_push(elements, field);
167+
element_count++;
168+
169+
p += col_sep_len;
170+
startP = p;
171+
backslash_count = 0;
67172
} else {
68173
if (*p == '\\') {
69174
backslash_count++;
70175
} else {
71-
if (*p == *quoteP) {
176+
if (*p == quote_char_val) {
72177
if (backslash_count % 2 == 0) {
73-
/* Even number of backslashes means quote is not escaped */
74178
in_quotes = !in_quotes;
75179
}
76-
/* Else, quote is escaped; do nothing */
77180
}
78-
backslash_count = 0; // Reset after any character other than backslash
181+
backslash_count = 0;
79182
}
80183
p++;
81184
}
82-
} /* while */
185+
}
83186

84-
/* Check for unclosed quotes at the end of the line */
85187
if (in_quotes) {
86188
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
87189
}
88190

89-
/* check if the last part of the line needs to be processed */
90-
if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
91-
/* copy the remaining line as a field with original encoding onto the results */
92-
field = rb_enc_str_new(startP, endP - startP, encoding);
191+
if ((max_fields < 0) || (element_count < max_fields)) {
192+
long field_len = endP - startP;
193+
char *raw_field = startP;
194+
195+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
196+
if (quoted) {
197+
raw_field++;
198+
field_len -= 2;
199+
}
200+
201+
char *trim_start = raw_field;
202+
char *trim_end = raw_field + field_len - 1;
203+
204+
if (strip_ws) {
205+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
206+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
207+
}
208+
209+
long trimmed_len = trim_end - trim_start + 1;
210+
211+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
212+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
213+
} else {
214+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
215+
}
216+
93217
rb_ary_push(elements, field);
94218
}
95219

96220
return elements;
97221
}
98222

99223
void Init_smarter_csv(void) {
100-
// these modules and the error class are already defined in Ruby code, make them accessible:
101224
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
102225
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
103226
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
104-
105-
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
227+
Qempty_string = rb_str_new_literal("");
228+
rb_gc_register_address(&Qempty_string);
229+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
106230
}

lib/smarter_csv/parser.rb

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
module SmarterCSV
44
module Parser
5+
EMPTY_STRING = ''.freeze
6+
57
protected
68

79
###
@@ -11,17 +13,16 @@ module Parser
1113
###
1214
def parse(line, options, header_size = nil)
1315
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
16+
has_quotes = line.include?(options[:quote_char])
1417

1518
if options[:acceleration] && has_acceleration
1619
# :nocov:
17-
has_quotes = line =~ /#{options[:quote_char]}/
18-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
19-
elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
20+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace])
2021
[elements, elements.size]
2122
# :nocov:
2223
else
2324
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
24-
parse_csv_line_ruby(line, options, header_size)
25+
parse_csv_line_ruby(line, options, header_size, has_quotes)
2526
end
2627
end
2728

@@ -46,7 +47,7 @@ def parse(line, options, header_size = nil)
4647
#
4748
# Our convention is that empty fields are returned as empty strings, not as nil.
4849

49-
def parse_csv_line_ruby(line, options, header_size = nil)
50+
def parse_csv_line_ruby(line, options, header_size = nil, has_quotes = false)
5051
return [[], 0] if line.nil?
5152

5253
line_size = line.size
@@ -98,21 +99,27 @@ def parse_csv_line_ruby(line, options, header_size = nil)
9899
elements << cleanup_quotes(line[start..-1], quote)
99100
end
100101

102+
elements.map!(&:strip) if options[:strip_whitespace]
101103
[elements, elements.size]
102104
end
103105

104106
def cleanup_quotes(field, quote)
105-
return field if field.nil?
107+
return nil if field.nil?
108+
return EMPTY_STRING if field.empty?
106109

107110
# Remove surrounding quotes if present
108111
if field.start_with?(quote) && field.end_with?(quote)
109112
field = field[1..-2]
110113
end
111114

112115
# Replace double quotes with a single quote
113-
field.gsub!((quote * 2).to_s, quote)
116+
field.gsub!(doubled_quote(quote), quote)
114117

115118
field
116119
end
120+
121+
# Returns the doubled form of +quote+ (e.g. '"' => '""') as a frozen string.
#
# The result is memoized per quote character, so repeated calls with the same
# +quote+ reuse one frozen string, while a call with a different +quote+ gets
# its own correct value instead of a stale one cached for an earlier argument.
#
# @param quote [String] the quote character
# @return [String] frozen string containing +quote+ twice
def doubled_quote(quote)
  @doubled_quotes ||= {}
  @doubled_quotes[quote] ||= (quote * 2).to_s.freeze
end
117124
end
118125
end

lib/smarter_csv/reader.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def process(&block) # rubocop:disable Lint/UnusedMethodArgument
128128
line.chomp!(options[:row_sep])
129129

130130
# --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
131+
# we are now stripping whitespace inside the parse() methods
131132
dataA, data_size = parse(line, options) # we parse the extra columns
132133

133134
if options[:strict]
@@ -141,8 +142,6 @@ def process(&block) # rubocop:disable Lint/UnusedMethodArgument
141142
end
142143
end
143144

144-
dataA.map!{|x| x.strip} if options[:strip_whitespace]
145-
146145
# if all values are blank, then ignore this line
147146
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
148147

lib/smarter_csv/version.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# frozen_string_literal: true
22

33
module SmarterCSV
4-
VERSION = "1.14.2"
4+
VERSION = "1.14.3"
55
end

0 commit comments

Comments
 (0)