2
2
#include "ruby/encoding.h"
3
3
#include <stdio.h>
4
4
#include <stdbool.h>
5
+ #include <string.h>
5
6
6
7
#ifndef bool
7
8
#define bool int
12
13
/* Extension-wide handles. All start out as Qnil and are filled in by
 * Init_smarter_csv when the extension is loaded. */
/* The top-level SmarterCSV constant, looked up on Object. */
VALUE SmarterCSV = Qnil ;
13
14
/* SmarterCSV::MalformedCSV, raised for an unclosed quoted field. */
VALUE eMalformedCSVError = Qnil ;
14
15
/* SmarterCSV::Parser, the receiver of parse_csv_line_c. */
VALUE Parser = Qnil ;
16
+ VALUE Qempty_string = Qnil ; // shared empty string; created and GC-registered in Init_smarter_csv
17
+
18
+ static VALUE unescape_quotes (char * str , long len , char quote_char , rb_encoding * encoding ) {
19
+ char * buf = ALLOC_N (char , len );
20
+ long j = 0 ;
21
+ for (long i = 0 ; i < len ; i ++ ) {
22
+ if (str [i ] == quote_char && i + 1 < len && str [i + 1 ] == quote_char ) {
23
+ buf [j ++ ] = quote_char ;
24
+ i ++ ; // skip second quote
25
+ } else {
26
+ buf [j ++ ] = str [i ];
27
+ }
28
+ }
29
+ VALUE out = rb_enc_str_new (buf , j , encoding );
30
+ xfree (buf );
31
+ return out ;
32
+ }
15
33
16
- static VALUE rb_parse_csv_line (VALUE self , VALUE line , VALUE col_sep , VALUE quote_char , VALUE max_size ) {
34
+ static VALUE rb_parse_csv_line (VALUE self , VALUE line , VALUE col_sep , VALUE quote_char , VALUE max_size , VALUE has_quotes_val , VALUE strip_ws_val ) {
17
35
if (RB_TYPE_P (line , T_NIL ) == 1 ) {
18
36
return rb_ary_new ();
19
37
}
@@ -22,85 +40,191 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
22
40
rb_raise (rb_eTypeError , "ERROR in SmarterCSV.parse_line: line has to be a string or nil" );
23
41
}
24
42
25
- rb_encoding * encoding = rb_enc_get (line ); /* get the encoding from the input line */
26
- char * startP = RSTRING_PTR (line ); /* may not be null terminated */
43
+ rb_encoding * encoding = rb_enc_get (line );
44
+ char * startP = RSTRING_PTR (line );
27
45
long line_len = RSTRING_LEN (line );
28
- char * endP = startP + line_len ; /* points behind the string */
46
+ char * endP = startP + line_len ;
29
47
char * p = startP ;
30
48
31
49
char * col_sepP = RSTRING_PTR (col_sep );
32
50
long col_sep_len = RSTRING_LEN (col_sep );
33
51
34
52
char * quoteP = RSTRING_PTR (quote_char );
35
- long quote_count = 0 ;
36
-
37
- bool col_sep_found = true;
53
+ char quote_char_val = quoteP [0 ];
54
+ size_t quote_len = strlen (quoteP );
38
55
39
56
VALUE elements = rb_ary_new ();
40
57
VALUE field ;
41
- long i ;
42
58
43
- /* Variables for escaped quote handling */
59
+ long element_count = 0 ;
60
+ int max_fields = -1 ;
61
+ if (max_size != Qnil ) {
62
+ max_fields = NUM2INT (max_size );
63
+ if (max_fields < 0 ) {
64
+ return rb_ary_new ();
65
+ }
66
+ }
67
+
68
+ bool has_quotes = RTEST (has_quotes_val );
69
+ bool strip_ws = RTEST (strip_ws_val );
70
+
71
+ // === FAST PATH: No quotes and single-character separator ===
72
+ if (__builtin_expect (!has_quotes && col_sep_len == 1 , 1 )) {
73
+ char sep = * col_sepP ;
74
+ char * sep_pos = NULL ;
75
+
76
+ while ((sep_pos = memchr (p , sep , endP - p ))) {
77
+ if ((max_fields >= 0 ) && (element_count >= max_fields )) {
78
+ break ;
79
+ }
80
+
81
+ long field_len = sep_pos - startP ;
82
+ char * raw_field = startP ;
83
+ char * trim_start = raw_field ;
84
+ char * trim_end = raw_field + field_len - 1 ;
85
+
86
+ if (strip_ws ) {
87
+ while (trim_start <= trim_end && (* trim_start == ' ' || * trim_start == '\t' )) trim_start ++ ;
88
+ while (trim_end >= trim_start && (* trim_end == ' ' || * trim_end == '\t' )) trim_end -- ;
89
+ }
90
+
91
+ long trimmed_len = trim_end - trim_start + 1 ;
92
+
93
+ field = rb_enc_str_new (trim_start , trimmed_len , encoding );
94
+ rb_ary_push (elements , field );
95
+ element_count ++ ;
96
+
97
+ p = sep_pos + 1 ;
98
+ startP = p ;
99
+ }
100
+
101
+ if ((max_fields < 0 ) || (element_count < max_fields )) {
102
+ long field_len = endP - startP ;
103
+ char * raw_field = startP ;
104
+ char * trim_start = raw_field ;
105
+ char * trim_end = raw_field + field_len - 1 ;
106
+
107
+ if (strip_ws ) {
108
+ while (trim_start <= trim_end && (* trim_start == ' ' || * trim_start == '\t' )) trim_start ++ ;
109
+ while (trim_end >= trim_start && (* trim_end == ' ' || * trim_end == '\t' )) trim_end -- ;
110
+ }
111
+
112
+ long trimmed_len = trim_end - trim_start + 1 ;
113
+
114
+ field = rb_enc_str_new (trim_start , trimmed_len , encoding );
115
+ rb_ary_push (elements , field );
116
+ }
117
+
118
+ return elements ;
119
+ }
120
+
121
+ // === SLOW PATH: Quoted fields or multi-char separator ===
122
+ long i ;
44
123
long backslash_count = 0 ;
45
124
bool in_quotes = false;
125
+ bool col_sep_found = true;
46
126
47
127
while (p < endP ) {
48
- /* does the remaining string start with col_sep ? */
49
128
col_sep_found = true;
50
- for (i = 0 ; (i < col_sep_len ) && (p + i < endP ); i ++ ) {
51
- col_sep_found = col_sep_found && (* (p + i ) == * (col_sepP + i ));
129
+ for (i = 0 ; (i < col_sep_len ) && (p + i < endP ); i ++ ) {
130
+ if (* (p + i ) != * (col_sepP + i )) {
131
+ col_sep_found = false;
132
+ break ;
133
+ }
52
134
}
53
- /* if col_sep was found and we're not inside quotes */
135
+
54
136
if (col_sep_found && !in_quotes ) {
55
- /* if max_size != nil && elements.size >= header_size */
56
- if ((max_size != Qnil ) && RARRAY_LEN (elements ) >= NUM2INT (max_size )) {
137
+ if ((max_fields >= 0 ) && (element_count >= max_fields )) {
57
138
break ;
58
- } else {
59
- /* push that field with original encoding onto the results */
60
- field = rb_enc_str_new ( startP , p - startP , encoding ) ;
61
- rb_ary_push ( elements , field ) ;
139
+ }
140
+
141
+ long field_len = p - startP ;
142
+ char * raw_field = startP ;
62
143
63
- p += col_sep_len ;
64
- startP = p ;
65
- backslash_count = 0 ; // Reset backslash count at the start of a new field
144
+ bool quoted = (field_len >= 2 && raw_field [0 ] == quote_char_val && raw_field [field_len - 1 ] == quote_char_val );
145
+ if (quoted ) {
146
+ raw_field ++ ;
147
+ field_len -= 2 ;
148
+ }
149
+
150
+ char * trim_start = raw_field ;
151
+ char * trim_end = raw_field + field_len - 1 ;
152
+
153
+ if (strip_ws ) {
154
+ while (trim_start <= trim_end && (* trim_start == ' ' || * trim_start == '\t' )) trim_start ++ ;
155
+ while (trim_end >= trim_start && (* trim_end == ' ' || * trim_end == '\t' )) trim_end -- ;
156
+ }
157
+
158
+ long trimmed_len = trim_end - trim_start + 1 ;
159
+
160
+ if (quoted || memchr (trim_start , quote_char_val , trimmed_len )) {
161
+ field = unescape_quotes (trim_start , trimmed_len , quote_char_val , encoding );
162
+ } else {
163
+ field = rb_enc_str_new (trim_start , trimmed_len , encoding );
66
164
}
165
+
166
+ rb_ary_push (elements , field );
167
+ element_count ++ ;
168
+
169
+ p += col_sep_len ;
170
+ startP = p ;
171
+ backslash_count = 0 ;
67
172
} else {
68
173
if (* p == '\\' ) {
69
174
backslash_count ++ ;
70
175
} else {
71
- if (* p == * quoteP ) {
176
+ if (* p == quote_char_val ) {
72
177
if (backslash_count % 2 == 0 ) {
73
- /* Even number of backslashes means quote is not escaped */
74
178
in_quotes = !in_quotes ;
75
179
}
76
- /* Else, quote is escaped; do nothing */
77
180
}
78
- backslash_count = 0 ; // Reset after any character other than backslash
181
+ backslash_count = 0 ;
79
182
}
80
183
p ++ ;
81
184
}
82
- } /* while */
185
+ }
83
186
84
- /* Check for unclosed quotes at the end of the line */
85
187
if (in_quotes ) {
86
188
rb_raise (eMalformedCSVError , "Unclosed quoted field detected in line: %s" , StringValueCStr (line ));
87
189
}
88
190
89
- /* check if the last part of the line needs to be processed */
90
- if ((max_size == Qnil ) || RARRAY_LEN (elements ) < NUM2INT (max_size )) {
91
- /* copy the remaining line as a field with original encoding onto the results */
92
- field = rb_enc_str_new (startP , endP - startP , encoding );
191
+ if ((max_fields < 0 ) || (element_count < max_fields )) {
192
+ long field_len = endP - startP ;
193
+ char * raw_field = startP ;
194
+
195
+ bool quoted = (field_len >= 2 && raw_field [0 ] == quote_char_val && raw_field [field_len - 1 ] == quote_char_val );
196
+ if (quoted ) {
197
+ raw_field ++ ;
198
+ field_len -= 2 ;
199
+ }
200
+
201
+ char * trim_start = raw_field ;
202
+ char * trim_end = raw_field + field_len - 1 ;
203
+
204
+ if (strip_ws ) {
205
+ while (trim_start <= trim_end && (* trim_start == ' ' || * trim_start == '\t' )) trim_start ++ ;
206
+ while (trim_end >= trim_start && (* trim_end == ' ' || * trim_end == '\t' )) trim_end -- ;
207
+ }
208
+
209
+ long trimmed_len = trim_end - trim_start + 1 ;
210
+
211
+ if (quoted || memchr (trim_start , quote_char_val , trimmed_len )) {
212
+ field = unescape_quotes (trim_start , trimmed_len , quote_char_val , encoding );
213
+ } else {
214
+ field = rb_enc_str_new (trim_start , trimmed_len , encoding );
215
+ }
216
+
93
217
rb_ary_push (elements , field );
94
218
}
95
219
96
220
return elements ;
97
221
}
98
222
99
223
void Init_smarter_csv (void ) {
100
- // these modules and the error class are already defined in Ruby code, make them accessible:
101
224
SmarterCSV = rb_const_get (rb_cObject , rb_intern ("SmarterCSV" ));
102
225
Parser = rb_const_get (SmarterCSV , rb_intern ("Parser" ));
103
226
eMalformedCSVError = rb_const_get (SmarterCSV , rb_intern ("MalformedCSV" ));
104
-
105
- rb_define_module_function (Parser , "parse_csv_line_c" , rb_parse_csv_line , 4 );
227
+ Qempty_string = rb_str_new_literal ("" );
228
+ rb_gc_register_address (& Qempty_string );
229
+ rb_define_module_function (Parser , "parse_csv_line_c" , rb_parse_csv_line , 6 );
106
230
}
0 commit comments