3
3
#include < ydb/library/arrow_inference/arrow_inference.h>
4
4
#include < ydb/public/lib/ydb_cli/common/interactive.h>
5
5
#include < ydb/public/lib/ydb_cli/common/pretty_table.h>
6
+ #include < ydb/public/lib/ydb_cli/common/print_utils.h>
7
+ #include < ydb/public/lib/ydb_cli/common/csv_parser.h>
6
8
#include < ydb/public/sdk/cpp/include/ydb-cpp-sdk/client/query/client.h>
7
9
8
10
#include < arrow/csv/options.h>
11
13
#include < arrow/table.h>
12
14
#include < util/string/builder.h>
13
15
#include < library/cpp/string_utils/csv/csv.h>
16
+ #include < util/stream/file.h>
17
+ #include < regex>
14
18
15
19
namespace NYdb ::NConsoleClient {
16
20
@@ -22,8 +26,10 @@ TCommandToolsInfer::TCommandToolsInfer()
22
26
23
27
TCommandToolsInferCsv::TCommandToolsInferCsv ()
24
28
: TYdbCommand(" csv" , {}, " Generate CREATE TABLE SQL query from CSV file"
25
- " \n\n By default, the command attempts to use the first row of the CSV as column names if possible."
26
- " Use the \" --columns\" , \" --gen-names\" or \" --header\" options to set the column names source explicitly." )
29
+ " \n\n By default, if no options are specified, "
30
+ " the command uses the first row of the file as column names if possible"
31
+ " (i.e., if the values meet the requirements for column names and do not match data types in the other rows)."
32
+ " Otherwise, column names will be generated automatically." )
27
33
{}
28
34
29
35
void TCommandToolsInferCsv::Config (TConfig& config) {
@@ -33,19 +39,23 @@ void TCommandToolsInferCsv::Config(TConfig& config) {
33
39
" One or more file paths to infer from. Or CSV data can be passed to stdin instead" );
34
40
config.Opts ->AddLongOption (' p' , " path" , " Database path to table that should be created" )
35
41
.RequiredArgument (" STRING" ).DefaultValue (" table" ).StoreResult (&Path);
36
- config.Opts ->AddLongOption (" columns" ,
42
+ auto & columnsOption = config.Opts ->AddLongOption (" columns" ,
37
43
" Explicitly specifies table column names, as a comma-separated list." )
38
44
.RequiredArgument (" NAMES" ).StoreResult (&ColumnNames);
39
- config.Opts ->AddLongOption (" gen-columns" ,
45
+ auto & genColumnsOption = config.Opts ->AddLongOption (" gen-columns" ,
40
46
" Explicitly indicates that table column names should be generated automatically." )
41
47
.NoArgument ().StoreTrue (&GenerateColumnNames);
42
- config.Opts ->AddLongOption (" header" , " Explicitly indicates that the first row in the CSV contains column names." )
48
+ auto & headerOption = config.Opts ->AddLongOption (" header" , " Explicitly indicates that the first row in the CSV contains column names." )
43
49
.NoArgument ().StoreTrue (&HeaderHasColumnNames);
44
50
config.Opts ->AddLongOption (" rows-to-analyze" , " Number of rows to analyze. "
45
51
" 0 means unlimited. Reading will be stopped soon after this number of rows is read." )
46
52
.DefaultValue (500000 ).StoreResult (&RowsToAnalyze);
47
53
config.Opts ->AddLongOption (" execute" , " Execute CREATE TABLE request right after generation." )
48
54
.NoArgument ().StoreTrue (&Execute);
55
+
56
+ config.Opts ->MutuallyExclusiveOpt (columnsOption, genColumnsOption);
57
+ config.Opts ->MutuallyExclusiveOpt (columnsOption, headerOption);
58
+ config.Opts ->MutuallyExclusiveOpt (genColumnsOption, headerOption);
49
59
}
50
60
51
61
void TCommandToolsInferCsv::Parse (TConfig& config) {
@@ -67,11 +77,6 @@ void TCommandToolsInferCsv::Parse(TConfig& config) {
67
77
ReadingFromStdin = true ;
68
78
}
69
79
}
70
-
71
- if (HeaderHasColumnNames && !ColumnNames.empty ()) {
72
- throw TMisuseException () << " Options --header and --columns are mutually exclusive."
73
- " Use --header if first row in the file containscolumn names. Use --columns to list column names manually." ;
74
- }
75
80
}
76
81
77
82
namespace {
@@ -93,6 +98,16 @@ namespace {
93
98
builder << str;
94
99
}
95
100
}
101
+
102
// Checks whether `name` can be used verbatim as a column name.
//
// A valid name is non-empty, starts with an ASCII letter or underscore, and
// contains only ASCII letters, digits and underscores (the same language as
// the pattern [a-zA-Z_][a-zA-Z0-9_]*).
//
// The check is a plain linear scan rather than std::regex_match: the value
// comes from an arbitrary CSV cell, and recursive regex executors can exhaust
// the stack on very long inputs. Explicit ASCII range comparisons are used
// instead of <cctype> helpers so the result does not depend on the locale.
//
// Returns true if the name is valid, false otherwise (including for "").
bool IsValidColumnName(const std::string& name) {
    if (name.empty()) {
        return false;
    }

    const auto isLetterOrUnderscore = [](char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
    };
    const auto isDigit = [](char c) {
        return c >= '0' && c <= '9';
    };

    // The first character must not be a digit.
    if (!isLetterOrUnderscore(name.front())) {
        return false;
    }

    // Re-checking the first character here is harmless and keeps the loop simple.
    for (const char c : name) {
        if (!isLetterOrUnderscore(c) && !isDigit(c)) {
            return false;
        }
    }
    return true;
}
96
111
}
97
112
98
113
int TCommandToolsInferCsv::Run (TConfig& config) {
@@ -112,34 +127,174 @@ int TCommandToolsInferCsv::Run(TConfig& config) {
112
127
113
128
auto formatConfig = std::make_shared<NArrowInference::TCsvConfig>();
114
129
formatConfig->RowsToAnalyze = RowsToAnalyze;
130
+ formatConfig->Format = NArrowInference::EFileFormat::CsvWithNames;
131
+ formatConfig->ShouldMakeOptional = true ;
132
+
133
+ // Configure CSV parsing options
134
+ formatConfig->ParseOpts .delimiter = ' ,' ; // Use comma as default delimiter
135
+ formatConfig->ParseOpts .quote_char = ' "' ; // Use double quotes as default quote character
136
+ formatConfig->ParseOpts .escape_char = ' \\ ' ; // Use backslash as default escape character
137
+
138
+ // Read the first line of the file if needed
139
+ std::vector<std::string> firstRowValues;
140
+ bool generateColumnNames = GenerateColumnNames;
115
141
if (!ColumnNames.empty ()) {
116
- NCsvFormat::CsvSplitter splitter (ColumnNames);
117
- auto tmp = static_cast <TVector<TString>>(splitter);
118
- std::vector<std::string> columnNames;
142
+ // If --columns option is specified, use explicitly provided names
143
+ auto tmp = static_cast <TVector<TString>>(NCsvFormat::CsvSplitter (ColumnNames));
119
144
for (const auto & columnName : tmp) {
120
- columnNames .push_back (columnName.data ());
145
+ firstRowValues .push_back (columnName.data ());
121
146
}
122
- formatConfig->ReadOpts .column_names = columnNames;
123
- } else if (!HeaderHasColumnNames) {
147
+ formatConfig->ReadOpts .column_names = firstRowValues;
148
+ formatConfig->ReadOpts .autogenerate_column_names = false ;
149
+ } else if (HeaderHasColumnNames) {
150
+ // If --header option is specified, use first row as column names
151
+ formatConfig->ReadOpts .column_names = {};
152
+ formatConfig->ReadOpts .autogenerate_column_names = false ;
153
+ } else if (GenerateColumnNames) {
154
+ // If --gen-columns option is specified, generate names automatically
155
+ formatConfig->ReadOpts .column_names = {};
124
156
formatConfig->ReadOpts .autogenerate_column_names = true ;
125
- }
157
+ } else {
158
+ // If no option is specified:
159
+ // 1. Read the first line of the file
160
+ TFile file;
161
+ if (ReadingFromStdin) {
162
+ if (config.IsVerbose ()) {
163
+ Cerr << " Reading first linefrom stdin" << Endl;
164
+ }
165
+ file = TFile (GetStdinFileno ());
166
+ } else {
167
+ if (config.IsVerbose ()) {
168
+ Cerr << " Reading first line from file " << FilePaths[0 ] << Endl;
169
+ }
170
+ file = TFile (FilePaths[0 ], RdOnly);
171
+ }
172
+ auto input = MakeHolder<TFileInput>(file);
173
+ NCsvFormat::TLinesSplitter csvSplitter (*input);
174
+ TString firstLine = csvSplitter.ConsumeLine ();
126
175
127
- formatConfig->Format = NArrowInference::EFileFormat::CsvWithNames;
176
+ // Check if the line contains newlines inside quotes
177
+ if (firstLine.find (' \n ' ) != TString::npos || firstLine.find (' \r ' ) != TString::npos) {
178
+ // If there are newlines, it's definitely not column names
179
+ formatConfig->ReadOpts .column_names = {};
180
+ formatConfig->ReadOpts .autogenerate_column_names = true ;
181
+ } else {
182
+ // Split the line by delimiter
183
+ auto tmp = static_cast <TVector<TString>>(NCsvFormat::CsvSplitter (firstLine));
184
+ for (const auto & value : tmp) {
185
+ firstRowValues.push_back (value.data ());
186
+ }
128
187
188
+ // 2. Tell the library to generate names automatically
189
+ // We will decide later if we would use generated names or first row as column names
190
+ formatConfig->ReadOpts .column_names = {};
191
+ formatConfig->ReadOpts .autogenerate_column_names = true ;
192
+ formatConfig->ReadOpts .skip_rows = 1 ;
193
+ }
194
+ }
195
+
196
+ // Start file analysis
129
197
auto result = NYdb::NArrowInference::InferTypes (inputs, formatConfig);
130
-
131
198
if (std::holds_alternative<TString>(result)) {
132
199
throw TMisuseException () << " Failed to infer schema: " << std::get<TString>(result);
133
200
}
134
201
135
202
auto & arrowFields = std::get<NYdb::NArrowInference::ArrowFields>(result);
203
+ bool useFirstRowAsColumnNames = false ;
204
+
205
+ // If no option is specified, check if the first row can be used as data
206
+ if (firstRowValues.size () > 0 && ColumnNames.empty () && !HeaderHasColumnNames && !GenerateColumnNames) {
207
+ bool canUseFirstRowAsColumnNames = true ;
208
+ bool canUseFirstRowAsData = false ; // By default, assume we can't use it as data
209
+ if (firstRowValues.size () != arrowFields.size ()) {
210
+ canUseFirstRowAsColumnNames = false ;
211
+ if (config.IsVerbose ()) {
212
+ Cerr << " First row size (" << firstRowValues.size () << " ) doesn't match inferred fields count ("
213
+ << arrowFields.size () << " ), can't use first row as header or data" << Endl;
214
+ }
215
+ } else {
216
+ // First check if all values in the first row can be column names
217
+ for (const auto & value : firstRowValues) {
218
+ if (!IsValidColumnName (value)) {
219
+ canUseFirstRowAsColumnNames = false ;
220
+ generateColumnNames = true ;
221
+ if (config.IsVerbose ()) {
222
+ Cerr << " Value '" << value << " ' is not a valid column name, can't use first row as header."
223
+ " Column names will be generated automatically" << Endl;
224
+ }
225
+ break ;
226
+ }
227
+ }
228
+
229
+ // Only if all values can be column names, check if they can be converted to types
230
+ if (canUseFirstRowAsColumnNames) {
231
+ if (config.IsVerbose ()) {
232
+ Cerr << " All values in first row are valid column names, checking if they can be used as data..." << Endl;
233
+ }
234
+
235
+ canUseFirstRowAsData = true ; // Assume we can use as data until proven otherwise
236
+ for (size_t i = 0 ; i < arrowFields.size (); ++i) {
237
+ auto field = arrowFields[i];
238
+ auto value = firstRowValues[i];
239
+
240
+ // Try to convert value to column type
241
+ Ydb::Type inferredType;
242
+ bool inferResult = NYdb::NArrowInference::ArrowToYdbType (inferredType, *field->type (), formatConfig);
243
+ if (!inferResult) {
244
+ canUseFirstRowAsData = false ;
245
+ if (config.IsVerbose ()) {
246
+ Cerr << " Failed to infer type for column " << i << " , assuming string type" << Endl;
247
+ }
248
+ break ;
249
+ }
250
+ if (!NYdb::NConsoleClient::IsConvertibleToYdbValue (TString (value), inferredType)) {
251
+ canUseFirstRowAsData = false ;
252
+ if (config.IsVerbose ()) {
253
+ Cerr << " Value '" << value << " ' in column " << i << " cannot be converted to inferred type "
254
+ << TType (inferredType) << Endl;
255
+ }
256
+ break ;
257
+ }
258
+ }
259
+
260
+ if (canUseFirstRowAsData) {
261
+ generateColumnNames = true ;
262
+ if (config.IsVerbose ()) {
263
+ Cerr << " All values in the first row can be used as data, "
264
+ " so considering it as data and generating column names" << Endl;
265
+ }
266
+ } else {
267
+ useFirstRowAsColumnNames = true ;
268
+ if (config.IsVerbose ()) {
269
+ Cerr << " First row will be used as column names since values cannot be used as data" << Endl;
270
+ }
271
+ }
272
+ }
273
+ }
274
+ }
275
+
276
+ // Generate SQL query for table creation
136
277
TStringBuilder query;
137
278
query << " CREATE TABLE " ;
138
279
PrintStringQuotedIfNeeded (query, GetRelativePath (Path, config));
139
280
query << " (" << Endl;
281
+ int columnIndex = -1 ;
282
+ std::string firstColumnName;
140
283
for (const auto & field : arrowFields) {
141
- if (field->name ().empty ()) {
142
- continue ;
284
+ ++columnIndex;
285
+ std::string columnName;
286
+ if (useFirstRowAsColumnNames) {
287
+ columnName = firstRowValues[columnIndex];
288
+ } else if (generateColumnNames) {
289
+ columnName = " column" + ToString (columnIndex);
290
+ } else {
291
+ columnName = field->name ();
292
+ if (columnName.empty ()) {
293
+ continue ;
294
+ }
295
+ }
296
+ if (columnIndex == 0 ) {
297
+ firstColumnName = columnName;
143
298
}
144
299
Ydb::Type inferredType;
145
300
bool inferResult = NYdb::NArrowInference::ArrowToYdbType (inferredType, *field->type (), formatConfig);
@@ -161,16 +316,18 @@ int TCommandToolsInferCsv::Run(TConfig& config) {
161
316
" Inferred type kind: " << parser.GetKind ();
162
317
}
163
318
} else if (config.IsVerbose ()) {
164
- Cerr << " Failed to infer type for column " << field-> name () << Endl;
319
+ Cerr << " Failed to infer type for column " << columnName << " with index " << columnIndex << Endl;
165
320
}
166
321
query << " " ;
167
- PrintStringQuotedIfNeeded (query, field->name ());
168
- query << " " << resultType << ' ,' << Endl;
169
- if (!field->nullable ()) {
322
+ PrintStringQuotedIfNeeded (query, columnName);
323
+ query << " " << resultType;
324
+ // Only setting NOT NULL for the first column because we consider it a PRIMARY KEY
325
+ if (!columnIndex) {
170
326
query << " NOT NULL" ;
171
327
}
328
+ query << ' ,' << Endl;
172
329
}
173
- query << " PRIMARY KEY (" << arrowFields[ 0 ]-> name () << " ) -- First column is chosen. Probably need to change this." << Endl;
330
+ query << " PRIMARY KEY (` " << firstColumnName << " ` ) -- First column is chosen. Probably need to change this." << Endl;
174
331
query <<
175
332
R"( )
176
333
WITH (
0 commit comments