Skip to content

Commit f2860c5

Browse files
authored
Add tools infer csv default behaviour (#20248)
1 parent f0e6047 commit f2860c5

File tree

6 files changed

+208
-35
lines changed

6 files changed

+208
-35
lines changed

ydb/public/lib/ydb_cli/commands/ydb_tools_infer.cpp

Lines changed: 183 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
#include <ydb/library/arrow_inference/arrow_inference.h>
44
#include <ydb/public/lib/ydb_cli/common/interactive.h>
55
#include <ydb/public/lib/ydb_cli/common/pretty_table.h>
6+
#include <ydb/public/lib/ydb_cli/common/print_utils.h>
7+
#include <ydb/public/lib/ydb_cli/common/csv_parser.h>
68
#include <ydb/public/sdk/cpp/include/ydb-cpp-sdk/client/query/client.h>
79

810
#include <arrow/csv/options.h>
@@ -11,6 +13,8 @@
1113
#include <arrow/table.h>
1214
#include <util/string/builder.h>
1315
#include <library/cpp/string_utils/csv/csv.h>
16+
#include <util/stream/file.h>
17+
#include <regex>
1418

1519
namespace NYdb::NConsoleClient {
1620

@@ -22,8 +26,10 @@ TCommandToolsInfer::TCommandToolsInfer()
2226

2327
TCommandToolsInferCsv::TCommandToolsInferCsv()
2428
: TYdbCommand("csv", {}, "Generate CREATE TABLE SQL query from CSV file"
25-
"\n\nBy default, the command attempts to use the first row of the CSV as column names if possible."
26-
" Use the \"--columns\", \"--gen-names\" or \"--header\" options to set the column names source explicitly.")
29+
"\n\nBy default, if no options are specified, "
30+
"the command uses the first row of the file as column names if possible"
31+
" (i.e., if the values meet the requirements for column names and do not match data types in the other rows)."
32+
" Otherwise, column names will be generated automatically.")
2733
{}
2834

2935
void TCommandToolsInferCsv::Config(TConfig& config) {
@@ -33,19 +39,23 @@ void TCommandToolsInferCsv::Config(TConfig& config) {
3339
"One or more file paths to infer from. Or CSV data can be passed to stdin instead");
3440
config.Opts->AddLongOption('p', "path", "Database path to table that should be created")
3541
.RequiredArgument("STRING").DefaultValue("table").StoreResult(&Path);
36-
config.Opts->AddLongOption("columns",
42+
auto& columnsOption = config.Opts->AddLongOption("columns",
3743
"Explicitly specifies table column names, as a comma-separated list.")
3844
.RequiredArgument("NAMES").StoreResult(&ColumnNames);
39-
config.Opts->AddLongOption("gen-columns",
45+
auto& genColumnsOption = config.Opts->AddLongOption("gen-columns",
4046
"Explicitly indicates that table column names should be generated automatically.")
4147
.NoArgument().StoreTrue(&GenerateColumnNames);
42-
config.Opts->AddLongOption("header", "Explicitly indicates that the first row in the CSV contains column names.")
48+
auto& headerOption = config.Opts->AddLongOption("header", "Explicitly indicates that the first row in the CSV contains column names.")
4349
.NoArgument().StoreTrue(&HeaderHasColumnNames);
4450
config.Opts->AddLongOption("rows-to-analyze", "Number of rows to analyze. "
4551
"0 means unlimited. Reading will be stopped soon after this number of rows is read.")
4652
.DefaultValue(500000).StoreResult(&RowsToAnalyze);
4753
config.Opts->AddLongOption("execute", "Execute CREATE TABLE request right after generation.")
4854
.NoArgument().StoreTrue(&Execute);
55+
56+
config.Opts->MutuallyExclusiveOpt(columnsOption, genColumnsOption);
57+
config.Opts->MutuallyExclusiveOpt(columnsOption, headerOption);
58+
config.Opts->MutuallyExclusiveOpt(genColumnsOption, headerOption);
4959
}
5060

5161
void TCommandToolsInferCsv::Parse(TConfig& config) {
@@ -67,11 +77,6 @@ void TCommandToolsInferCsv::Parse(TConfig& config) {
6777
ReadingFromStdin = true;
6878
}
6979
}
70-
71-
if (HeaderHasColumnNames && !ColumnNames.empty()) {
72-
throw TMisuseException() << "Options --header and --columns are mutually exclusive."
73-
" Use --header if first row in the file containscolumn names. Use --columns to list column names manually.";
74-
}
7580
}
7681

7782
namespace {
@@ -93,6 +98,16 @@ namespace {
9398
builder << str;
9499
}
95100
}
101+
102+
// Returns true when `name` can be used verbatim as a column name:
// it must be non-empty, start with an ASCII letter or underscore, and
// contain only ASCII letters, digits and underscores.
//
// Implemented with plain character comparisons instead of std::regex:
// the rule is trivial, and this avoids constructing a regex object
// (which is slow to build and may throw std::regex_error).
bool IsValidColumnName(const std::string& name) {
    if (name.empty()) {
        return false;
    }

    // Explicit ASCII ranges keep the check independent of the current locale.
    const auto isLetterOrUnderscore = [](char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
    };
    const auto isDigit = [](char c) {
        return c >= '0' && c <= '9';
    };

    // The first character has a stricter rule: digits are not allowed.
    if (!isLetterOrUnderscore(name.front())) {
        return false;
    }

    for (const char c : name) {
        if (!isLetterOrUnderscore(c) && !isDigit(c)) {
            return false;
        }
    }
    return true;
}
96111
}
97112

98113
int TCommandToolsInferCsv::Run(TConfig& config) {
@@ -112,34 +127,174 @@ int TCommandToolsInferCsv::Run(TConfig& config) {
112127

113128
auto formatConfig = std::make_shared<NArrowInference::TCsvConfig>();
114129
formatConfig->RowsToAnalyze = RowsToAnalyze;
130+
formatConfig->Format = NArrowInference::EFileFormat::CsvWithNames;
131+
formatConfig->ShouldMakeOptional = true;
132+
133+
// Configure CSV parsing options
134+
formatConfig->ParseOpts.delimiter = ','; // Use comma as default delimiter
135+
formatConfig->ParseOpts.quote_char = '"'; // Use double quotes as default quote character
136+
formatConfig->ParseOpts.escape_char = '\\'; // Use backslash as default escape character
137+
138+
// Read the first line of the file if needed
139+
std::vector<std::string> firstRowValues;
140+
bool generateColumnNames = GenerateColumnNames;
115141
if (!ColumnNames.empty()) {
116-
NCsvFormat::CsvSplitter splitter(ColumnNames);
117-
auto tmp = static_cast<TVector<TString>>(splitter);
118-
std::vector<std::string> columnNames;
142+
// If --columns option is specified, use explicitly provided names
143+
auto tmp = static_cast<TVector<TString>>(NCsvFormat::CsvSplitter(ColumnNames));
119144
for (const auto& columnName : tmp) {
120-
columnNames.push_back(columnName.data());
145+
firstRowValues.push_back(columnName.data());
121146
}
122-
formatConfig->ReadOpts.column_names = columnNames;
123-
} else if (!HeaderHasColumnNames) {
147+
formatConfig->ReadOpts.column_names = firstRowValues;
148+
formatConfig->ReadOpts.autogenerate_column_names = false;
149+
} else if (HeaderHasColumnNames) {
150+
// If --header option is specified, use first row as column names
151+
formatConfig->ReadOpts.column_names = {};
152+
formatConfig->ReadOpts.autogenerate_column_names = false;
153+
} else if (GenerateColumnNames) {
154+
// If --gen-columns option is specified, generate names automatically
155+
formatConfig->ReadOpts.column_names = {};
124156
formatConfig->ReadOpts.autogenerate_column_names = true;
125-
}
157+
} else {
158+
// If no option is specified:
159+
// 1. Read the first line of the file
160+
TFile file;
161+
if (ReadingFromStdin) {
162+
if (config.IsVerbose()) {
163+
Cerr << "Reading first linefrom stdin" << Endl;
164+
}
165+
file = TFile(GetStdinFileno());
166+
} else {
167+
if (config.IsVerbose()) {
168+
Cerr << "Reading first line from file " << FilePaths[0] << Endl;
169+
}
170+
file = TFile(FilePaths[0], RdOnly);
171+
}
172+
auto input = MakeHolder<TFileInput>(file);
173+
NCsvFormat::TLinesSplitter csvSplitter(*input);
174+
TString firstLine = csvSplitter.ConsumeLine();
126175

127-
formatConfig->Format = NArrowInference::EFileFormat::CsvWithNames;
176+
// Check if the line contains newlines inside quotes
177+
if (firstLine.find('\n') != TString::npos || firstLine.find('\r') != TString::npos) {
178+
// If there are newlines, it's definitely not column names
179+
formatConfig->ReadOpts.column_names = {};
180+
formatConfig->ReadOpts.autogenerate_column_names = true;
181+
} else {
182+
// Split the line by delimiter
183+
auto tmp = static_cast<TVector<TString>>(NCsvFormat::CsvSplitter(firstLine));
184+
for (const auto& value : tmp) {
185+
firstRowValues.push_back(value.data());
186+
}
128187

188+
// 2. Tell the library to generate names automatically
189+
// We will decide later if we would use generated names or first row as column names
190+
formatConfig->ReadOpts.column_names = {};
191+
formatConfig->ReadOpts.autogenerate_column_names = true;
192+
formatConfig->ReadOpts.skip_rows = 1;
193+
}
194+
}
195+
196+
// Start file analysis
129197
auto result = NYdb::NArrowInference::InferTypes(inputs, formatConfig);
130-
131198
if (std::holds_alternative<TString>(result)) {
132199
throw TMisuseException() << "Failed to infer schema: " << std::get<TString>(result);
133200
}
134201

135202
auto& arrowFields = std::get<NYdb::NArrowInference::ArrowFields>(result);
203+
bool useFirstRowAsColumnNames = false;
204+
205+
// If no option is specified, check if the first row can be used as data
206+
if (firstRowValues.size() > 0 && ColumnNames.empty() && !HeaderHasColumnNames && !GenerateColumnNames) {
207+
bool canUseFirstRowAsColumnNames = true;
208+
bool canUseFirstRowAsData = false; // By default, assume we can't use it as data
209+
if (firstRowValues.size() != arrowFields.size()) {
210+
canUseFirstRowAsColumnNames = false;
211+
if (config.IsVerbose()) {
212+
Cerr << "First row size (" << firstRowValues.size() << ") doesn't match inferred fields count ("
213+
<< arrowFields.size() << "), can't use first row as header or data" << Endl;
214+
}
215+
} else {
216+
// First check if all values in the first row can be column names
217+
for (const auto& value : firstRowValues) {
218+
if (!IsValidColumnName(value)) {
219+
canUseFirstRowAsColumnNames = false;
220+
generateColumnNames = true;
221+
if (config.IsVerbose()) {
222+
Cerr << "Value '" << value << "' is not a valid column name, can't use first row as header."
223+
" Column names will be generated automatically" << Endl;
224+
}
225+
break;
226+
}
227+
}
228+
229+
// Only if all values can be column names, check if they can be converted to types
230+
if (canUseFirstRowAsColumnNames) {
231+
if (config.IsVerbose()) {
232+
Cerr << "All values in first row are valid column names, checking if they can be used as data..." << Endl;
233+
}
234+
235+
canUseFirstRowAsData = true; // Assume we can use as data until proven otherwise
236+
for (size_t i = 0; i < arrowFields.size(); ++i) {
237+
auto field = arrowFields[i];
238+
auto value = firstRowValues[i];
239+
240+
// Try to convert value to column type
241+
Ydb::Type inferredType;
242+
bool inferResult = NYdb::NArrowInference::ArrowToYdbType(inferredType, *field->type(), formatConfig);
243+
if (!inferResult) {
244+
canUseFirstRowAsData = false;
245+
if (config.IsVerbose()) {
246+
Cerr << "Failed to infer type for column " << i << ", assuming string type" << Endl;
247+
}
248+
break;
249+
}
250+
if (!NYdb::NConsoleClient::IsConvertibleToYdbValue(TString(value), inferredType)) {
251+
canUseFirstRowAsData = false;
252+
if (config.IsVerbose()) {
253+
Cerr << "Value '" << value << "' in column " << i << " cannot be converted to inferred type "
254+
<< TType(inferredType) << Endl;
255+
}
256+
break;
257+
}
258+
}
259+
260+
if (canUseFirstRowAsData) {
261+
generateColumnNames = true;
262+
if (config.IsVerbose()) {
263+
Cerr << "All values in the first row can be used as data, "
264+
"so considering it as data and generating column names" << Endl;
265+
}
266+
} else {
267+
useFirstRowAsColumnNames = true;
268+
if (config.IsVerbose()) {
269+
Cerr << "First row will be used as column names since values cannot be used as data" << Endl;
270+
}
271+
}
272+
}
273+
}
274+
}
275+
276+
// Generate SQL query for table creation
136277
TStringBuilder query;
137278
query << "CREATE TABLE ";
138279
PrintStringQuotedIfNeeded(query, GetRelativePath(Path, config));
139280
query << " (" << Endl;
281+
int columnIndex = -1;
282+
std::string firstColumnName;
140283
for (const auto& field : arrowFields) {
141-
if (field->name().empty()) {
142-
continue;
284+
++columnIndex;
285+
std::string columnName;
286+
if (useFirstRowAsColumnNames) {
287+
columnName = firstRowValues[columnIndex];
288+
} else if (generateColumnNames) {
289+
columnName = "column" + ToString(columnIndex);
290+
} else {
291+
columnName = field->name();
292+
if (columnName.empty()) {
293+
continue;
294+
}
295+
}
296+
if (columnIndex == 0) {
297+
firstColumnName = columnName;
143298
}
144299
Ydb::Type inferredType;
145300
bool inferResult = NYdb::NArrowInference::ArrowToYdbType(inferredType, *field->type(), formatConfig);
@@ -161,16 +316,18 @@ int TCommandToolsInferCsv::Run(TConfig& config) {
161316
" Inferred type kind: " << parser.GetKind();
162317
}
163318
} else if (config.IsVerbose()) {
164-
Cerr << "Failed to infer type for column " << field->name() << Endl;
319+
Cerr << "Failed to infer type for column " << columnName << " with index " << columnIndex << Endl;
165320
}
166321
query << " ";
167-
PrintStringQuotedIfNeeded(query, field->name());
168-
query << " " << resultType << ',' << Endl;
169-
if (!field->nullable()) {
322+
PrintStringQuotedIfNeeded(query, columnName);
323+
query << " " << resultType;
324+
// Only setting NOT NULL for the first column because we consider it a PRIMARY KEY
325+
if (!columnIndex) {
170326
query << " NOT NULL";
171327
}
328+
query << ',' << Endl;
172329
}
173-
query << " PRIMARY KEY (" << arrowFields[0]->name() << ") -- First column is chosen. Probably need to change this." << Endl;
330+
query << " PRIMARY KEY (`" << firstColumnName << "`) -- First column is chosen. Probably need to change this." << Endl;
174331
query <<
175332
R"()
176333
WITH (

ydb/public/lib/ydb_cli/common/csv_parser.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -659,6 +659,16 @@ const TString& TCsvParser::GetHeaderRow() const {
659659
return HeaderRow;
660660
}
661661

662+
// Reports whether the string `value` can be converted to the YDB type `type`.
// The check is performed by attempting the conversion via FieldToValue and
// treating any thrown exception as "not convertible".
// Returns true if the conversion completes without throwing, false otherwise.
bool IsConvertibleToYdbValue(const TString& value, const Ydb::Type& type) {
663+
try {
664+
TTypeParser parser(type);
665+
// The converted result is discarded; only success/failure matters here.
// NOTE(review): "columnName" looks like a placeholder (presumably only used
// in error messages produced by FieldToValue) — confirm.
FieldToValue(parser, value, std::nullopt, {}, "columnName");
666+
return true;
667+
// Catch-all: every exception type is mapped to a negative answer.
} catch (...) {
668+
return false;
669+
}
670+
}
671+
662672
}
663673
}
664674

ydb/public/lib/ydb_cli/common/csv_parser.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include <ydb/public/sdk/cpp/include/ydb-cpp-sdk/client/params/params.h>
44
#include <google/protobuf/arena.h>
5+
#include <ydb/public/api/protos/ydb_value.pb.h>
56

67
#include <library/cpp/string_utils/csv/csv.h>
78

@@ -82,5 +83,8 @@ class TCsvParser {
8283
) const;
8384
};
8485

86+
// Checks if a string value can be converted to a YDB type
87+
bool IsConvertibleToYdbValue(const TString& value, const Ydb::Type& type);
88+
8589
}
8690
}

ydb/public/lib/ydb_cli/common/print_utils.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,5 +138,13 @@ int PrintProtoJsonBase64(const google::protobuf::Message& msg) {
138138
return EXIT_SUCCESS;
139139
}
140140

141+
// Returns the platform-specific handle for the process's standard input:
// GetStdHandle(STD_INPUT_HANDLE) when _win32_ is defined, STDIN_FILENO when
// _unix_ is defined.
// NOTE(review): there is no #else branch — if neither macro is defined the
// function has no return statement. Confirm that one of them is always set.
FHANDLE GetStdinFileno() {
142+
#if defined(_win32_)
143+
return GetStdHandle(STD_INPUT_HANDLE);
144+
#elif defined(_unix_)
145+
return STDIN_FILENO;
146+
#endif
147+
}
148+
141149
}
142150
}

ydb/public/lib/ydb_cli/common/print_utils.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
#include <ydb/public/sdk/cpp/include/ydb-cpp-sdk/client/scheme/scheme.h>
44
#include <library/cpp/colorizer/colors.h>
5+
#include <util/system/file.h>
56

67
namespace NYdb {
78
namespace NConsoleClient {
@@ -12,8 +13,8 @@ TString FormatDuration(TDuration duration);
1213
TString PrettySize(ui64 size);
1314
TString PrettyNumber(ui64 number);
1415
TString EntryTypeToString(NScheme::ESchemeEntryType entry);
15-
1616
int PrintProtoJsonBase64(const google::protobuf::Message& msg);
17+
FHANDLE GetStdinFileno();
1718

1819
}
1920
}

ydb/public/lib/ydb_cli/import/import.cpp

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
#include <ydb/public/lib/ydb_cli/common/recursive_list.h>
2121
#include <ydb/public/lib/ydb_cli/common/interactive.h>
2222
#include <ydb/public/lib/ydb_cli/common/progress_bar.h>
23+
#include <ydb/public/lib/ydb_cli/common/print_utils.h>
2324
#include <ydb/public/lib/ydb_cli/commands/ydb_common.h>
2425
#include <ydb/public/lib/ydb_cli/dump/util/util.h>
2526
#include <ydb/public/lib/ydb_cli/import/cli_arrow_helpers.h>
@@ -163,14 +164,6 @@ void InitCsvParser(TCsvParser& parser,
163164
}
164165
}
165166

166-
FHANDLE GetStdinFileno() {
167-
#if defined(_win32_)
168-
return GetStdHandle(STD_INPUT_HANDLE);
169-
#elif defined(_unix_)
170-
return STDIN_FILENO;
171-
#endif
172-
}
173-
174167
class TMaxInflightGetter {
175168
public:
176169
TMaxInflightGetter(ui64 totalMaxInFlight, std::atomic<ui64>& currentFileCount)

0 commit comments

Comments
 (0)