3
3
#include < arrow/table.h>
4
4
#include < arrow/csv/options.h>
5
5
#include < arrow/csv/reader.h>
6
+ #include < arrow/json/options.h>
7
+ #include < arrow/json/reader.h>
6
8
#include < parquet/arrow/reader.h>
7
9
8
10
#include < ydb/core/external_sources/object_storage/events.h>
@@ -182,6 +184,10 @@ struct CsvConfig : public FormatConfig {
182
184
arrow::csv::ConvertOptions ConvOpts = arrow::csv::ConvertOptions::Defaults();
183
185
};
184
186
187
+ struct JsonConfig : public FormatConfig {
188
+ arrow::json::ParseOptions ParseOpts = arrow::json::ParseOptions::Defaults();
189
+ };
190
+
185
191
using TsvConfig = CsvConfig;
186
192
187
193
namespace {
@@ -190,23 +196,30 @@ using ArrowField = std::shared_ptr<arrow::Field>;
190
196
using ArrowFields = std::vector<ArrowField>;
191
197
192
198
std::variant<ArrowFields, TString> InferCsvTypes (std::shared_ptr<arrow::io::RandomAccessFile> file, const CsvConfig& config) {
199
+ int64_t fileSize;
200
+ if (auto sizeStatus = file->GetSize ().Value (&fileSize); !sizeStatus.ok ()) {
201
+ return TStringBuilder{} << " coudn't get file size: " << sizeStatus.ToString ();
202
+ }
203
+
193
204
std::shared_ptr<arrow::csv::TableReader> reader;
194
- auto fileSize = static_cast <int32_t >(file->GetSize ().ValueOr (1 << 20 ));
195
- fileSize = std::min (fileSize, 1 << 20 );
196
205
auto readerStatus = arrow::csv::TableReader::Make (
197
- arrow::io::default_io_context (), std::move (file), arrow::csv::ReadOptions{.use_threads = false , .block_size = fileSize}, config.ParseOpts , config.ConvOpts
206
+ arrow::io::default_io_context (),
207
+ std::move (file),
208
+ arrow::csv::ReadOptions{.use_threads = false , .block_size = static_cast <int32_t >(fileSize)},
209
+ config.ParseOpts ,
210
+ config.ConvOpts
198
211
)
199
212
.Value (&reader);
200
213
201
214
if (!readerStatus.ok ()) {
202
- return TString{TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << readerStatus.ToString ()};
215
+ return TString{TStringBuilder{} << " couldn't open csv/tsv file, check format and compression params: " << readerStatus.ToString ()};
203
216
}
204
217
205
218
std::shared_ptr<arrow::Table> table;
206
219
auto tableRes = reader->Read ().Value (&table);
207
220
208
221
if (!tableRes.ok ()) {
209
- return TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << readerStatus .ToString ();
222
+ return TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << tableRes .ToString ();
210
223
}
211
224
212
225
return table->fields ();
@@ -217,24 +230,52 @@ std::variant<ArrowFields, TString> InferParquetTypes(std::shared_ptr<arrow::io::
217
230
builder.properties (parquet::ArrowReaderProperties (false ));
218
231
auto openStatus = builder.Open (std::move (file));
219
232
if (!openStatus.ok ()) {
220
- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus.ToString ();
233
+ return TStringBuilder{} << " couldn't open parquet file, check format params: " << openStatus.ToString ();
221
234
}
222
235
223
236
std::unique_ptr<parquet::arrow::FileReader> reader;
224
237
auto readerStatus = builder.Build (&reader);
225
238
if (!readerStatus.ok ()) {
226
- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus .ToString ();
239
+ return TStringBuilder{} << " couldn't read parquet file, check format params: " << readerStatus .ToString ();
227
240
}
228
241
229
242
std::shared_ptr<arrow::Schema> schema;
230
243
auto schemaRes = reader->GetSchema (&schema);
231
244
if (!schemaRes.ok ()) {
232
- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus .ToString ();
245
+ return TStringBuilder{} << " couldn't parse parquet file, check format params: " << schemaRes .ToString ();
233
246
}
234
247
235
248
return schema->fields ();
236
249
}
237
250
251
+ std::variant<ArrowFields, TString> InferJsonTypes (std::shared_ptr<arrow::io::RandomAccessFile> file, const JsonConfig& config) {
252
+ int64_t fileSize;
253
+ if (auto sizeStatus = file->GetSize ().Value (&fileSize); !sizeStatus.ok ()) {
254
+ return TStringBuilder{} << " coudn't get file size: " << sizeStatus.ToString ();
255
+ }
256
+
257
+ std::shared_ptr<arrow::json::TableReader> reader;
258
+ auto readerStatus = arrow::json::TableReader::Make (
259
+ arrow::default_memory_pool (),
260
+ std::move (file),
261
+ arrow::json::ReadOptions{.use_threads = false , .block_size = static_cast <int32_t >(fileSize)},
262
+ config.ParseOpts
263
+ ).Value (&reader);
264
+
265
+ if (!readerStatus.ok ()) {
266
+ return TString{TStringBuilder{} << " couldn't open json file, check format and compression params: " << readerStatus.ToString ()};
267
+ }
268
+
269
+ std::shared_ptr<arrow::Table> table;
270
+ auto tableRes = reader->Read ().Value (&table);
271
+
272
+ if (!tableRes.ok ()) {
273
+ return TString{TStringBuilder{} << " couldn't parse json file, check format and compression params: " << tableRes.ToString ()};
274
+ }
275
+
276
+ return table->fields ();
277
+ }
278
+
238
279
std::variant<ArrowFields, TString> InferType (EFileFormat format, std::shared_ptr<arrow::io::RandomAccessFile> file, const FormatConfig& config) {
239
280
switch (format) {
240
281
case EFileFormat::CsvWithNames:
@@ -243,6 +284,9 @@ std::variant<ArrowFields, TString> InferType(EFileFormat format, std::shared_ptr
243
284
return InferCsvTypes (std::move (file), static_cast <const TsvConfig&>(config));
244
285
case EFileFormat::Parquet:
245
286
return InferParquetTypes (std::move (file));
287
+ case EFileFormat::JsonEachRow:
288
+ case EFileFormat::JsonList:
289
+ return InferJsonTypes (std::move (file), static_cast <const JsonConfig&>(config));
246
290
case EFileFormat::Undefined:
247
291
default :
248
292
return std::variant<ArrowFields, TString>{std::in_place_type_t <TString>{}, TStringBuilder{} << " unexpected format: " << ConvertFileFormat (format)};
@@ -259,12 +303,19 @@ std::unique_ptr<TsvConfig> MakeTsvConfig(const THashMap<TString, TString>& param
259
303
return config;
260
304
}
261
305
306
+ std::unique_ptr<JsonConfig> MakeJsonConfig (const THashMap<TString, TString>&) {
307
+ return std::make_unique<JsonConfig>();
308
+ }
309
+
262
310
std::unique_ptr<FormatConfig> MakeFormatConfig (EFileFormat format, const THashMap<TString, TString>& params) {
263
311
switch (format) {
264
312
case EFileFormat::CsvWithNames:
265
313
return MakeCsvConfig (params);
266
314
case EFileFormat::TsvWithNames:
267
315
return MakeTsvConfig (params);
316
+ case EFileFormat::JsonEachRow:
317
+ case EFileFormat::JsonList:
318
+ return MakeJsonConfig (params);
268
319
case EFileFormat::Undefined:
269
320
default :
270
321
return nullptr ;
0 commit comments