4
4
5
5
#include < yql/essentials/minikql/dom/json.h>
6
6
#include < yql/essentials/minikql/invoke_builtins/mkql_builtins.h>
7
+ #include < yql/essentials/minikql/mkql_function_registry.h>
7
8
#include < yql/essentials/minikql/mkql_node_cast.h>
8
9
#include < yql/essentials/minikql/mkql_program_builder.h>
9
10
#include < yql/essentials/minikql/mkql_string_util.h>
11
+ #include < yql/essentials/minikql/mkql_type_ops.h>
10
12
#include < yql/essentials/providers/common/schema/mkql/yql_mkql_schema.h>
11
13
12
14
#include < library/cpp/containers/absl_flat_hash/flat_hash_map.h>
@@ -17,6 +19,8 @@ namespace {
17
19
18
20
TString LogPrefix = " JsonParser: " ;
19
21
22
+ constexpr ui64 DEFAULT_STATIC_BUFFER_SIZE = 1000000 ;
23
+
20
24
struct TJsonParserBuffer {
21
25
size_t NumberValues = 0 ;
22
26
bool Finished = false ;
@@ -80,31 +84,31 @@ class TColumnParser {
80
84
const TString TypeYson;
81
85
const NKikimr::NMiniKQL::TType* TypeMkql;
82
86
const bool IsOptional = false ;
83
- size_t NumberValues = 0 ;
87
+ TVector< size_t > ParsedRows ;
84
88
85
89
public:
86
- TColumnParser (const TString& name, const TString& typeYson, NKikimr::NMiniKQL::TProgramBuilder& programBuilder)
90
+ TColumnParser (const TString& name, const TString& typeYson, ui64 maxNumberRows, NKikimr::NMiniKQL::TProgramBuilder& programBuilder)
87
91
: Name(name)
88
92
, TypeYson(typeYson)
89
93
, TypeMkql(NYql::NCommon::ParseTypeFromYson(TStringBuf(typeYson), programBuilder, Cerr))
90
94
, IsOptional(TypeMkql->IsOptional ())
91
- , NumberValues(0 )
92
95
{
96
+ ParsedRows.reserve (maxNumberRows);
93
97
try {
94
98
Parser = CreateParser (TypeMkql);
95
99
} catch (...) {
96
100
throw yexception () << " Failed to create parser for column '" << Name << " ' with type " << TypeYson << " , description: " << CurrentExceptionMessage ();
97
101
}
98
102
}
99
103
100
- void ParseJsonValue (simdjson::builtin::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) {
104
+ void ParseJsonValue (ui64 rowId, simdjson::builtin::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) {
101
105
Parser (jsonValue, resultValue);
102
- NumberValues++ ;
106
+ ParsedRows. emplace_back (rowId) ;
103
107
}
104
108
105
109
void ValidateNumberValues (size_t expectedNumberValues, ui64 firstOffset) const {
106
- if (Y_UNLIKELY (!IsOptional && NumberValues < expectedNumberValues)) {
107
- throw yexception () << " Failed to parse json messages, found " << expectedNumberValues - NumberValues << " missing values from offset " << firstOffset << " in non optional column '" << Name << " ' with type " << TypeYson;
110
+ if (Y_UNLIKELY (!IsOptional && ParsedRows. size () < expectedNumberValues)) {
111
+ throw yexception () << " Failed to parse json messages, found " << expectedNumberValues - ParsedRows. size () << " missing values from offset " << firstOffset << " in non optional column '" << Name << " ' with type " << TypeYson;
108
112
}
109
113
}
110
114
@@ -273,11 +277,13 @@ namespace NFq {
273
277
274
278
class TJsonParser ::TImpl {
275
279
public:
276
- TImpl (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout)
280
+ TImpl (const TVector<TString>& columns, const TVector<TString>& types, TCallback parseCallback, ui64 batchSize, TDuration batchCreationTimeout, ui64 staticBufferSize )
277
281
: Alloc(__LOCATION__, NKikimr::TAlignedPagePoolCounters(), true , false )
278
282
, TypeEnv(std::make_unique<NKikimr::NMiniKQL::TTypeEnvironment>(Alloc))
279
283
, BatchSize(batchSize)
284
+ , MaxNumberRows(((staticBufferSize ? staticBufferSize : DEFAULT_STATIC_BUFFER_SIZE) - 1 ) / columns.size() + 1 )
280
285
, BatchCreationTimeout(batchCreationTimeout)
286
+ , ParseCallback(parseCallback)
281
287
, ParsedValues(columns.size())
282
288
{
283
289
Y_ENSURE (columns.size () == types.size (), " Number of columns and types should by equal" );
@@ -288,7 +294,7 @@ class TJsonParser::TImpl {
288
294
289
295
Columns.reserve (columns.size ());
290
296
for (size_t i = 0 ; i < columns.size (); i++) {
291
- Columns.emplace_back (columns[i], types[i], programBuilder);
297
+ Columns.emplace_back (columns[i], types[i], MaxNumberRows, programBuilder);
292
298
}
293
299
}
294
300
@@ -297,7 +303,11 @@ class TJsonParser::TImpl {
297
303
ColumnsIndex.emplace (std::string_view (Columns[i].Name ), i);
298
304
}
299
305
300
- Buffer.Reserve (BatchSize, 1 );
306
+ for (size_t i = 0 ; i < columns.size (); i++) {
307
+ ParsedValues[i].resize (MaxNumberRows);
308
+ }
309
+
310
+ Buffer.Reserve (BatchSize, MaxNumberRows);
301
311
302
312
LOG_ROW_DISPATCHER_INFO (" Simdjson active implementation " << simdjson::get_active_implementation ()->name ());
303
313
Parser.threaded = false ;
@@ -330,21 +340,20 @@ class TJsonParser::TImpl {
330
340
Buffer.AddMessages (messages);
331
341
}
332
342
333
- const TVector<NKikimr::NMiniKQL::TUnboxedValueVector>& Parse () {
343
+ void Parse () {
334
344
Y_ENSURE (Buffer.IsReady (), " Nothing to parse" );
335
345
336
346
const auto [values, size] = Buffer.Finish ();
337
347
LOG_ROW_DISPATCHER_TRACE (" Parse values:\n " << values);
338
348
339
349
with_lock (Alloc) {
340
- ClearColumns (Buffer.NumberValues );
341
-
342
350
const ui64 firstOffset = Buffer.Offsets .front ();
343
351
size_t rowId = 0 ;
352
+ size_t parsedRows = 0 ;
344
353
simdjson::ondemand::document_stream documents = Parser.iterate_many (values, size, simdjson::ondemand::DEFAULT_BATCH_SIZE);
345
354
for (auto document : documents) {
346
- if (Y_UNLIKELY (rowId >= Buffer.NumberValues )) {
347
- throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << rowId + 1 ;
355
+ if (Y_UNLIKELY (parsedRows >= Buffer.NumberValues )) {
356
+ throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << parsedRows + 1 ;
348
357
}
349
358
for (auto item : document.get_object ()) {
350
359
const auto it = ColumnsIndex.find (item.escaped_key ().value ());
@@ -355,23 +364,28 @@ class TJsonParser::TImpl {
355
364
const size_t columnId = it->second ;
356
365
auto & columnParser = Columns[columnId];
357
366
try {
358
- columnParser.ParseJsonValue (item.value (), ParsedValues[columnId][rowId]);
367
+ columnParser.ParseJsonValue (rowId, item.value (), ParsedValues[columnId][rowId]);
359
368
} catch (...) {
360
369
throw yexception () << " Failed to parse json string at offset " << Buffer.Offsets [rowId] << " , got parsing error for column '" << columnParser.Name << " ' with type " << columnParser.TypeYson << " , description: " << CurrentExceptionMessage ();
361
370
}
362
371
}
372
+
363
373
rowId++;
374
+ parsedRows++;
375
+
376
+ if (rowId == MaxNumberRows) {
377
+ ClearColumns (parsedRows, MaxNumberRows);
378
+ rowId = 0 ;
379
+ }
364
380
}
365
381
366
- if (rowId != Buffer.NumberValues ) {
382
+ if (parsedRows != Buffer.NumberValues ) {
367
383
throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << rowId;
368
384
}
369
- for ( const auto & columnDesc : Columns ) {
370
- columnDesc. ValidateNumberValues (rowId, firstOffset );
385
+ if (rowId ) {
386
+ ClearColumns (parsedRows, rowId );
371
387
}
372
388
}
373
-
374
- return ParsedValues;
375
389
}
376
390
377
391
TString GetDescription () const {
@@ -385,26 +399,32 @@ class TJsonParser::TImpl {
385
399
386
400
~TImpl () {
387
401
with_lock (Alloc) {
388
- ClearColumns (0 );
389
402
ParsedValues.clear ();
390
403
Columns.clear ();
391
404
TypeEnv.reset ();
392
405
}
393
406
}
394
407
395
408
private:
396
- void ClearColumns (size_t newSize ) {
397
- const auto clearValue = [&allocState = Alloc. Ref ()](NYql::NUdf::TUnboxedValue& value){
398
- value. UnlockRef ( 1 );
399
- value. Clear ( );
400
- };
409
+ void ClearColumns (size_t parsedRows, size_t savedRows ) {
410
+ const ui64 firstOffset = Buffer. Offsets . front ();
411
+ for ( const auto & column : Columns) {
412
+ column. ValidateNumberValues (savedRows, firstOffset );
413
+ }
401
414
402
- for (size_t i = 0 ; i < Columns.size (); ++i) {
403
- Columns[i].NumberValues = 0 ;
415
+ {
416
+ auto unguard = Unguard (Alloc);
417
+ ParseCallback (parsedRows - savedRows, savedRows, ParsedValues);
418
+ }
404
419
420
+ for (size_t i = 0 ; i < Columns.size (); ++i) {
405
421
auto & parsedColumn = ParsedValues[i];
406
- std::for_each (parsedColumn.begin (), parsedColumn.end (), clearValue);
407
- parsedColumn.resize (newSize);
422
+ for (size_t rowId : Columns[i].ParsedRows ) {
423
+ auto & parsedRow = parsedColumn[rowId];
424
+ parsedRow.UnlockRef (1 );
425
+ parsedRow.Clear ();
426
+ }
427
+ Columns[i].ParsedRows .clear ();
408
428
}
409
429
}
410
430
@@ -413,18 +433,20 @@ class TJsonParser::TImpl {
413
433
std::unique_ptr<NKikimr::NMiniKQL::TTypeEnvironment> TypeEnv;
414
434
415
435
const ui64 BatchSize;
436
+ const ui64 MaxNumberRows;
416
437
const TDuration BatchCreationTimeout;
438
+ const TCallback ParseCallback;
417
439
TVector<TColumnParser> Columns;
418
440
absl::flat_hash_map<std::string_view, size_t > ColumnsIndex;
419
441
420
442
TJsonParserBuffer Buffer;
421
443
simdjson::ondemand::parser Parser;
422
444
423
- TVector<NKikimr::NMiniKQL::TUnboxedValueVector > ParsedValues;
445
+ TVector<TVector<NYql::NUdf::TUnboxedValue> > ParsedValues;
424
446
};
425
447
426
- TJsonParser::TJsonParser (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout)
427
- : Impl(std::make_unique<TJsonParser::TImpl>(columns, types, batchSize, batchCreationTimeout))
448
+ TJsonParser::TJsonParser (const TVector<TString>& columns, const TVector<TString>& types, TCallback parseCallback, ui64 batchSize, TDuration batchCreationTimeout, ui64 staticBufferSize )
449
+ : Impl(std::make_unique<TJsonParser::TImpl>(columns, types, parseCallback, batchSize, batchCreationTimeout, staticBufferSize ))
428
450
{}
429
451
430
452
TJsonParser::~TJsonParser () {
@@ -450,16 +472,16 @@ const TVector<ui64>& TJsonParser::GetOffsets() const {
450
472
return Impl->GetOffsets ();
451
473
}
452
474
453
- const TVector<NKikimr::NMiniKQL::TUnboxedValueVector>& TJsonParser::Parse () {
454
- return Impl->Parse ();
475
+ void TJsonParser::Parse () {
476
+ Impl->Parse ();
455
477
}
456
478
457
479
TString TJsonParser::GetDescription () const {
458
480
return Impl->GetDescription ();
459
481
}
460
482
461
- std::unique_ptr<TJsonParser> NewJsonParser (const TVector<TString>& columns, const TVector<TString>& types, ui64 batchSize, TDuration batchCreationTimeout) {
462
- return std::unique_ptr<TJsonParser>(new TJsonParser (columns, types, batchSize, batchCreationTimeout));
483
+ std::unique_ptr<TJsonParser> NewJsonParser (const TVector<TString>& columns, const TVector<TString>& types, TJsonParser::TCallback parseCallback, ui64 batchSize, TDuration batchCreationTimeout, ui64 staticBufferSize ) {
484
+ return std::unique_ptr<TJsonParser>(new TJsonParser (columns, types, parseCallback, batchSize, batchCreationTimeout, staticBufferSize ));
463
485
}
464
486
465
487
} // namespace NFq
0 commit comments