@@ -19,7 +19,9 @@ namespace {
19
19
20
20
TString LogPrefix = " JsonParser: " ;
21
21
22
+ constexpr ui64 DEFAULT_BATCH_SIZE = 1_MB;
22
23
constexpr ui64 DEFAULT_STATIC_BUFFER_SIZE = 1000000 ;
24
+ constexpr TDuration DEFAULT_BATCH_CREATION_TIMEOUT = TDuration::Seconds(1 );
23
25
24
26
struct TJsonParserBuffer {
25
27
size_t NumberValues = 0 ;
@@ -40,20 +42,11 @@ struct TJsonParserBuffer {
40
42
Offsets.reserve (numberValues);
41
43
}
42
44
43
- void AddMessages (const TVector< NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage>& messages ) {
45
+ void AddMessage (const NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage& message ) {
44
46
Y_ENSURE (!Finished, " Cannot add messages into finished buffer" );
45
-
46
- size_t messagesSize = 0 ;
47
- for (const auto & message : messages) {
48
- messagesSize += message.GetData ().size ();
49
- }
50
-
51
- NumberValues += messages.size ();
52
- Reserve (Values.size () + messagesSize, NumberValues);
53
- for (const auto & message : messages) {
54
- Values << message.GetData ();
55
- Offsets.emplace_back (message.GetOffset ());
56
- }
47
+ NumberValues++;
48
+ Values << message.GetData ();
49
+ Offsets.emplace_back (message.GetOffset ());
57
50
}
58
51
59
52
std::pair<const char *, size_t > Finish () {
@@ -102,8 +95,8 @@ class TColumnParser {
102
95
}
103
96
104
97
void ParseJsonValue (ui64 rowId, simdjson::builtin::ondemand::value jsonValue, NYql::NUdf::TUnboxedValue& resultValue) {
105
- Parser (jsonValue, resultValue);
106
98
ParsedRows.emplace_back (rowId);
99
+ Parser (jsonValue, resultValue);
107
100
}
108
101
109
102
void ValidateNumberValues (size_t expectedNumberValues, ui64 firstOffset) const {
@@ -280,9 +273,9 @@ class TJsonParser::TImpl {
280
273
TImpl (const TVector<TString>& columns, const TVector<TString>& types, TCallback parseCallback, ui64 batchSize, TDuration batchCreationTimeout, ui64 staticBufferSize)
281
274
: Alloc(__LOCATION__, NKikimr::TAlignedPagePoolCounters(), true , false )
282
275
, TypeEnv(std::make_unique<NKikimr::NMiniKQL::TTypeEnvironment>(Alloc))
283
- , BatchSize(batchSize)
276
+ , BatchSize(batchSize ? batchSize : DEFAULT_BATCH_SIZE )
284
277
, MaxNumberRows(((staticBufferSize ? staticBufferSize : DEFAULT_STATIC_BUFFER_SIZE) - 1 ) / columns.size() + 1 )
285
- , BatchCreationTimeout(batchCreationTimeout)
278
+ , BatchCreationTimeout(batchCreationTimeout ? batchCreationTimeout : DEFAULT_BATCH_CREATION_TIMEOUT )
286
279
, ParseCallback(parseCallback)
287
280
, ParsedValues(columns.size())
288
281
{
@@ -330,14 +323,13 @@ class TJsonParser::TImpl {
330
323
}
331
324
332
325
void AddMessages (const TVector<NYdb::NTopic::TReadSessionEvent::TDataReceivedEvent::TMessage>& messages) {
333
- if (messages. empty ()) {
334
- return ;
335
- }
336
-
337
- if (Buffer. Finished ) {
338
- Buffer. Clear ();
326
+ Y_ENSURE (!Buffer. Finished , " Cannot add messages into finished buffer " );
327
+ for ( const auto & message : messages) {
328
+ Buffer. AddMessage (message);
329
+ if ( IsReady ()) {
330
+ Parse ();
331
+ }
339
332
}
340
- Buffer.AddMessages (messages);
341
333
}
342
334
343
335
void Parse () {
@@ -347,13 +339,18 @@ class TJsonParser::TImpl {
347
339
LOG_ROW_DISPATCHER_TRACE (" Parse values:\n " << values);
348
340
349
341
with_lock (Alloc) {
350
- const ui64 firstOffset = Buffer.Offsets .front ();
342
+ Y_DEFER {
343
+ // Clear all UV in case of exception
344
+ ClearColumns ();
345
+ Buffer.Clear ();
346
+ };
347
+
351
348
size_t rowId = 0 ;
352
349
size_t parsedRows = 0 ;
353
350
simdjson::ondemand::document_stream documents = Parser.iterate_many (values, size, simdjson::ondemand::DEFAULT_BATCH_SIZE);
354
351
for (auto document : documents) {
355
352
if (Y_UNLIKELY (parsedRows >= Buffer.NumberValues )) {
356
- throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << parsedRows + 1 ;
353
+ throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << Buffer. Offsets . front () << " but got " << parsedRows + 1 ;
357
354
}
358
355
for (auto item : document.get_object ()) {
359
356
const auto it = ColumnsIndex.find (item.escaped_key ().value ());
@@ -372,18 +369,17 @@ class TJsonParser::TImpl {
372
369
373
370
rowId++;
374
371
parsedRows++;
375
-
376
372
if (rowId == MaxNumberRows) {
377
- ClearColumns (parsedRows, MaxNumberRows);
373
+ FlushColumns (parsedRows, MaxNumberRows);
378
374
rowId = 0 ;
379
375
}
380
376
}
381
377
382
- if (parsedRows != Buffer.NumberValues ) {
383
- throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << firstOffset << " but got " << rowId;
378
+ if (Y_UNLIKELY ( parsedRows != Buffer.NumberValues ) ) {
379
+ throw yexception () << " Failed to parse json messages, expected " << Buffer.NumberValues << " json rows from offset " << Buffer. Offsets . front () << " but got " << rowId;
384
380
}
385
381
if (rowId) {
386
- ClearColumns (parsedRows, rowId);
382
+ FlushColumns (parsedRows, rowId);
387
383
}
388
384
}
389
385
}
@@ -406,7 +402,7 @@ class TJsonParser::TImpl {
406
402
}
407
403
408
404
private:
409
- void ClearColumns (size_t parsedRows, size_t savedRows) {
405
+ void FlushColumns (size_t parsedRows, size_t savedRows) {
410
406
const ui64 firstOffset = Buffer.Offsets .front ();
411
407
for (const auto & column : Columns) {
412
408
column.ValidateNumberValues (savedRows, firstOffset);
@@ -417,6 +413,10 @@ class TJsonParser::TImpl {
417
413
ParseCallback (parsedRows - savedRows, savedRows, ParsedValues);
418
414
}
419
415
416
+ ClearColumns ();
417
+ }
418
+
419
+ void ClearColumns () {
420
420
for (size_t i = 0 ; i < Columns.size (); ++i) {
421
421
auto & parsedColumn = ParsedValues[i];
422
422
for (size_t rowId : Columns[i].ParsedRows ) {
0 commit comments