@@ -99,22 +99,22 @@ TConclusion<std::shared_ptr<TDataContainer>> AdaptColumnsImpl(
99
99
}
100
100
101
101
template <class TDataContainer , class TStringContainer >
102
- std::shared_ptr<TDataContainer> ExtractImpl (const TColumnOperator::EExtractProblemsPolicy & policy,
102
+ std::shared_ptr<TDataContainer> ExtractImpl (const TColumnOperator::EAbsentFieldPolicy & policy,
103
103
const std::shared_ptr<TDataContainer>& incoming, const std::vector<TStringContainer>& columnNames) {
104
104
AFL_VERIFY (incoming);
105
105
AFL_VERIFY (columnNames.size ());
106
106
auto result = ExtractColumnsValidateImpl (incoming, columnNames);
107
107
switch (policy) {
108
- case TColumnOperator::EExtractProblemsPolicy ::Verify:
108
+ case TColumnOperator::EAbsentFieldPolicy ::Verify:
109
109
AFL_VERIFY ((ui32)result->num_columns () == columnNames.size ())(" schema" , incoming->schema ()->ToString ())(
110
110
" required" , TColumnNameAccessor<TStringContainer>::DebugString (columnNames));
111
111
break ;
112
- case TColumnOperator::EExtractProblemsPolicy::Null :
112
+ case TColumnOperator::EAbsentFieldPolicy::Error :
113
113
if ((ui32)result->num_columns () != columnNames.size ()) {
114
114
return nullptr ;
115
115
}
116
116
break ;
117
- case TColumnOperator::EExtractProblemsPolicy ::Skip:
117
+ case TColumnOperator::EAbsentFieldPolicy ::Skip:
118
118
break ;
119
119
}
120
120
return result;
@@ -211,8 +211,8 @@ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder(
211
211
}
212
212
namespace {
213
213
template <class TDataContainer , class TSchemaImpl >
214
- TConclusion<TSchemaSubset> BuildSequentialSubsetImpl (
215
- const std::shared_ptr<TDataContainer >& srcBatch , const std::shared_ptr<TSchemaImpl>& dstSchema ) {
214
+ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl (const std::shared_ptr<TDataContainer>& srcBatch,
215
+ const std::shared_ptr<TSchemaImpl >& dstSchema , const TColumnOperator::ECheckFieldTypesPolicy checkFieldTypesPolicy ) {
216
216
AFL_VERIFY (srcBatch);
217
217
AFL_VERIFY (dstSchema);
218
218
if (dstSchema->num_fields () < srcBatch->schema ()->num_fields ()) {
@@ -228,10 +228,20 @@ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl(
228
228
++itDst;
229
229
} else {
230
230
fieldIdx.emplace (itDst - dstSchema->fields ().begin ());
231
- if (!(*itDst)->Equals (*itSrc)) {
232
- AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
233
- " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
234
- return TConclusionStatus::Fail (" incompatible column types" );
231
+ if (checkFieldTypesPolicy != TColumnOperator::ECheckFieldTypesPolicy::Ignore && (*itDst)->Equals (*itSrc)) {
232
+ switch (checkFieldTypesPolicy) {
233
+ case TColumnOperator::ECheckFieldTypesPolicy::Error: {
234
+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
235
+ " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
236
+ return TConclusionStatus::Fail (" incompatible column types" );
237
+ }
238
+ case TColumnOperator::ECheckFieldTypesPolicy::Verify: {
239
+ AFL_VERIFY (false )(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
240
+ " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
241
+ }
242
+ case TColumnOperator::ECheckFieldTypesPolicy::Ignore:
243
+ AFL_VERIFY (false );
244
+ }
235
245
}
236
246
237
247
++itDst;
@@ -249,7 +259,82 @@ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl(
249
259
250
260
// RecordBatch / TSchemaLite entry point: delegates to BuildSequentialSubsetImpl and passes
// this operator's DifferentColumnTypesPolicy as the field-type check policy.
TConclusion<TSchemaSubset> TColumnOperator::BuildSequentialSubset(
    const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<NArrow::TSchemaLite>& dstSchema) {
    return BuildSequentialSubsetImpl<arrow::RecordBatch, NArrow::TSchemaLite>(
        incoming, dstSchema, DifferentColumnTypesPolicy);
}
264
+ namespace {
265
+ template <class TDataContainer >
266
+ TConclusion<std::shared_ptr<TDataContainer>> AdaptIncomingToDestinationExtImpl (const std::shared_ptr<TDataContainer>& incoming,
267
+ const std::shared_ptr<TSchemaLite>& dstSchema, const std::function<TConclusionStatus(const ui32, const i32 )>& checker,
268
+ const std::function<i32(const std::string&)>& nameResolver,
269
+ const TColumnOperator::ECheckFieldTypesPolicy differentColumnTypesPolicy,
270
+ const TColumnOperator::EAbsentFieldPolicy absentColumnPolicy) {
271
+ struct TFieldData {
272
+ ui32 Index;
273
+ std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn> Column;
274
+ bool operator <(const TFieldData& item) const {
275
+ return Index < item.Index ;
276
+ }
277
+ };
278
+ AFL_VERIFY (incoming);
279
+ AFL_VERIFY (dstSchema);
280
+ std::vector<TFieldData> resultColumns;
281
+ resultColumns.reserve (incoming->num_columns ());
282
+ ui32 idx = 0 ;
283
+ for (auto & srcField : incoming->schema ()->fields ()) {
284
+ const int dstIndex = nameResolver (srcField->name ());
285
+ if (dstIndex > -1 ) {
286
+ const auto & dstField = dstSchema->GetFieldByIndexVerified (dstIndex);
287
+ switch (differentColumnTypesPolicy) {
288
+ case TColumnOperator::ECheckFieldTypesPolicy::Verify:
289
+ AFL_VERIFY (dstField->Equals (srcField))(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
290
+ " dst_column" , dstField->ToString (true ))(" src_column" , srcField->ToString (true ));
291
+ break ;
292
+ case TColumnOperator::ECheckFieldTypesPolicy::Error:
293
+ if (!dstField->Equals (srcField)) {
294
+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
295
+ " dst_column" , dstField->ToString (true ))(" src_column" , srcField->ToString (true ));
296
+ return TConclusionStatus::Fail (" incompatible column types for '" + dstField->name () + " '" );
297
+ }
298
+ break ;
299
+ case TColumnOperator::ECheckFieldTypesPolicy::Ignore:
300
+ break ;
301
+ }
302
+ auto resultCheck = checker (idx, dstIndex);
303
+ if (resultCheck.IsFail ()) {
304
+ return resultCheck;
305
+ }
306
+ resultColumns.emplace_back (TFieldData{ .Index = (ui32)dstIndex, .Column = incoming->column (idx) });
307
+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Skip) {
308
+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Verify) {
309
+ AFL_VERIFY (false )(" event" , " cannot_use_incoming_batch" )(" reason" , " absent_field" )(" dst_column" , srcField->ToString (true ));
310
+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Error) {
311
+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " absent_field" )(
312
+ " dst_column" , srcField->ToString (true ));
313
+ return TConclusionStatus::Fail (" not found column '" + srcField->name () + " '" );
314
+ } else {
315
+ AFL_VERIFY (false );
316
+ }
317
+ ++idx;
318
+ }
319
+ if (resultColumns.empty ()) {
320
+ return TConclusionStatus::Fail (" not found any column" );
321
+ }
322
+ std::sort (resultColumns.begin (), resultColumns.end ());
323
+ std::vector<std::shared_ptr<arrow::Field>> fields;
324
+ std::vector<std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn>> columns;
325
+ columns.reserve (resultColumns.size ());
326
+ fields.reserve (resultColumns.size ());
327
+ for (auto && i : resultColumns) {
328
+ fields.emplace_back (dstSchema->field (i.Index ));
329
+ columns.emplace_back (i.Column );
330
+ }
331
+ return NAdapter::TDataBuilderPolicy<TDataContainer>::Build (std::make_shared<arrow::Schema>(fields), std::move (columns), incoming->num_rows ());
332
+ }
333
+ } // namespace
334
+ TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::AdaptIncomingToDestinationExt (
335
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<TSchemaLite>& dstSchema,
336
+ const std::function<TConclusionStatus(const ui32, const i32 )>& checker, const std::function<i32(const std::string&)>& nameResolver) const {
337
+ return AdaptIncomingToDestinationExtImpl (incoming, dstSchema, checker, nameResolver, DifferentColumnTypesPolicy, AbsentColumnPolicy);
253
338
}
254
339
255
340
} // namespace NKikimr::NArrow
0 commit comments