1
1
#include " process_columns.h"
2
+
2
3
#include " common/adapter.h"
4
+ #include " modifier/schema.h"
3
5
#include " modifier/subset.h"
4
6
5
7
#include < util/string/join.h>
@@ -8,8 +10,8 @@ namespace NKikimr::NArrow {
8
10
9
11
namespace {
10
12
template <class TDataContainer , class TStringImpl >
11
- std::shared_ptr<TDataContainer> ExtractColumnsValidateImpl (const std::shared_ptr<TDataContainer>& srcBatch,
12
- const std::vector<TStringImpl>& columnNames) {
13
+ std::shared_ptr<TDataContainer> ExtractColumnsValidateImpl (
14
+ const std::shared_ptr<TDataContainer>& srcBatch, const std:: vector<TStringImpl>& columnNames) {
13
15
std::vector<std::shared_ptr<arrow::Field>> fields;
14
16
fields.reserve (columnNames.size ());
15
17
std::vector<std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn>> columns;
@@ -27,9 +29,9 @@ std::shared_ptr<TDataContainer> ExtractColumnsValidateImpl(const std::shared_ptr
27
29
return NAdapter::TDataBuilderPolicy<TDataContainer>::Build (std::move (fields), std::move (columns), srcBatch->num_rows ());
28
30
}
29
31
30
- template <class TDataContainer >
31
- TConclusion<std::shared_ptr<TDataContainer>> AdaptColumnsImpl (const std::shared_ptr<TDataContainer>& srcBatch,
32
- const std::shared_ptr<arrow::Schema >& dstSchema, TSchemaSubset* subset) {
32
+ template <class TDataContainer , class TSchemaImpl >
33
+ TConclusion<std::shared_ptr<TDataContainer>> AdaptColumnsImpl (
34
+ const std::shared_ptr<TDataContainer>& srcBatch, const std::shared_ptr<TSchemaImpl >& dstSchema, TSchemaSubset* subset) {
33
35
AFL_VERIFY (srcBatch);
34
36
AFL_VERIFY (dstSchema);
35
37
std::vector<std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn>> columns;
@@ -48,16 +50,16 @@ TConclusion<std::shared_ptr<TDataContainer>> AdaptColumnsImpl(const std::shared_
48
50
fields.emplace_back (field);
49
51
auto srcField = srcBatch->schema ()->field (index);
50
52
if (field->Equals (srcField)) {
51
- AFL_VERIFY (columns.back ()->type ()->Equals (field->type ()))(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(" column " , field-> name ())
52
- (" column_type" , field->type ()->ToString ())(" incoming_type" , columns.back ()->type ()->ToString ());
53
+ AFL_VERIFY (columns.back ()->type ()->Equals (field->type ()))(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
54
+ " column " , field-> name ()) (" column_type" , field->type ()->ToString ())(" incoming_type" , columns.back ()->type ()->ToString ());
53
55
} else {
54
- AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(" column " , field-> name ())
55
- (" column_type" , field->ToString (true ))(" incoming_type" , srcField->ToString (true ));
56
+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
57
+ " column " , field-> name ()) (" column_type" , field->ToString (true ))(" incoming_type" , srcField->ToString (true ));
56
58
return TConclusionStatus::Fail (" incompatible column types" );
57
59
}
58
60
} else if (!subset) {
59
- AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " not_found_column" )(" column" , field->name ())
60
- ( " column_type" , field->type ()->ToString ())(" columns" , JoinSeq (" ," , srcBatch->schema ()->field_names ()));
61
+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " not_found_column" )(" column" , field->name ())(
62
+ " column_type" , field->type ()->ToString ())(" columns" , JoinSeq (" ," , srcBatch->schema ()->field_names ()));
61
63
return TConclusionStatus::Fail (" not found column '" + field->name () + " '" );
62
64
}
63
65
++idx;
@@ -76,7 +78,8 @@ std::shared_ptr<TDataContainer> ExtractImpl(const TColumnOperator::EExtractProbl
76
78
auto result = ExtractColumnsValidateImpl (incoming, columnNames);
77
79
switch (policy) {
78
80
case TColumnOperator::EExtractProblemsPolicy::Verify:
79
- AFL_VERIFY ((ui32)result->num_columns () == columnNames.size ())(" schema" , incoming->schema ()->ToString ())(" required" , JoinSeq (" ," , columnNames));
81
+ AFL_VERIFY ((ui32)result->num_columns () == columnNames.size ())(" schema" , incoming->schema ()->ToString ())(
82
+ " required" , JoinSeq (" ," , columnNames));
80
83
break ;
81
84
case TColumnOperator::EExtractProblemsPolicy::Null:
82
85
if ((ui32)result->num_columns () != columnNames.size ()) {
@@ -90,7 +93,8 @@ std::shared_ptr<TDataContainer> ExtractImpl(const TColumnOperator::EExtractProbl
90
93
}
91
94
92
95
template <class TDataContainer , class TStringType >
93
- TConclusion<std::shared_ptr<TDataContainer>> ReorderImpl (const std::shared_ptr<TDataContainer>& incoming, const std::vector<TStringType>& columnNames) {
96
+ TConclusion<std::shared_ptr<TDataContainer>> ReorderImpl (
97
+ const std::shared_ptr<TDataContainer>& incoming, const std::vector<TStringType>& columnNames) {
94
98
AFL_VERIFY (!!incoming);
95
99
AFL_VERIFY (columnNames.size ());
96
100
if ((ui32)incoming->num_columns () < columnNames.size ()) {
@@ -107,46 +111,65 @@ TConclusion<std::shared_ptr<TDataContainer>> ReorderImpl(const std::shared_ptr<T
107
111
return result;
108
112
}
109
113
110
- }
114
+ } // namespace
111
115
112
- std::shared_ptr<arrow::RecordBatch> TColumnOperator::Extract (const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<std::string>& columnNames) {
116
+ std::shared_ptr<arrow::RecordBatch> TColumnOperator::Extract (
117
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<std::string>& columnNames) {
113
118
return ExtractImpl (AbsentColumnPolicy, incoming, columnNames);
114
119
}
115
120
116
- std::shared_ptr<arrow::Table> TColumnOperator::Extract (const std::shared_ptr<arrow::Table>& incoming, const std::vector<std::string>& columnNames) {
121
+ std::shared_ptr<arrow::Table> TColumnOperator::Extract (
122
+ const std::shared_ptr<arrow::Table>& incoming, const std::vector<std::string>& columnNames) {
117
123
return ExtractImpl (AbsentColumnPolicy, incoming, columnNames);
118
124
}
119
125
120
- std::shared_ptr<arrow::RecordBatch> TColumnOperator::Extract (const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<TString>& columnNames) {
126
+ std::shared_ptr<arrow::RecordBatch> TColumnOperator::Extract (
127
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<TString>& columnNames) {
121
128
return ExtractImpl (AbsentColumnPolicy, incoming, columnNames);
122
129
}
123
130
124
131
// TString flavour of Extract for tables.
// Forwards to ExtractImpl with this operator's AbsentColumnPolicy; the result of
// that call is returned unchanged.
std::shared_ptr<arrow::Table> TColumnOperator::Extract(
    const std::shared_ptr<arrow::Table>& incoming, const std::vector<TString>& columnNames)
{
    auto extracted = ExtractImpl(AbsentColumnPolicy, incoming, columnNames);
    return extracted;
}
127
134
128
- NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Adapt (const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<arrow::Schema>& dstSchema, TSchemaSubset* subset) {
135
+ NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Adapt (
136
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<arrow::Schema>& dstSchema, TSchemaSubset* subset) {
137
+ return AdaptColumnsImpl (incoming, dstSchema, subset);
138
+ }
139
+
140
+ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Adapt (
141
+ const std::shared_ptr<arrow::Table>& incoming, const std::shared_ptr<arrow::Schema>& dstSchema, TSchemaSubset* subset) {
142
+ return AdaptColumnsImpl (incoming, dstSchema, subset);
143
+ }
144
+
145
+ NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Adapt (
146
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<NArrow::TSchemaLite>& dstSchema, TSchemaSubset* subset) {
129
147
return AdaptColumnsImpl (incoming, dstSchema, subset);
130
148
}
131
149
132
- NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Adapt (const std::shared_ptr<arrow::Table>& incoming, const std::shared_ptr<arrow::Schema>& dstSchema, TSchemaSubset* subset) {
150
+ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Adapt (
151
+ const std::shared_ptr<arrow::Table>& incoming, const std::shared_ptr<NArrow::TSchemaLite>& dstSchema, TSchemaSubset* subset) {
133
152
return AdaptColumnsImpl (incoming, dstSchema, subset);
134
153
}
135
154
136
- NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Reorder (const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<std::string>& columnNames) {
155
+ NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Reorder (
156
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<std::string>& columnNames) {
137
157
return ReorderImpl (incoming, columnNames);
138
158
}
139
159
140
- NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder (const std::shared_ptr<arrow::Table>& incoming, const std::vector<std::string>& columnNames) {
160
+ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder (
161
+ const std::shared_ptr<arrow::Table>& incoming, const std::vector<std::string>& columnNames) {
141
162
return ReorderImpl (incoming, columnNames);
142
163
}
143
164
144
- NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Reorder (const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<TString>& columnNames) {
165
+ NKikimr::TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::Reorder (
166
+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<TString>& columnNames) {
145
167
return ReorderImpl (incoming, columnNames);
146
168
}
147
169
148
- NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder (const std::shared_ptr<arrow::Table>& incoming, const std::vector<TString>& columnNames) {
170
+ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder (
171
+ const std::shared_ptr<arrow::Table>& incoming, const std::vector<TString>& columnNames) {
149
172
return ReorderImpl (incoming, columnNames);
150
173
}
151
174
152
- }
175
+ } // namespace NKikimr::NArrow
0 commit comments