@@ -59,17 +59,14 @@ TSparsedArray::TSparsedArray(const IChunkedArray& defaultArray, const std::share
59
59
pos = current->GetAddress ().GetGlobalFinishPosition ();
60
60
AFL_VERIFY (pos <= GetRecordsCount ());
61
61
}
62
- std::vector<std::shared_ptr<arrow::Field>> fields = { std::make_shared<arrow::Field>(" index" , arrow::uint32 ()),
63
- std::make_shared<arrow::Field>(" value" , GetDataType ()) };
64
- auto schema = std::make_shared<arrow::Schema>(fields);
65
62
std::vector<std::shared_ptr<arrow::Array>> columns = { NArrow::TStatusValidator::GetValid (builderIndex->Finish ()),
66
63
NArrow::TStatusValidator::GetValid (builderValue->Finish ()) };
67
- records = arrow::RecordBatch::Make (schema , sparsedRecordsCount, columns);
64
+ records = arrow::RecordBatch::Make (BuildSchema ( GetDataType ()) , sparsedRecordsCount, columns);
68
65
AFL_VERIFY_DEBUG (records->ValidateFull ().ok ());
69
66
return true ;
70
67
}));
71
68
AFL_VERIFY (records);
72
- Records.emplace_back (TSparsedArrayChunk ( 0 , GetRecordsCount (), records, DefaultValue) );
69
+ Records.emplace_back (0 , GetRecordsCount (), records, DefaultValue);
73
70
}
74
71
75
72
std::vector<NKikimr::NArrow::NAccessor::TChunkedArraySerialized> TSparsedArray::DoSplitBySizes (
@@ -136,27 +133,44 @@ ui32 TSparsedArray::GetLastIndex(const std::shared_ptr<arrow::RecordBatch>& batc
136
133
return ui32Column->Value (ui32Column->length () - 1 );
137
134
}
138
135
136
+ namespace {
137
+ static thread_local THashMap<TString, std::shared_ptr<arrow::RecordBatch>> SimpleBatchesCache;
138
+ }
139
+
140
+ NKikimr::NArrow::NAccessor::TSparsedArrayChunk TSparsedArray::MakeDefaultChunk (
141
+ const std::shared_ptr<arrow::Scalar>& defaultValue, const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount) {
142
+ auto it = SimpleBatchesCache.find (type->ToString ());
143
+ if (it == SimpleBatchesCache.end ()) {
144
+ it = SimpleBatchesCache.emplace (type->ToString (), NArrow::MakeEmptyBatch (BuildSchema (type))).first ;
145
+ AFL_VERIFY (it->second ->ValidateFull ().ok ());
146
+ }
147
+ return TSparsedArrayChunk (0 , recordsCount, it->second , defaultValue);
148
+ }
149
+
139
150
// Resolves `position` to the contiguous range of this sparse chunk that contains it and returns
// that range as a local data address.
//
// RemapExternalToInternal is searched with std::upper_bound by each entry's external start, so it
// must be ordered by GetStartExt(). The range containing `position` is the entry immediately
// before the first one whose external start is greater than `position`.
//
// @param position  position to resolve; compared against GetStartExt() of the remap entries
//                  (presumably relative to the chunk start - confirm against callers)
// @param chunkIdx  chunk index forwarded unchanged into the returned address
// @return          for a "default" range: an array obtained from TThreadSimpleArraysCache for
//                  DefaultValue of the range size; otherwise the matching slice of ColValue.
//                  In both cases the address start is StartPosition + the range's external start.
//
// Aborts via AFL_VERIFY if no entry starts at or before `position`.
IChunkedArray::TLocalDataAddress TSparsedArrayChunk::GetChunk(
    const std::optional<IChunkedArray::TCommonChunkAddress>& /*chunkCurrent*/, const ui64 position, const ui32 chunkIdx) const {
    // The searched value keeps the full ui64 width of `position`: taking it as ui32 would silently
    // truncate large positions and could make the binary search land on a wrong range.
    const auto startsAfter = [](const ui64 soughtPosition, const TInternalChunkInfo& item) {
        return soughtPosition < item.GetStartExt();
    };
    auto it = std::upper_bound(RemapExternalToInternal.begin(), RemapExternalToInternal.end(), position, startsAfter);
    AFL_VERIFY(it != RemapExternalToInternal.begin());
    // Step back to the last entry whose external start is <= position.
    --it;
    if (it->GetIsDefault()) {
        return IChunkedArray::TLocalDataAddress(
            NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, it->GetSize()), StartPosition + it->GetStartExt(), chunkIdx);
    } else {
        return IChunkedArray::TLocalDataAddress(
            ColValue->Slice(it->GetStartInt(), it->GetSize()), StartPosition + it->GetStartExt(), chunkIdx);
    }
}
152
166
153
167
std::vector<std::shared_ptr<arrow::Array>> TSparsedArrayChunk::GetChunkedArray () const {
154
168
std::vector<std::shared_ptr<arrow::Array>> chunks;
155
169
for (auto && i : RemapExternalToInternal) {
156
- if (i.second . GetIsDefault ()) {
157
- chunks.emplace_back (NArrow::TThreadSimpleArraysCache::Get (ColValue->type (), DefaultValue, i.second . GetSize ()));
170
+ if (i.GetIsDefault ()) {
171
+ chunks.emplace_back (NArrow::TThreadSimpleArraysCache::Get (ColValue->type (), DefaultValue, i.GetSize ()));
158
172
} else {
159
- chunks.emplace_back (ColValue->Slice (i.second . GetStart (), i. second .GetSize ()));
173
+ chunks.emplace_back (ColValue->Slice (i.GetStartInt (), i.GetSize ()));
160
174
}
161
175
}
162
176
return chunks;
@@ -189,23 +203,26 @@ TSparsedArrayChunk::TSparsedArrayChunk(const ui32 posStart, const ui32 recordsCo
189
203
for (ui32 idx = 0 ; idx < UI32ColIndex->length (); ++idx) {
190
204
if (nextIndex != UI32ColIndex->Value (idx)) {
191
205
if (idx - startIndexInt) {
192
- AFL_VERIFY ( RemapExternalToInternal.emplace (startIndexExt, TInternalChunkInfo ( startIndexInt, idx - startIndexInt, false )). second );
206
+ RemapExternalToInternal.emplace_back (startIndexExt, startIndexInt, idx - startIndexInt, false );
193
207
}
194
- AFL_VERIFY ( RemapExternalToInternal.emplace (nextIndex, TInternalChunkInfo ( 0 , UI32ColIndex->Value (idx) - nextIndex, true )). second );
208
+ RemapExternalToInternal.emplace_back (nextIndex, 0 , UI32ColIndex->Value (idx) - nextIndex, true );
195
209
startIndexExt = UI32ColIndex->Value (idx);
196
210
startIndexInt = idx;
197
211
}
198
212
nextIndex = UI32ColIndex->Value (idx) + 1 ;
199
213
}
200
214
if (UI32ColIndex->length () > startIndexInt) {
201
- AFL_VERIFY ( RemapExternalToInternal.emplace (startIndexExt, TInternalChunkInfo ( startIndexInt, UI32ColIndex->length () - startIndexInt, false )). second );
215
+ RemapExternalToInternal.emplace_back (startIndexExt, startIndexInt, UI32ColIndex->length () - startIndexInt, false );
202
216
}
203
217
if (nextIndex != RecordsCount) {
204
- AFL_VERIFY ( RemapExternalToInternal.emplace (nextIndex, TInternalChunkInfo ( 0 , RecordsCount - nextIndex, true )). second );
218
+ RemapExternalToInternal.emplace_back (nextIndex, 0 , RecordsCount - nextIndex, true );
205
219
}
206
220
ui32 count = 0 ;
207
221
for (auto && i : RemapExternalToInternal) {
208
- count += i.second .GetSize ();
222
+ count += i.GetSize ();
223
+ }
224
+ for (ui32 i = 0 ; i + 1 < RemapExternalToInternal.size (); ++i) {
225
+ AFL_VERIFY (RemapExternalToInternal[i + 1 ].GetStartExt () == RemapExternalToInternal[i].GetStartExt () + RemapExternalToInternal[i].GetSize ());
209
226
}
210
227
AFL_VERIFY (count == RecordsCount)(" count" , count)(" records_count" , RecordsCount);
211
228
AFL_VERIFY (ColValue);
@@ -256,7 +273,7 @@ void TSparsedArray::TBuilder::AddChunk(const ui32 recordsCount, const std::share
256
273
auto * arr = static_cast <const arrow::UInt32Array*>(data->column (0 ).get ());
257
274
AFL_VERIFY (arr->Value (arr->length () - 1 ) < recordsCount)(" val" , arr->Value (arr->length () - 1 ))(" count" , recordsCount);
258
275
}
259
- Chunks.emplace_back (TSparsedArrayChunk ( RecordsCount, recordsCount, data, DefaultValue) );
276
+ Chunks.emplace_back (RecordsCount, recordsCount, data, DefaultValue);
260
277
RecordsCount += recordsCount;
261
278
}
262
279
0 commit comments