|
1 | 1 | #include "dq_arrow_helpers.h"
|
2 | 2 |
|
3 | 3 | #include <cstddef>
|
4 |
| -#include <yql/essentials/public/udf/udf_value.h> |
5 |
| -#include <yql/essentials/minikql/defs.h> |
| 4 | +#include <yql/essentials/minikql/computation/mkql_block_trimmer.h> |
6 | 5 | #include <yql/essentials/minikql/computation/mkql_computation_node_holders.h>
|
| 6 | +#include <yql/essentials/minikql/defs.h> |
7 | 7 | #include <yql/essentials/minikql/mkql_node.h>
|
| 8 | +#include <yql/essentials/public/udf/arrow/defs.h> |
| 9 | +#include <yql/essentials/public/udf/arrow/memory_pool.h> |
| 10 | +#include <yql/essentials/public/udf/arrow/util.h> |
| 11 | +#include <yql/essentials/public/udf/udf_value.h> |
8 | 12 |
|
| 13 | +#include <ydb/library/formats/arrow/size_calcer.h> |
9 | 14 | #include <ydb/library/yverify_stream/yverify_stream.h>
|
10 | 15 | #include <ydb/public/lib/scheme_types/scheme_type_id.h>
|
11 | 16 |
|
@@ -942,6 +947,214 @@ std::shared_ptr<arrow::Array> DeserializeArray(const std::string& blob, std::sha
|
942 | 947 | return (*batch)->column(0);
|
943 | 948 | }
|
944 | 949 |
|
| 950 | +// Block splitter |
| 951 | + |
| 952 | +namespace { |
| 953 | + |
| 954 | +class TBlockSplitter : public IBlockSplitter { |
| 955 | + class TItem { |
| 956 | + public: |
| 957 | + TItem(TBlockSplitter& self, const NUdf::TUnboxedValuePod* values) |
| 958 | + : Self(self) |
| 959 | + { |
| 960 | + Data.reserve(Self.Width); |
| 961 | + ArraysIdx.reserve(Self.Width); |
| 962 | + for (ui64 i = 0; i < Self.Width; ++i) { |
| 963 | + auto datum = TBlockSplitter::ExtractDatum(values[i]); |
| 964 | + if (datum.is_scalar()) { |
| 965 | + ScalarsSize += Self.GetDatumMemorySize(i, datum); |
| 966 | + } else { |
| 967 | + ArraysIdx.emplace_back(i); |
| 968 | + } |
| 969 | + Data.emplace_back(std::move(datum)); |
| 970 | + } |
| 971 | + |
| 972 | + NumberRows = Data.back().scalar_as<arrow::UInt64Scalar>().value; |
| 973 | + UpdateArraysSize(); |
| 974 | + } |
| 975 | + |
| 976 | + TItem(TBlockSplitter& self, std::vector<arrow::Datum>&& data, const std::vector<ui64>& arraysIdx, ui64 numberRows, ui64 scalarsSize) |
| 977 | + : Self(self) |
| 978 | + , Data(std::move(data)) |
| 979 | + , ArraysIdx(arraysIdx) |
| 980 | + , NumberRows(numberRows) |
| 981 | + , ScalarsSize(scalarsSize) |
| 982 | + { |
| 983 | + UpdateArraysSize(); |
| 984 | + } |
| 985 | + |
| 986 | + ui64 GetNumberRows() const { |
| 987 | + return NumberRows; |
| 988 | + } |
| 989 | + |
| 990 | + ui64 GetSize() const { |
| 991 | + return ScalarsSize + ArraysSize; |
| 992 | + } |
| 993 | + |
| 994 | + std::vector<arrow::Datum> ExtractData() { |
| 995 | + std::vector<arrow::Datum> result(std::move(Data)); |
| 996 | + for (ui64 i : ArraysIdx) { |
| 997 | + result[i] = Self.GetColumnTrimmer(i).Trim(result[i].array()); |
| 998 | + } |
| 999 | + result.back() = arrow::Datum(std::make_shared<arrow::UInt64Scalar>(NumberRows)); |
| 1000 | + return result; |
| 1001 | + } |
| 1002 | + |
| 1003 | + TItem PopBack(ui64 length) { |
| 1004 | + MKQL_ENSURE(length <= NumberRows, "Can not pop more than number of rows"); |
| 1005 | + std::vector<arrow::Datum> backData = Data; |
| 1006 | + for (ui64 i : ArraysIdx) { |
| 1007 | + auto array = Data[i].array(); |
| 1008 | + Data[i] = NUdf::Chop(array, NumberRows - length); |
| 1009 | + backData[i] = array; |
| 1010 | + } |
| 1011 | + |
| 1012 | + NumberRows -= length; |
| 1013 | + UpdateArraysSize(); |
| 1014 | + |
| 1015 | + return TItem(Self, std::move(backData), ArraysIdx, length, ScalarsSize); |
| 1016 | + } |
| 1017 | + |
| 1018 | + private: |
| 1019 | + void UpdateArraysSize() { |
| 1020 | + ArraysSize = 0; |
| 1021 | + for (ui64 i : ArraysIdx) { |
| 1022 | + ArraysSize += NKikimr::NArrow::GetArrayDataSize(Data[i].make_array()); |
| 1023 | + } |
| 1024 | + } |
| 1025 | + |
| 1026 | + private: |
| 1027 | + TBlockSplitter& Self; |
| 1028 | + std::vector<arrow::Datum> Data; |
| 1029 | + std::vector<ui64> ArraysIdx; |
| 1030 | + ui64 NumberRows = 0; |
| 1031 | + ui64 ScalarsSize = 0; |
| 1032 | + ui64 ArraysSize = 0; |
| 1033 | + }; |
| 1034 | + |
| 1035 | +public: |
| 1036 | + TBlockSplitter(const TVector<const TBlockType*>& items, ui64 chunkSizeLimit, arrow::MemoryPool* pool) |
| 1037 | + : Items(items) |
| 1038 | + , Width(items.size()) |
| 1039 | + , ChunkSizeLimit(chunkSizeLimit) |
| 1040 | + , ArrowPool(pool ? *pool : *NYql::NUdf::GetYqlMemoryPool()) |
| 1041 | + , ScalarSizes(Width) |
| 1042 | + , BlockTrimmers(Width) |
| 1043 | + {} |
| 1044 | + |
| 1045 | + bool ShouldSplitItem(const NUdf::TUnboxedValuePod* values, ui32 count) override { |
| 1046 | + MKQL_ENSURE(count == Width, "Invalid width"); |
| 1047 | + |
| 1048 | + ui64 itemSize = 0; |
| 1049 | + for (size_t i = 0; i < Width; ++i) { |
| 1050 | + itemSize += GetDatumMemorySize(i, ExtractDatum(values[i])); |
| 1051 | + } |
| 1052 | + return itemSize > ChunkSizeLimit; |
| 1053 | + } |
| 1054 | + |
| 1055 | + std::vector<std::vector<arrow::Datum>> SplitItem(const NUdf::TUnboxedValuePod* values, ui32 count) override { |
| 1056 | + MKQL_ENSURE(count == Width, "Invalid width"); |
| 1057 | + |
| 1058 | + SplitStack.clear(); |
| 1059 | + SplitStack.emplace_back(*this, values); |
| 1060 | + std::vector<std::vector<arrow::Datum>> result; |
| 1061 | + |
| 1062 | + const auto estimatedSize = SplitStack.back().GetSize() / std::max(ChunkSizeLimit, ui64(1)); |
| 1063 | + result.reserve(estimatedSize); |
| 1064 | + SplitStack.reserve(estimatedSize); |
| 1065 | + while (!SplitStack.empty()) { |
| 1066 | + auto item = std::move(SplitStack.back()); |
| 1067 | + SplitStack.pop_back(); |
| 1068 | + |
| 1069 | + while (item.GetSize() > ChunkSizeLimit) { |
| 1070 | + if (item.GetNumberRows() <= 1) { |
| 1071 | + throw yexception() << "Row size in block is " << item.GetSize() << ", that is larger than allowed limit " << ChunkSizeLimit; |
| 1072 | + } |
| 1073 | + SplitStack.emplace_back(item.PopBack(item.GetNumberRows() / 2)); |
| 1074 | + } |
| 1075 | + result.emplace_back(item.ExtractData()); |
| 1076 | + } |
| 1077 | + return result; |
| 1078 | + } |
| 1079 | + |
| 1080 | +private: |
| 1081 | + static arrow::Datum ExtractDatum(const NUdf::TUnboxedValuePod& value) { |
| 1082 | + arrow::Datum datum = TArrowBlock::From(value).GetDatum(); |
| 1083 | + MKQL_ENSURE(datum.is_array() || datum.is_scalar(), "Expecting array or scalar"); |
| 1084 | + return datum; |
| 1085 | + } |
| 1086 | + |
| 1087 | + ui64 GetDatumMemorySize(ui64 index, const arrow::Datum& datum) { |
| 1088 | + MKQL_ENSURE(index < Width, "Invalid index"); |
| 1089 | + if (datum.is_array()) { |
| 1090 | + return NKikimr::NArrow::GetArrayMemorySize(datum.array()); |
| 1091 | + } |
| 1092 | + |
| 1093 | + if (!ScalarSizes[index]) { |
| 1094 | + const auto& array = ARROW_RESULT(arrow::MakeArrayFromScalar(*datum.scalar(), 1)); |
| 1095 | + ScalarSizes[index] = NKikimr::NArrow::GetArrayMemorySize(array->data()); |
| 1096 | + } |
| 1097 | + return *ScalarSizes[index]; |
| 1098 | + } |
| 1099 | + |
| 1100 | + IBlockTrimmer& GetColumnTrimmer(ui64 index) { |
| 1101 | + MKQL_ENSURE(index < Width, "Invalid index"); |
| 1102 | + if (!BlockTrimmers[index]) { |
| 1103 | + BlockTrimmers[index] = MakeBlockTrimmer(TTypeInfoHelper(), Items[index]->GetItemType(), &ArrowPool); |
| 1104 | + } |
| 1105 | + return *BlockTrimmers[index]; |
| 1106 | + } |
| 1107 | + |
| 1108 | +private: |
| 1109 | + const TVector<const TBlockType*> Items; |
| 1110 | + const ui64 Width; |
| 1111 | + const ui64 ChunkSizeLimit; |
| 1112 | + arrow::MemoryPool& ArrowPool; |
| 1113 | + |
| 1114 | + std::vector<std::optional<ui64>> ScalarSizes; |
| 1115 | + std::vector<IBlockTrimmer::TPtr> BlockTrimmers; |
| 1116 | + std::vector<TItem> SplitStack; |
| 1117 | +}; |
| 1118 | + |
| 1119 | +} // namespace |
| 1120 | + |
| 1121 | +IBlockSplitter::TPtr CreateBlockSplitter(const TType* type, ui64 chunkSizeLimit, arrow::MemoryPool* pool) { |
| 1122 | + if (!type->IsMulti()) { |
| 1123 | + return nullptr; |
| 1124 | + } |
| 1125 | + |
| 1126 | + const TMultiType* multiType = static_cast<const TMultiType*>(type); |
| 1127 | + const ui32 width = multiType->GetElementsCount(); |
| 1128 | + if (!width) { |
| 1129 | + return nullptr; |
| 1130 | + } |
| 1131 | + |
| 1132 | + TVector<const TBlockType*> items; |
| 1133 | + items.reserve(width); |
| 1134 | + for (ui32 i = 0; i < width; i++) { |
| 1135 | + const auto type = multiType->GetElementType(i); |
| 1136 | + if (!type->IsBlock()) { |
| 1137 | + return nullptr; |
| 1138 | + } |
| 1139 | + |
| 1140 | + const TBlockType* blockType = static_cast<const TBlockType*>(type); |
| 1141 | + if (i == width - 1) { |
| 1142 | + if (blockType->GetShape() != TBlockType::EShape::Scalar) { |
| 1143 | + return nullptr; |
| 1144 | + } |
| 1145 | + if (!blockType->GetItemType()->IsData()) { |
| 1146 | + return nullptr; |
| 1147 | + } |
| 1148 | + if (static_cast<const TDataType*>(blockType->GetItemType())->GetDataSlot() != NUdf::EDataSlot::Uint64) { |
| 1149 | + return nullptr; |
| 1150 | + } |
| 1151 | + } |
| 1152 | + |
| 1153 | + items.push_back(blockType); |
| 1154 | + } |
| 1155 | + |
| 1156 | + return MakeIntrusive<TBlockSplitter>(items, chunkSizeLimit, pool); |
| 1157 | +} |
| 1158 | + |
945 | 1159 | } // namespace NArrow
|
946 | 1160 | } // namespace NYql
|
947 |
| - |
|
0 commit comments