Fix various issues related to Decimal and Arrow

robot-piglet · robot-piglet · commit 6559ca8141e0 · 2024-11-14T16:10:31.000+03:00
* Changelog entry Type: fix Component: proxy Some fixes in decimal: * Support parsing of nested `decimal128`/`decimal256` in Arrow. * Fix bug in `decimal256` unversioned value representation -- before this change, the Arrow parser for `decimal256(n, p)` always emitted 256-bit strings, even for n <= 38, which is incorrect for the YT `decimal(n, p)` representation. Now it produces strings of variadic length (32, 64, 128 or 256 bits) depending on n, similar to `decimal128(n, p)`. --- Pull Request resolved: <ytsaurus/ytsaurus#942> commit_hash:32e66c7eb4d996caf0893f97d269fb1930bc5f7a
diff --git a/yt/yt/library/decimal/decimal.cpp b/yt/yt/library/decimal/decimal.cpp
@@ -891,10 +891,28 @@ TStringBuf TDecimal::WriteBinary256(int precision, TValue256 value, char* buffer
     CheckDecimalIntBits<TValue256>(precision);
     YT_VERIFY(bufferLength >= resultLength);
 
-    DecimalIntegerToBinaryUnchecked(std::move(value), buffer);
+    DecimalIntegerToBinaryUnchecked(value, buffer);
     return TStringBuf{buffer, sizeof(TValue256)};
 }
 
+TStringBuf TDecimal::WriteBinary256Variadic(int precision, TValue256 value, char* buffer, size_t bufferLength)
+{
+    // NB: Narrower values are the leading bytes of |value|; memcpy them out, as reading Parts via i64*/TValue128* is UB.
+    auto lowBytesAs = [&] (auto result) { std::memcpy(&result, value.Parts.data(), sizeof(result)); return result; };
+    switch (GetValueBinarySize(precision)) {
+        case 4:
+            return WriteBinary32(precision, lowBytesAs(i32{}), buffer, bufferLength);
+        case 8:
+            return WriteBinary64(precision, lowBytesAs(i64{}), buffer, bufferLength);
+        case 16:
+            return WriteBinary128(precision, lowBytesAs(TValue128{}), buffer, bufferLength);
+        case 32:
+            return WriteBinary256(precision, value, buffer, bufferLength);
+        default:
+            THROW_ERROR_EXCEPTION("Invalid precision %v", precision);
+    }
+}
+
 template <typename T>
 Y_FORCE_INLINE void CheckBufferLength(int precision, size_t bufferLength)
 {
diff --git a/yt/yt/library/decimal/decimal.h b/yt/yt/library/decimal/decimal.h
@@ -24,6 +24,7 @@ class TDecimal
     };
     static_assert(sizeof(TValue128) == 2 * sizeof(ui64));
 
+    //! Little-endian representation of 256-bit decimal value.
     struct TValue256
     {
         std::array<ui32, 8> Parts;
@@ -64,6 +65,8 @@ class TDecimal
 
     // Writes either 32-bit, 64-bit or 128-bit binary value depending on precision, provided a TValue128.
     static TStringBuf WriteBinary128Variadic(int precision, TValue128 value, char* buffer, size_t bufferLength);
+    // Writes either 32-bit, 64-bit, 128-bit or 256-bit binary value depending on precision, provided a TValue256.
+    static TStringBuf WriteBinary256Variadic(int precision, TValue256 value, char* buffer, size_t bufferLength);
 
     static i32 ParseBinary32(int precision, TStringBuf buffer);
     static i64 ParseBinary64(int precision, TStringBuf buffer);
diff --git a/yt/yt/library/formats/arrow_parser.cpp b/yt/yt/library/formats/arrow_parser.cpp
@@ -40,6 +40,28 @@ void ThrowOnError(const arrow::Status& status)
     }
 }
 
+//! Re-encodes a raw Arrow decimal payload as a YT binary decimal.
+//! |value| has to be exactly sizeof(TUnderlyingValueType) bytes long; the encoded bytes are placed into |buffer|
+//! and the view returned by the matching TDecimal::WriteBinary*Variadic call is handed back to the caller.
+template <class TUnderlyingValueType>
+TStringBuf SerializeDecimalBinary(const TStringBuf& value, int precision, char* buffer, size_t bufferLength)
+{
+    // NB: Arrow ships Decimal128 as a little-endian 128-bit integer,
+    // whereas YT keeps Decimal in memory as a big-endian value of 32, 64 or 128 bits
+    // whose MSB is flipped so that it sorts lexically.
+    // Decimal256 follows the same scheme with the length capped at 256 bits.
+    YT_VERIFY(value.size() == sizeof(TUnderlyingValueType));
+    TUnderlyingValueType arrowDecimal;
+    std::memcpy(&arrowDecimal, value.data(), sizeof(arrowDecimal));
+
+    if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue128>) {
+        return TDecimal::WriteBinary128Variadic(precision, arrowDecimal, buffer, bufferLength);
+    } else {
+        static_assert(std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>, "Unexpected decimal type");
+        return TDecimal::WriteBinary256Variadic(precision, arrowDecimal, buffer, bufferLength);
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 class TArraySimpleVisitor
@@ -291,28 +313,12 @@ class TArraySimpleVisitor
     }
 
     template <class TUnderlyingValueType>
-    TUnversionedValue MakeDecimalBinaryValue(const TStringBuf& value, i64 columnId, int precision)
+    TUnversionedValue MakeDecimalBinaryValue(const TStringBuf& arrowValue, i64 columnId, int precision)
     {
-        // NB: Arrow wire representation of Decimal128 is little-endian and (obviously) 128 bit,
-        // while YT in-memory representation of Decimal is big-endian, variadic-length of either 32 bit, 64 bit or 128 bit,
-        // and MSB-flipped to ensure lexical sorting order.
-        // Representation of Decimal256 is similar, but only 256 bits.
-        TUnderlyingValueType decimalValue;
-        YT_VERIFY(value.size() == sizeof(decimalValue));
-        std::memcpy(&decimalValue, value.data(), value.size());
-
-        const auto maxByteCount = sizeof(decimalValue);
+        const auto maxByteCount = sizeof(TUnderlyingValueType);
         char* buffer = BufferForStringLikeValues_->Preallocate(maxByteCount);
-        TStringBuf decimalBinary;
-        if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue128>) {
-            decimalBinary = TDecimal::WriteBinary128Variadic(precision, decimalValue, buffer, maxByteCount);
-        } else if constexpr (std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>) {
-            decimalBinary = TDecimal::WriteBinary256(precision, decimalValue, buffer, maxByteCount);
-        } else {
-            static_assert(std::is_same_v<TUnderlyingValueType, TDecimal::TValue256>, "Unexpected decimal type");
-        }
+        auto decimalBinary = SerializeDecimalBinary<TUnderlyingValueType>(arrowValue, precision, buffer, maxByteCount);
         BufferForStringLikeValues_->Advance(decimalBinary.size());
-
         return MakeUnversionedStringValue(decimalBinary, columnId);
     }
 };
@@ -456,6 +462,20 @@ class TArrayCompositeVisitor
         return ParseStruct();
     }
 
+    arrow::Status Visit(const arrow::Decimal128Type& type) override
+    {
+        // Nested decimal128: re-encode each Arrow payload into YT binary decimal form before writing it.
+        auto writeDecimal = [&] (const TStringBuf& raw) { WriteDecimalBinary<TDecimal::TValue128>(raw, type.precision()); };
+        return ParseStringLikeArray<arrow::Decimal128Array>(writeDecimal);
+    }
+
+    arrow::Status Visit(const arrow::Decimal256Type& type) override
+    {
+        // Nested decimal256: re-encode each Arrow payload into YT binary decimal form before writing it.
+        auto writeDecimal = [&] (const TStringBuf& raw) { WriteDecimalBinary<TDecimal::TValue256>(raw, type.precision()); };
+        return ParseStringLikeArray<arrow::Decimal256Array>(writeDecimal);
+    }
+
 private:
     const int RowIndex_;
 
@@ -505,13 +525,21 @@ class TArrayCompositeVisitor
 
     template <typename ArrayType>
     arrow::Status ParseStringLikeArray()
+    {
+        return ParseStringLikeArray<ArrayType>([&] (const TStringBuf& value) {
+            Writer_->WriteBinaryString(value);
+        });
+    }
+
+    template <typename ArrayType>
+    arrow::Status ParseStringLikeArray(auto writeStringValue)
     {
         auto array = std::static_pointer_cast<ArrayType>(Array_);
         if (array->IsNull(RowIndex_)) {
             Writer_->WriteEntity();
         } else {
             auto element = array->GetView(RowIndex_);
-            Writer_->WriteBinaryString(TStringBuf(element.data(), element.size()));
+            writeStringValue(TStringBuf(element.data(), element.size()));
         }
         return arrow::Status::OK();
     }
@@ -610,6 +638,15 @@ class TArrayCompositeVisitor
         }
         return arrow::Status::OK();
     }
+
+    template <class TUnderlyingType>
+    void WriteDecimalBinary(TStringBuf arrowValue, int precision)
+    {
+        // Stack scratch space sized for the widest encoding of TUnderlyingType; only the returned view is written out.
+        char scratch[sizeof(TUnderlyingType)];
+        Writer_->WriteBinaryString(
+            SerializeDecimalBinary<TUnderlyingType>(arrowValue, precision, scratch, sizeof(scratch)));
+    }
 };
 
 ////////////////////////////////////////////////////////////////////////////////