@@ -669,6 +669,28 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
669
669
}
670
670
}
671
671
672
+ TColumnConverter ArrowComputeConvertor (const std::string& columnName, const std::shared_ptr<arrow::DataType>& sourceType, const std::shared_ptr<arrow::DataType>& targetType) {
673
+ YQL_ENSURE (arrow::compute::CanCast (*sourceType, *targetType), " Can not cast column " << columnName << " , from source type " << sourceType->ToString () << " to target type " << targetType->ToString ());
674
+ return [targetType](const std::shared_ptr<arrow::Array>& value) {
675
+ auto res = arrow::compute::Cast (*value, targetType);
676
+ THROW_ARROW_NOT_OK (res.status ());
677
+ return std::move (res).ValueOrDie ();
678
+ };
679
+ }
680
+
681
+ TColumnConverter YqlBlockTzDateToArrow (const std::string& columnName, const std::shared_ptr<arrow::DataType>& sourceType) {
682
+ YQL_ENSURE (sourceType->id () == arrow::Type::STRUCT, " Yql Tz block shoud have struct type" );
683
+ YQL_ENSURE (sourceType->num_fields () == 2 , " Yql Tz block shoud have two fields" );
684
+ return [columnName, sourceType](const std::shared_ptr<arrow::Array>& value) {
685
+ YQL_ENSURE (value->type ()->Equals (sourceType), " Unexpected block type: " << value->type ()->ToString () << " , expected type: " << sourceType->ToString () << " in column: " << columnName);
686
+ const auto structValue = std::static_pointer_cast<arrow::StructArray>(value);
687
+ const auto dateField = structValue->field (0 )->data ()->Copy ();
688
+ dateField->null_count = structValue->null_count ();
689
+ dateField->buffers [0 ] = structValue->null_bitmap ();
690
+ return arrow::MakeArray (dateField);
691
+ };
692
+ }
693
+
672
694
}
673
695
674
696
namespace NYql ::NDq {
@@ -698,11 +720,50 @@ TColumnConverter BuildColumnConverter(const std::string& columnName, const std::
698
720
<< targetType->ToString () << " , got: " << originalType->ToString ());
699
721
}
700
722
701
- return [targetType](const std::shared_ptr<arrow::Array>& value) {
702
- auto res = arrow::compute::Cast (*value, targetType);
703
- THROW_ARROW_NOT_OK (res.status ());
704
- return std::move (res).ValueOrDie ();
705
- };
723
+ return ArrowComputeConvertor (columnName, originalType, targetType);
724
+ }
725
+
726
+ TColumnConverter BuildOutputColumnConverter (const std::string& columnName, NKikimr::NMiniKQL::TType* columnType) {
727
+ std::shared_ptr<arrow::DataType> yqlArrowType, s3OutputType;
728
+ YQL_ENSURE (ConvertArrowType (columnType, yqlArrowType), " Got unsupported yql block type: " << *columnType << " in column " << columnName);
729
+ YQL_ENSURE (S3ConvertArrowOutputType (columnType, s3OutputType), " Got unsupported s3 output block type: " << *columnType << " in column " << columnName);
730
+
731
+ if (columnType->IsOptional ()) {
732
+ columnType = AS_TYPE (TOptionalType, columnType)->GetItemType ();
733
+ }
734
+ YQL_ENSURE (columnType->IsData (), " Allowed only data types for S3 output, but got: " << *columnType << " in column " << columnName);
735
+ const auto slot = AS_TYPE (TDataType, columnType)->GetDataSlot ();
736
+ YQL_ENSURE (slot, " Got invalid data type " << *columnType << " in column " << columnName);
737
+
738
+ switch (*slot) {
739
+ case NUdf::EDataSlot::Bool:
740
+ case NUdf::EDataSlot::Int8:
741
+ case NUdf::EDataSlot::Uint8:
742
+ case NUdf::EDataSlot::Int16:
743
+ case NUdf::EDataSlot::Uint16:
744
+ case NUdf::EDataSlot::Int32:
745
+ case NUdf::EDataSlot::Uint32:
746
+ case NUdf::EDataSlot::Int64:
747
+ case NUdf::EDataSlot::Uint64:
748
+ case NUdf::EDataSlot::Float:
749
+ case NUdf::EDataSlot::Double:
750
+ case NUdf::EDataSlot::String:
751
+ case NUdf::EDataSlot::Date:
752
+ case NUdf::EDataSlot::Datetime:
753
+ case NUdf::EDataSlot::Timestamp:
754
+ return {};
755
+ case NUdf::EDataSlot::Utf8:
756
+ case NUdf::EDataSlot::Json:
757
+ return ArrowComputeConvertor (columnName, yqlArrowType, s3OutputType);
758
+ case NUdf::EDataSlot::TzDate:
759
+ case NUdf::EDataSlot::TzDatetime:
760
+ case NUdf::EDataSlot::TzTimestamp:
761
+ return YqlBlockTzDateToArrow (columnName, yqlArrowType);
762
+ default :
763
+ YQL_ENSURE (false , " Got unsupported s3 output block type: " << *columnType << " in column " << columnName);
764
+ }
765
+
766
+ return {};
706
767
}
707
768
708
769
void BuildColumnConverters (std::shared_ptr<arrow::Schema> outputSchema, std::shared_ptr<arrow::Schema> dataSchema,
@@ -753,4 +814,82 @@ std::shared_ptr<arrow::RecordBatch> ConvertArrowColumns(std::shared_ptr<arrow::R
753
814
return arrow::RecordBatch::Make (batch->schema (), batch->num_rows (), columns);
754
815
}
755
816
817
+ // Type conversion same as in ClickHouseClient.SerializeFormat udf
818
+ bool S3ConvertArrowOutputType (NUdf::EDataSlot slot, std::shared_ptr<arrow::DataType>& type) {
819
+ switch (slot) {
820
+ case NUdf::EDataSlot::Int8:
821
+ type = arrow::int8 ();
822
+ return true ;
823
+ case NUdf::EDataSlot::Bool:
824
+ case NUdf::EDataSlot::Uint8:
825
+ type = arrow::uint8 ();
826
+ return true ;
827
+ case NUdf::EDataSlot::Int16:
828
+ type = arrow::int16 ();
829
+ return true ;
830
+ case NUdf::EDataSlot::Date:
831
+ case NUdf::EDataSlot::TzDate:
832
+ case NUdf::EDataSlot::Uint16:
833
+ type = arrow::uint16 ();
834
+ return true ;
835
+ case NUdf::EDataSlot::Int32:
836
+ type = arrow::int32 ();
837
+ return true ;
838
+ case NUdf::EDataSlot::Datetime:
839
+ case NUdf::EDataSlot::TzDatetime:
840
+ case NUdf::EDataSlot::Uint32:
841
+ type = arrow::uint32 ();
842
+ return true ;
843
+ case NUdf::EDataSlot::Int64:
844
+ type = arrow::int64 ();
845
+ return true ;
846
+ case NUdf::EDataSlot::Uint64:
847
+ type = arrow::uint64 ();
848
+ return true ;
849
+ case NUdf::EDataSlot::Float:
850
+ type = arrow::float32 ();
851
+ return true ;
852
+ case NUdf::EDataSlot::Double:
853
+ type = arrow::float64 ();
854
+ return true ;
855
+ case NUdf::EDataSlot::String:
856
+ case NUdf::EDataSlot::Utf8:
857
+ case NUdf::EDataSlot::Json:
858
+ type = arrow::binary ();
859
+ return true ;
860
+ case NUdf::EDataSlot::Timestamp:
861
+ case NUdf::EDataSlot::TzTimestamp:
862
+ type = arrow::timestamp (arrow::TimeUnit::MICRO, " UTC" );
863
+ return true ;
864
+ default :
865
+ break ;
866
+ }
867
+ return false ;
868
+ }
869
+
870
+ bool S3ConvertArrowOutputType (TType* itemType, std::shared_ptr<arrow::DataType>& type) {
871
+ if (itemType->IsOptional ()) {
872
+ itemType = AS_TYPE (TOptionalType, itemType)->GetItemType ();
873
+ }
874
+ if (!itemType->IsData ()) {
875
+ return false ;
876
+ }
877
+
878
+ const auto slot = AS_TYPE (TDataType, itemType)->GetDataSlot ();
879
+ if (!slot) {
880
+ return false ;
881
+ }
882
+
883
+ return S3ConvertArrowOutputType (*slot, type);
884
+ }
885
+
886
+ void BuildOutputColumnConverters (const NKikimr::NMiniKQL::TStructType* outputStructType, std::vector<TColumnConverter>& columnConverters) {
887
+ columnConverters.reserve (outputStructType->GetMembersCount ());
888
+ for (ui32 i = 0 ; i < outputStructType->GetMembersCount (); ++i) {
889
+ auto * const type = outputStructType->GetMemberType (i);
890
+ const std::string name (outputStructType->GetMemberName (i));
891
+ columnConverters.emplace_back (BuildOutputColumnConverter (name, type));
892
+ }
893
+ }
894
+
756
895
} // namespace NYql::NDq
0 commit comments