22
22
)
23
23
from xarray .core import indexing
24
24
from xarray .core .common import contains_cftime_datetimes , is_np_datetime_like
25
+ from xarray .core .duck_array_ops import asarray
25
26
from xarray .core .formatting import first_n_items , format_timestamp , last_item
27
+ from xarray .core .parallelcompat import T_ChunkedArray , get_chunked_array_type
26
28
from xarray .core .pdcompat import nanosecond_precision_timestamp
27
- from xarray .core .pycompat import is_duck_dask_array
29
+ from xarray .core .pycompat import is_chunked_array , is_duck_dask_array
28
30
from xarray .core .utils import emit_user_level_warning
29
31
from xarray .core .variable import Variable
30
32
34
36
cftime = None
35
37
36
38
if TYPE_CHECKING :
37
- from xarray .core .types import CFCalendar
39
+ from xarray .core .types import CFCalendar , T_DuckArray
38
40
39
41
T_Name = Union [Hashable , None ]
40
42
@@ -667,12 +669,48 @@ def _division(deltas, delta, floor):
667
669
return num
668
670
669
671
672
+ def _cast_to_dtype_if_safe (num : np .ndarray , dtype : np .dtype ) -> np .ndarray :
673
+ with warnings .catch_warnings ():
674
+ warnings .filterwarnings ("ignore" , message = "overflow" )
675
+ cast_num = np .asarray (num , dtype = dtype )
676
+
677
+ if np .issubdtype (dtype , np .integer ):
678
+ if not (num == cast_num ).all ():
679
+ if np .issubdtype (num .dtype , np .floating ):
680
+ raise ValueError (
681
+ f"Not possible to cast all encoded times from "
682
+ f"{ num .dtype !r} to { dtype !r} without losing precision. "
683
+ f"Consider modifying the units such that integer values "
684
+ f"can be used, or removing the units and dtype encoding, "
685
+ f"at which point xarray will make an appropriate choice."
686
+ )
687
+ else :
688
+ raise OverflowError (
689
+ f"Not possible to cast encoded times from "
690
+ f"{ num .dtype !r} to { dtype !r} without overflow. Consider "
691
+ f"removing the dtype encoding, at which point xarray will "
692
+ f"make an appropriate choice, or explicitly switching to "
693
+ "a larger integer dtype."
694
+ )
695
+ else :
696
+ if np .isinf (cast_num ).any ():
697
+ raise OverflowError (
698
+ f"Not possible to cast encoded times from { num .dtype !r} to "
699
+ f"{ dtype !r} without overflow. Consider removing the dtype "
700
+ f"encoding, at which point xarray will make an appropriate "
701
+ f"choice, or explicitly switching to a larger floating point "
702
+ f"dtype."
703
+ )
704
+
705
+ return cast_num
706
+
707
+
670
708
def encode_cf_datetime (
671
- dates ,
709
+ dates : T_DuckArray , # type: ignore
672
710
units : str | None = None ,
673
711
calendar : str | None = None ,
674
712
dtype : np .dtype | None = None ,
675
- ) -> tuple [np . ndarray , str , str ]:
713
+ ) -> tuple [T_DuckArray , str , str ]:
676
714
"""Given an array of datetime objects, returns the tuple `(num, units,
677
715
calendar)` suitable for a CF compliant time variable.
678
716
@@ -682,7 +720,21 @@ def encode_cf_datetime(
682
720
--------
683
721
cftime.date2num
684
722
"""
685
- dates = np .asarray (dates )
723
+ dates = asarray (dates )
724
+ if is_chunked_array (dates ):
725
+ return _lazily_encode_cf_datetime (dates , units , calendar , dtype )
726
+ else :
727
+ return _eagerly_encode_cf_datetime (dates , units , calendar , dtype )
728
+
729
+
730
+ def _eagerly_encode_cf_datetime (
731
+ dates : T_DuckArray , # type: ignore
732
+ units : str | None = None ,
733
+ calendar : str | None = None ,
734
+ dtype : np .dtype | None = None ,
735
+ allow_units_modification : bool = True ,
736
+ ) -> tuple [T_DuckArray , str , str ]:
737
+ dates = asarray (dates )
686
738
687
739
data_units = infer_datetime_units (dates )
688
740
@@ -731,7 +783,7 @@ def encode_cf_datetime(
731
783
f"Set encoding['dtype'] to integer dtype to serialize to int64. "
732
784
f"Set encoding['dtype'] to floating point dtype to silence this warning."
733
785
)
734
- elif np .issubdtype (dtype , np .integer ):
786
+ elif np .issubdtype (dtype , np .integer ) and allow_units_modification :
735
787
new_units = f"{ needed_units } since { format_timestamp (ref_date )} "
736
788
emit_user_level_warning (
737
789
f"Times can't be serialized faithfully to int64 with requested units { units !r} . "
@@ -752,12 +804,80 @@ def encode_cf_datetime(
752
804
# we already covered for this in pandas-based flow
753
805
num = cast_to_int_if_safe (num )
754
806
755
- return (num , units , calendar )
807
+ if dtype is not None :
808
+ num = _cast_to_dtype_if_safe (num , dtype )
809
+
810
+ return num , units , calendar
811
+
812
+
813
def _encode_cf_datetime_within_map_blocks(
    dates: T_DuckArray,  # type: ignore
    units: str,
    calendar: str,
    dtype: np.dtype,
) -> T_DuckArray:
    """Encode one block of datetimes and return only the numeric values.

    Delegates to ``_eagerly_encode_cf_datetime`` with units modification
    disabled and discards everything in its result except the first element.
    """
    encoded, *_ = _eagerly_encode_cf_datetime(
        dates,
        units=units,
        calendar=calendar,
        dtype=dtype,
        allow_units_modification=False,
    )
    return encoded
823
+
824
+
825
+ def _lazily_encode_cf_datetime (
826
+ dates : T_ChunkedArray ,
827
+ units : str | None = None ,
828
+ calendar : str | None = None ,
829
+ dtype : np .dtype | None = None ,
830
+ ) -> tuple [T_ChunkedArray , str , str ]:
831
+ if calendar is None :
832
+ # This will only trigger minor compute if dates is an object dtype array.
833
+ calendar = infer_calendar_name (dates )
834
+
835
+ if units is None and dtype is None :
836
+ if dates .dtype == "O" :
837
+ units = "microseconds since 1970-01-01"
838
+ dtype = np .dtype ("int64" )
839
+ else :
840
+ units = "nanoseconds since 1970-01-01"
841
+ dtype = np .dtype ("int64" )
842
+
843
+ if units is None or dtype is None :
844
+ raise ValueError (
845
+ f"When encoding chunked arrays of datetime values, both the units "
846
+ f"and dtype must be prescribed or both must be unprescribed. "
847
+ f"Prescribing only one or the other is not currently supported. "
848
+ f"Got a units encoding of { units } and a dtype encoding of { dtype } ."
849
+ )
850
+
851
+ chunkmanager = get_chunked_array_type (dates )
852
+ num = chunkmanager .map_blocks (
853
+ _encode_cf_datetime_within_map_blocks ,
854
+ dates ,
855
+ units ,
856
+ calendar ,
857
+ dtype ,
858
+ dtype = dtype ,
859
+ )
860
+ return num , units , calendar
756
861
757
862
758
863
def encode_cf_timedelta(
    timedeltas: T_DuckArray,  # type: ignore
    units: str | None = None,
    dtype: np.dtype | None = None,
) -> tuple[T_DuckArray, str]:
    """Encode timedeltas as numbers, dispatching on whether they are chunked.

    The input is first passed through ``asarray``. Chunked arrays are handed
    to ``_lazily_encode_cf_timedelta``; everything else goes to
    ``_eagerly_encode_cf_timedelta``. The chosen encoder's result is returned
    unchanged.
    """
    timedeltas = asarray(timedeltas)
    encoder = (
        _lazily_encode_cf_timedelta
        if is_chunked_array(timedeltas)
        else _eagerly_encode_cf_timedelta
    )
    return encoder(timedeltas, units, dtype)
873
+
874
+
875
+ def _eagerly_encode_cf_timedelta (
876
+ timedeltas : T_DuckArray , # type: ignore
877
+ units : str | None = None ,
878
+ dtype : np .dtype | None = None ,
879
+ allow_units_modification : bool = True ,
880
+ ) -> tuple [T_DuckArray , str ]:
761
881
data_units = infer_timedelta_units (timedeltas )
762
882
763
883
if units is None :
@@ -784,7 +904,7 @@ def encode_cf_timedelta(
784
904
f"Set encoding['dtype'] to integer dtype to serialize to int64. "
785
905
f"Set encoding['dtype'] to floating point dtype to silence this warning."
786
906
)
787
- elif np .issubdtype (dtype , np .integer ):
907
+ elif np .issubdtype (dtype , np .integer ) and allow_units_modification :
788
908
emit_user_level_warning (
789
909
f"Timedeltas can't be serialized faithfully with requested units { units !r} . "
790
910
f"Serializing with units { needed_units !r} instead. "
@@ -797,7 +917,49 @@ def encode_cf_timedelta(
797
917
798
918
num = _division (time_deltas , time_delta , floor_division )
799
919
num = num .values .reshape (timedeltas .shape )
800
- return (num , units )
920
+
921
+ if dtype is not None :
922
+ num = _cast_to_dtype_if_safe (num , dtype )
923
+
924
+ return num , units
925
+
926
+
927
def _encode_cf_timedelta_within_map_blocks(
    timedeltas: T_DuckArray,  # type:ignore
    units: str,
    dtype: np.dtype,
) -> T_DuckArray:
    """Encode one block of timedeltas and return only the numeric values.

    Delegates to ``_eagerly_encode_cf_timedelta`` with units modification
    disabled and drops the units element of its two-item result.
    """
    encoded, _units = _eagerly_encode_cf_timedelta(
        timedeltas,
        units=units,
        dtype=dtype,
        allow_units_modification=False,
    )
    return encoded
936
+
937
+
938
+ def _lazily_encode_cf_timedelta (
939
+ timedeltas : T_ChunkedArray , units : str | None = None , dtype : np .dtype | None = None
940
+ ) -> tuple [T_ChunkedArray , str ]:
941
+ if units is None and dtype is None :
942
+ units = "nanoseconds"
943
+ dtype = np .dtype ("int64" )
944
+
945
+ if units is None or dtype is None :
946
+ raise ValueError (
947
+ f"When encoding chunked arrays of timedelta values, both the "
948
+ f"units and dtype must be prescribed or both must be "
949
+ f"unprescribed. Prescribing only one or the other is not "
950
+ f"currently supported. Got a units encoding of { units } and a "
951
+ f"dtype encoding of { dtype } ."
952
+ )
953
+
954
+ chunkmanager = get_chunked_array_type (timedeltas )
955
+ num = chunkmanager .map_blocks (
956
+ _encode_cf_timedelta_within_map_blocks ,
957
+ timedeltas ,
958
+ units ,
959
+ dtype ,
960
+ dtype = dtype ,
961
+ )
962
+ return num , units
801
963
802
964
803
965
class CFDatetimeCoder (VariableCoder ):
0 commit comments