Implement literal np.timedelta64 coding (#10101)

spencerkclark · pre-commit-ci[bot] · dcherian · web-flow · commit f20eb7a63876 · 2025-05-23T13:04:06.000+02:00
* Proof of concept literal timedelta64 coding * Ensure test_roundtrip_timedelta_data test uses old encoding pathway * Remove no longer relevant test * Include units attribute * Move coder to times.py * Add what's new entry * Restore test and reduce diff * Fix typing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix doctests * Restore original order of encoders * Add return types to tests * Move everything to CFTimedeltaCoder; reuse code where possible * Fix mypy * Use Kai's offset and scale_factor logic for all encoding * Fix bad merge * Forbid mixing other encoding with literal timedelta64 encoding * Expose fine-grained control over decoding pathways * Rename test * Use consistent dtype spelling * Continue supporting non-timedelta dtype-only encoding * Fix example attribute in docstring * Update what's new * Fix typo * Complete test * Fix docstring * Support _FillValue or missing_value encoding * Tweak errors and warnings; relax decoding dtype error * Add xfail test for fine-resolution branch of non-pandas resolution code * Fix typing * Revert "Fix typing" This reverts commit 0929ec4. * Use simpler typing fix for now * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com> Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -71,6 +71,15 @@ Alban Farchi, Andrecho, Benoit Bovy, Deepak Cherian, Dimitri Papadopoulos Orfano
 
 New Features
 ~~~~~~~~~~~~
+- By default xarray now encodes :py:class:`numpy.timedelta64` values by
+  converting to :py:class:`numpy.int64` values and storing ``"dtype"`` and
+  ``"units"`` attributes consistent with the dtype of the in-memory
+  :py:class:`numpy.timedelta64` values, e.g. ``"timedelta64[s]"`` and
+  ``"seconds"`` for second-resolution timedeltas. These values will always be
+  decoded to timedeltas without a warning moving forward. Timedeltas encoded
+  via the previous approach can still be roundtripped exactly, but in the
+  future will not be decoded by default (:issue:`1621`, :issue:`10099`,
+  :pull:`10101`). By `Spencer Clark <https://github.com/spencerkclark>`_.
 
 - Added `scipy-stubs <https://github.com/scipy/scipy-stubs>`_ to the ``xarray[types]`` dependencies.
   By `Joren Hammudoglu <https://github.com/jorenham>`_.
diff --git a/xarray/coding/times.py b/xarray/coding/times.py
@@ -92,6 +92,12 @@
 )
 
 
+_INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS = [
+    "add_offset",
+    "scale_factor",
+]
+
+
 def _is_standard_calendar(calendar: str) -> bool:
     return calendar.lower() in _STANDARD_CALENDARS
 
@@ -1403,62 +1409,169 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
             return variable
 
 
+def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
+    dtype = attrs_or_encoding.get("dtype", None)
+    return isinstance(dtype, str) and dtype.startswith("timedelta64")
+
+
 class CFTimedeltaCoder(VariableCoder):
     """Coder for CF Timedelta coding.
 
     Parameters
     ----------
     time_unit : PDDatetimeUnitOptions
-          Target resolution when decoding timedeltas. Defaults to "ns".
+        Target resolution when decoding timedeltas via units. Defaults to "ns".
+        When decoding via dtype, the resolution is specified in the dtype
+        attribute, so this parameter is ignored.
+    decode_via_units : bool
+        Whether to decode timedeltas based on the presence of a timedelta-like
+        units attribute, e.g. "seconds". Defaults to True, but in the future
+        will default to False.
+    decode_via_dtype : bool
+        Whether to decode timedeltas based on the presence of a np.timedelta64
+        dtype attribute, e.g. "timedelta64[s]". Defaults to True.
     """
 
     def __init__(
         self,
         time_unit: PDDatetimeUnitOptions = "ns",
+        decode_via_units: bool = True,
+        decode_via_dtype: bool = True,
     ) -> None:
         self.time_unit = time_unit
+        self.decode_via_units = decode_via_units
+        self.decode_via_dtype = decode_via_dtype
         self._emit_decode_timedelta_future_warning = False
 
     def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         if np.issubdtype(variable.data.dtype, np.timedelta64):
             dims, data, attrs, encoding = unpack_for_encoding(variable)
+            has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
+            if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
+                dtype = encoding.get("dtype", None)
+                units = encoding.pop("units", None)
 
-            dtype = encoding.get("dtype", None)
-
-            # in the case of packed data we need to encode into
-            # float first, the correct dtype will be established
-            # via CFScaleOffsetCoder/CFMaskCoder
-            if "add_offset" in encoding or "scale_factor" in encoding:
-                dtype = data.dtype if data.dtype.kind == "f" else "float64"
+                # in the case of packed data we need to encode into
+                # float first, the correct dtype will be established
+                # via CFScaleOffsetCoder/CFMaskCoder
+                if "add_offset" in encoding or "scale_factor" in encoding:
+                    dtype = data.dtype if data.dtype.kind == "f" else "float64"
 
-            data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype)
+            else:
+                resolution, _ = np.datetime_data(variable.dtype)
+                dtype = np.int64
+                attrs_dtype = f"timedelta64[{resolution}]"
+                units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
+                safe_setitem(attrs, "dtype", attrs_dtype, name=name)
+                # Remove dtype encoding if it exists to prevent it from
+                # interfering downstream in NonStringCoder.
+                encoding.pop("dtype", None)
+
+                if any(
+                    k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
+                ):
+                    raise ValueError(
+                        f"Specifying 'add_offset' or 'scale_factor' is not "
+                        f"supported when encoding the timedelta64 values of "
+                        f"variable {name!r} with xarray's new default "
+                        f"timedelta64 encoding approach. To encode {name!r} "
+                        f"with xarray's previous timedelta64 encoding "
+                        f"approach, which supports the 'add_offset' and "
+                        f"'scale_factor' parameters, additionally set "
+                        f"encoding['units'] to a unit of time, e.g. "
+                        f"'seconds'. To proceed with encoding of {name!r} "
+                        f"via xarray's new approach, remove any encoding "
+                        f"entries for 'add_offset' or 'scale_factor'."
+                    )
+                if "_FillValue" not in encoding and "missing_value" not in encoding:
+                    encoding["_FillValue"] = np.iinfo(np.int64).min
 
+            data, units = encode_cf_timedelta(data, units, dtype)
             safe_setitem(attrs, "units", units, name=name)
-
             return Variable(dims, data, attrs, encoding, fastpath=True)
         else:
             return variable
 
     def decode(self, variable: Variable, name: T_Name = None) -> Variable:
         units = variable.attrs.get("units", None)
-        if isinstance(units, str) and units in TIME_UNITS:
-            if self._emit_decode_timedelta_future_warning:
-                emit_user_level_warning(
-                    "In a future version of xarray decode_timedelta will "
-                    "default to False rather than None. To silence this "
-                    "warning, set decode_timedelta to True, False, or a "
-                    "'CFTimedeltaCoder' instance.",
-                    FutureWarning,
-                )
+        has_timedelta_units = isinstance(units, str) and units in TIME_UNITS
+        has_timedelta_dtype = has_timedelta64_encoding_dtype(variable.attrs)
+        is_dtype_decodable = has_timedelta_units and has_timedelta_dtype
+        is_units_decodable = has_timedelta_units
+        if (is_dtype_decodable and self.decode_via_dtype) or (
+            is_units_decodable and self.decode_via_units
+        ):
             dims, data, attrs, encoding = unpack_for_decoding(variable)
-
             units = pop_to(attrs, encoding, "units")
-            dtype = np.dtype(f"timedelta64[{self.time_unit}]")
-            transform = partial(
-                decode_cf_timedelta, units=units, time_unit=self.time_unit
-            )
+            if is_dtype_decodable and self.decode_via_dtype:
+                if any(
+                    k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
+                ):
+                    raise ValueError(
+                        f"Decoding timedelta64 values via dtype is not "
+                        f"supported when 'add_offset', or 'scale_factor' are "
+                        f"present in encoding. Check the encoding parameters "
+                        f"of variable {name!r}."
+                    )
+                dtype = pop_to(attrs, encoding, "dtype", name=name)
+                dtype = np.dtype(dtype)
+                resolution, _ = np.datetime_data(dtype)
+                resolution = cast(NPDatetimeUnitOptions, resolution)
+                if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
+                    time_unit = cast(PDDatetimeUnitOptions, "s")
+                    dtype = np.dtype("timedelta64[s]")
+                    message = (
+                        f"Following pandas, xarray only supports decoding to "
+                        f"timedelta64 values with a resolution of 's', 'ms', "
+                        f"'us', or 'ns'. Encoded values for variable {name!r} "
+                        f"have a resolution of {resolution!r}. Attempting to "
+                        f"decode to a resolution of 's'. Note, depending on "
+                        f"the encoded values, this may lead to an "
+                        f"OverflowError. Additionally, data will not be "
+                        f"identically round tripped; xarray will choose an "
+                        f"encoding dtype of 'timedelta64[s]' when re-encoding."
+                    )
+                    emit_user_level_warning(message)
+                elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
+                    time_unit = cast(PDDatetimeUnitOptions, "ns")
+                    dtype = np.dtype("timedelta64[ns]")
+                    message = (
+                        f"Following pandas, xarray only supports decoding to "
+                        f"timedelta64 values with a resolution of 's', 'ms', "
+                        f"'us', or 'ns'. Encoded values for variable {name!r} "
+                        f"have a resolution of {resolution!r}. Attempting to "
+                        f"decode to a resolution of 'ns'. Note, depending on "
+                        f"the encoded values, this may lead to loss of "
+                        f"precision. Additionally, data will not be "
+                        f"identically round tripped; xarray will choose an "
+                        f"encoding dtype of 'timedelta64[ns]' "
+                        f"when re-encoding."
+                    )
+                    emit_user_level_warning(message)
+                else:
+                    time_unit = cast(PDDatetimeUnitOptions, resolution)
+            elif self.decode_via_units:
+                if self._emit_decode_timedelta_future_warning:
+                    emit_user_level_warning(
+                        "In a future version, xarray will not decode "
+                        "timedelta values based on the presence of a "
+                        "timedelta-like units attribute by default. Instead "
+                        "it will rely on the presence of a timedelta64 dtype "
+                        "attribute, which is now xarray's default way of "
+                        "encoding timedelta64 values. To continue decoding "
+                        "timedeltas based on the presence of a timedelta-like "
+                        "units attribute, users will need to explicitly "
+                        "opt-in by passing True or "
+                        "CFTimedeltaCoder(decode_via_units=True) to "
+                        "decode_timedelta. To silence this warning, set "
+                        "decode_timedelta to True, False, or a "
+                        "'CFTimedeltaCoder' instance.",
+                        FutureWarning,
+                    )
+                dtype = np.dtype(f"timedelta64[{self.time_unit}]")
+                time_unit = self.time_unit
+            transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
             data = lazy_elemwise_func(data, transform, dtype=dtype)
-
             return Variable(dims, data, attrs, encoding, fastpath=True)
         else:
             return variable
diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -204,8 +204,10 @@ def decode_cf_variable(
             var = coder.decode(var, name=name)
 
     if decode_timedelta:
-        if not isinstance(decode_timedelta, CFTimedeltaCoder):
-            decode_timedelta = CFTimedeltaCoder()
+        if isinstance(decode_timedelta, bool):
+            decode_timedelta = CFTimedeltaCoder(
+                decode_via_units=decode_timedelta, decode_via_dtype=decode_timedelta
+            )
         decode_timedelta._emit_decode_timedelta_future_warning = (
             decode_timedelta_was_none
         )
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -635,7 +635,10 @@ def test_roundtrip_timedelta_data(self) -> None:
         #  though we cannot test that until we fix the timedelta decoding
         #  to support large ranges
         time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s")  # type: ignore[arg-type, unused-ignore]
+        encoding = {"units": "seconds"}
         expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]})
+        expected["td"].encoding = encoding
+        expected["td0"].encoding = encoding
         with self.roundtrip(
             expected, open_kwargs={"decode_timedelta": CFTimedeltaCoder(time_unit="ns")}
         ) as actual:
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py