Skip to content

Commit 516ec07

Browse files
spencerkclarkpre-commit-ci[bot]dcheriankmuehlbauer
authored
Fix critical np.timedelta64 encoding bugs (#10469)
This PR fixes the critical `np.timedelta64` encoding bugs introduced in #10101. We now always encode `np.timedelta64` values with a `dtype` attribute corresponding to the in-memory `dtype`, and use the same encoding path that we did previously, which by default selects the coarsest units that support integer serialization. This enables storing `"timedelta64[ns]"` values in netCDF3 format, which was not supported by the "literal" encoding approach implemented in #10101. For consistency with the previous `units`-based decoding approach, this update also now enables controlling the decoded resolution of `dtype`-decoded values via the `time_unit` parameter of the `CFTimedeltaCoder` class. By default the `time_unit` parameter is now `None`. If the `time_unit` is `None` the `dtype` attribute determines the `dtype` the data is decoded to; if the `time_unit` is not `None` it takes precedence. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com> Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
1 parent e243e2b commit 516ec07

File tree

4 files changed

+171
-136
lines changed

4 files changed

+171
-136
lines changed

doc/whats-new.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,15 @@ Bug fixes
204204
(:pull:`10352`). By `Spencer Clark <https://github.com/spencerkclark>`_.
205205
- Avoid unsafe casts from float to unsigned int in CFMaskCoder (:issue:`9815`, :pull:`9964`).
206206
By ` Elliott Sales de Andrade <https://github.com/QuLogic>`_.
207+
- Fix attribute overwriting bug when decoding encoded
208+
:py:class:`numpy.timedelta64` values from disk with a dtype attribute
209+
(:issue:`10468`, :pull:`10469`). By `Spencer Clark
210+
<https://github.com/spencerkclark>`_.
211+
- Fix default ``"_FillValue"`` dtype coercion bug when encoding
212+
:py:class:`numpy.timedelta64` values to an on-disk format that only supports
213+
32-bit integers (:issue:`10466`, :pull:`10469`). By `Spencer Clark
214+
<https://github.com/spencerkclark>`_.
215+
207216

208217
Performance
209218
~~~~~~~~~~~

xarray/coding/times.py

Lines changed: 67 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,6 +1410,43 @@ def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
14101410
return isinstance(dtype, str) and dtype.startswith("timedelta64")
14111411

14121412

1413+
def resolve_time_unit_from_attrs_dtype(
1414+
attrs_dtype: str, name: T_Name
1415+
) -> PDDatetimeUnitOptions:
1416+
dtype = np.dtype(attrs_dtype)
1417+
resolution, _ = np.datetime_data(dtype)
1418+
resolution = cast(NPDatetimeUnitOptions, resolution)
1419+
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1420+
time_unit = cast(PDDatetimeUnitOptions, "s")
1421+
message = (
1422+
f"Following pandas, xarray only supports decoding to timedelta64 "
1423+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1424+
f"values for variable {name!r} have a resolution of "
1425+
f"{resolution!r}. Attempting to decode to a resolution of 's'. "
1426+
f"Note, depending on the encoded values, this may lead to an "
1427+
f"OverflowError. Additionally, data will not be identically round "
1428+
f"tripped; xarray will choose an encoding dtype of "
1429+
f"'timedelta64[s]' when re-encoding."
1430+
)
1431+
emit_user_level_warning(message)
1432+
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1433+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1434+
message = (
1435+
f"Following pandas, xarray only supports decoding to timedelta64 "
1436+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1437+
f"values for variable {name!r} have a resolution of "
1438+
f"{resolution!r}. Attempting to decode to a resolution of 'ns'. "
1439+
f"Note, depending on the encoded values, this may lead to loss of "
1440+
f"precision. Additionally, data will not be identically round "
1441+
f"tripped; xarray will choose an encoding dtype of "
1442+
f"'timedelta64[ns]' when re-encoding."
1443+
)
1444+
emit_user_level_warning(message)
1445+
else:
1446+
time_unit = cast(PDDatetimeUnitOptions, resolution)
1447+
return time_unit
1448+
1449+
14131450
class CFTimedeltaCoder(VariableCoder):
14141451
"""Coder for CF Timedelta coding.
14151452
@@ -1430,7 +1467,7 @@ class CFTimedeltaCoder(VariableCoder):
14301467

14311468
def __init__(
14321469
self,
1433-
time_unit: PDDatetimeUnitOptions = "ns",
1470+
time_unit: PDDatetimeUnitOptions | None = None,
14341471
decode_via_units: bool = True,
14351472
decode_via_dtype: bool = True,
14361473
) -> None:
@@ -1442,45 +1479,18 @@ def __init__(
14421479
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
14431480
if np.issubdtype(variable.data.dtype, np.timedelta64):
14441481
dims, data, attrs, encoding = unpack_for_encoding(variable)
1445-
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
1446-
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
1447-
dtype = encoding.get("dtype", None)
1448-
units = encoding.pop("units", None)
1482+
dtype = encoding.get("dtype", None)
1483+
units = encoding.pop("units", None)
14491484

1450-
# in the case of packed data we need to encode into
1451-
# float first, the correct dtype will be established
1452-
# via CFScaleOffsetCoder/CFMaskCoder
1453-
if "add_offset" in encoding or "scale_factor" in encoding:
1454-
dtype = data.dtype if data.dtype.kind == "f" else "float64"
1485+
# in the case of packed data we need to encode into
1486+
# float first, the correct dtype will be established
1487+
# via CFScaleOffsetCoder/CFMaskCoder
1488+
if "add_offset" in encoding or "scale_factor" in encoding:
1489+
dtype = data.dtype if data.dtype.kind == "f" else "float64"
14551490

1456-
else:
1457-
resolution, _ = np.datetime_data(variable.dtype)
1458-
dtype = np.int64
1459-
attrs_dtype = f"timedelta64[{resolution}]"
1460-
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
1461-
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
1462-
# Remove dtype encoding if it exists to prevent it from
1463-
# interfering downstream in NonStringCoder.
1464-
encoding.pop("dtype", None)
1465-
1466-
if any(
1467-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1468-
):
1469-
raise ValueError(
1470-
f"Specifying 'add_offset' or 'scale_factor' is not "
1471-
f"supported when encoding the timedelta64 values of "
1472-
f"variable {name!r} with xarray's new default "
1473-
f"timedelta64 encoding approach. To encode {name!r} "
1474-
f"with xarray's previous timedelta64 encoding "
1475-
f"approach, which supports the 'add_offset' and "
1476-
f"'scale_factor' parameters, additionally set "
1477-
f"encoding['units'] to a unit of time, e.g. "
1478-
f"'seconds'. To proceed with encoding of {name!r} "
1479-
f"via xarray's new approach, remove any encoding "
1480-
f"entries for 'add_offset' or 'scale_factor'."
1481-
)
1482-
if "_FillValue" not in encoding and "missing_value" not in encoding:
1483-
encoding["_FillValue"] = np.iinfo(np.int64).min
1491+
resolution, _ = np.datetime_data(variable.dtype)
1492+
attrs_dtype = f"timedelta64[{resolution}]"
1493+
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
14841494

14851495
data, units = encode_cf_timedelta(data, units, dtype)
14861496
safe_setitem(attrs, "units", units, name=name)
@@ -1499,54 +1509,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
14991509
):
15001510
dims, data, attrs, encoding = unpack_for_decoding(variable)
15011511
units = pop_to(attrs, encoding, "units")
1502-
if is_dtype_decodable and self.decode_via_dtype:
1503-
if any(
1504-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1505-
):
1506-
raise ValueError(
1507-
f"Decoding timedelta64 values via dtype is not "
1508-
f"supported when 'add_offset', or 'scale_factor' are "
1509-
f"present in encoding. Check the encoding parameters "
1510-
f"of variable {name!r}."
1511-
)
1512-
dtype = pop_to(attrs, encoding, "dtype", name=name)
1513-
dtype = np.dtype(dtype)
1514-
resolution, _ = np.datetime_data(dtype)
1515-
resolution = cast(NPDatetimeUnitOptions, resolution)
1516-
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1517-
time_unit = cast(PDDatetimeUnitOptions, "s")
1518-
dtype = np.dtype("timedelta64[s]")
1519-
message = (
1520-
f"Following pandas, xarray only supports decoding to "
1521-
f"timedelta64 values with a resolution of 's', 'ms', "
1522-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1523-
f"have a resolution of {resolution!r}. Attempting to "
1524-
f"decode to a resolution of 's'. Note, depending on "
1525-
f"the encoded values, this may lead to an "
1526-
f"OverflowError. Additionally, data will not be "
1527-
f"identically round tripped; xarray will choose an "
1528-
f"encoding dtype of 'timedelta64[s]' when re-encoding."
1529-
)
1530-
emit_user_level_warning(message)
1531-
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1532-
time_unit = cast(PDDatetimeUnitOptions, "ns")
1533-
dtype = np.dtype("timedelta64[ns]")
1534-
message = (
1535-
f"Following pandas, xarray only supports decoding to "
1536-
f"timedelta64 values with a resolution of 's', 'ms', "
1537-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1538-
f"have a resolution of {resolution!r}. Attempting to "
1539-
f"decode to a resolution of 'ns'. Note, depending on "
1540-
f"the encoded values, this may lead to loss of "
1541-
f"precision. Additionally, data will not be "
1542-
f"identically round tripped; xarray will choose an "
1543-
f"encoding dtype of 'timedelta64[ns]' "
1544-
f"when re-encoding."
1545-
)
1546-
emit_user_level_warning(message)
1512+
if is_dtype_decodable:
1513+
attrs_dtype = attrs.pop("dtype")
1514+
if self.time_unit is None:
1515+
time_unit = resolve_time_unit_from_attrs_dtype(attrs_dtype, name)
15471516
else:
1548-
time_unit = cast(PDDatetimeUnitOptions, resolution)
1549-
elif self.decode_via_units:
1517+
time_unit = self.time_unit
1518+
else:
15501519
if self._emit_decode_timedelta_future_warning:
15511520
emit_user_level_warning(
15521521
"In a future version, xarray will not decode "
@@ -1564,8 +1533,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
15641533
"'CFTimedeltaCoder' instance.",
15651534
FutureWarning,
15661535
)
1567-
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
1568-
time_unit = self.time_unit
1536+
if self.time_unit is None:
1537+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1538+
else:
1539+
time_unit = self.time_unit
1540+
1541+
# Handle edge case that decode_via_dtype=False and
1542+
# decode_via_units=True, and timedeltas were encoded with a
1543+
# dtype attribute. We need to remove the dtype attribute
1544+
# to prevent an error during round tripping.
1545+
if has_timedelta_dtype:
1546+
attrs.pop("dtype")
1547+
1548+
dtype = np.dtype(f"timedelta64[{time_unit}]")
15691549
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
15701550
data = lazy_elemwise_func(data, transform, dtype=dtype)
15711551
return Variable(dims, data, attrs, encoding, fastpath=True)

xarray/tests/test_backends.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from xarray.conventions import encode_dataset_coordinates
5757
from xarray.core import indexing
5858
from xarray.core.options import set_options
59+
from xarray.core.types import PDDatetimeUnitOptions
5960
from xarray.core.utils import module_available
6061
from xarray.namedarray.pycompat import array_type
6162
from xarray.tests import (
@@ -642,6 +643,16 @@ def test_roundtrip_timedelta_data(self) -> None:
642643
) as actual:
643644
assert_identical(expected, actual)
644645

646+
def test_roundtrip_timedelta_data_via_dtype(
647+
self, time_unit: PDDatetimeUnitOptions
648+
) -> None:
649+
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit(time_unit) # type: ignore[arg-type, unused-ignore]
650+
expected = Dataset(
651+
{"td": ("td", time_deltas), "td0": time_deltas[0].to_numpy()}
652+
)
653+
with self.roundtrip(expected) as actual:
654+
assert_identical(expected, actual)
655+
645656
def test_roundtrip_float64_data(self) -> None:
646657
expected = Dataset({"x": ("y", np.array([1.0, 2.0, np.pi], dtype="float64"))})
647658
with self.roundtrip(expected) as actual:

0 commit comments

Comments
 (0)