Skip to content

Commit f20eb7a

Browse files
spencerkclark, pre-commit-ci[bot], dcherian, kmuehlbauer
authored
Implement literal np.timedelta64 coding (#10101)
* Proof of concept literal timedelta64 coding
* Ensure test_roundtrip_timedelta_data test uses old encoding pathway
* Remove no longer relevant test
* Include units attribute
* Move coder to times.py
* Add what's new entry
* Restore test and reduce diff
* Fix typing
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* Fix doctests
* Restore original order of encoders
* Add return types to tests
* Move everything to CFTimedeltaCoder; reuse code where possible
* Fix mypy
* Use Kai's offset and scale_factor logic for all encoding
* Fix bad merge
* Forbid mixing other encoding with literal timedelta64 encoding
* Expose fine-grained control over decoding pathways
* Rename test
* Use consistent dtype spelling
* Continue supporting non-timedelta dtype-only encoding
* Fix example attribute in docstring
* Update what's new
* Fix typo
* Complete test
* Fix docstring
* Support _FillValue or missing_value encoding
* Tweak errors and warnings; relax decoding dtype error
* Add xfail test for fine-resolution branch of non-pandas resolution code
* Fix typing
* Revert "Fix typing" This reverts commit 0929ec4.
* Use simpler typing fix for now
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
1 parent c4b3f1c commit f20eb7a

File tree

5 files changed

+335
-29
lines changed

5 files changed

+335
-29
lines changed

doc/whats-new.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ Alban Farchi, Andrecho, Benoit Bovy, Deepak Cherian, Dimitri Papadopoulos Orfano
7171

7272
New Features
7373
~~~~~~~~~~~~
74+
- By default xarray now encodes :py:class:`numpy.timedelta64` values by
75+
converting to :py:class:`numpy.int64` values and storing ``"dtype"`` and
76+
``"units"`` attributes consistent with the dtype of the in-memory
77+
:py:class:`numpy.timedelta64` values, e.g. ``"timedelta64[s]"`` and
78+
``"seconds"`` for second-resolution timedeltas. These values will always be
79+
decoded to timedeltas without a warning moving forward. Timedeltas encoded
80+
via the previous approach can still be roundtripped exactly, but in the
81+
future will not be decoded by default (:issue:`1621`, :issue:`10099`,
82+
:pull:`10101`). By `Spencer Clark <https://github.com/spencerkclark>`_.
7483

7584
- Added `scipy-stubs <https://github.com/scipy/scipy-stubs>`_ to the ``xarray[types]`` dependencies.
7685
By `Joren Hammudoglu <https://github.com/jorenham>`_.

xarray/coding/times.py

Lines changed: 138 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@
9292
)
9393

9494

95+
_INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS = [
96+
"add_offset",
97+
"scale_factor",
98+
]
99+
100+
95101
def _is_standard_calendar(calendar: str) -> bool:
96102
return calendar.lower() in _STANDARD_CALENDARS
97103

@@ -1403,62 +1409,169 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
14031409
return variable
14041410

14051411

1412+
def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
1413+
dtype = attrs_or_encoding.get("dtype", None)
1414+
return isinstance(dtype, str) and dtype.startswith("timedelta64")
1415+
1416+
14061417
class CFTimedeltaCoder(VariableCoder):
14071418
"""Coder for CF Timedelta coding.
14081419
14091420
Parameters
14101421
----------
14111422
time_unit : PDDatetimeUnitOptions
1412-
Target resolution when decoding timedeltas. Defaults to "ns".
1423+
Target resolution when decoding timedeltas via units. Defaults to "ns".
1424+
When decoding via dtype, the resolution is specified in the dtype
1425+
attribute, so this parameter is ignored.
1426+
decode_via_units : bool
1427+
Whether to decode timedeltas based on the presence of a timedelta-like
1428+
units attribute, e.g. "seconds". Defaults to True, but in the future
1429+
will default to False.
1430+
decode_via_dtype : bool
1431+
Whether to decode timedeltas based on the presence of a np.timedelta64
1432+
dtype attribute, e.g. "timedelta64[s]". Defaults to True.
14131433
"""
14141434

14151435
def __init__(
14161436
self,
14171437
time_unit: PDDatetimeUnitOptions = "ns",
1438+
decode_via_units: bool = True,
1439+
decode_via_dtype: bool = True,
14181440
) -> None:
14191441
self.time_unit = time_unit
1442+
self.decode_via_units = decode_via_units
1443+
self.decode_via_dtype = decode_via_dtype
14201444
self._emit_decode_timedelta_future_warning = False
14211445

14221446
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
14231447
if np.issubdtype(variable.data.dtype, np.timedelta64):
14241448
dims, data, attrs, encoding = unpack_for_encoding(variable)
1449+
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
1450+
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
1451+
dtype = encoding.get("dtype", None)
1452+
units = encoding.pop("units", None)
14251453

1426-
dtype = encoding.get("dtype", None)
1427-
1428-
# in the case of packed data we need to encode into
1429-
# float first, the correct dtype will be established
1430-
# via CFScaleOffsetCoder/CFMaskCoder
1431-
if "add_offset" in encoding or "scale_factor" in encoding:
1432-
dtype = data.dtype if data.dtype.kind == "f" else "float64"
1454+
# in the case of packed data we need to encode into
1455+
# float first, the correct dtype will be established
1456+
# via CFScaleOffsetCoder/CFMaskCoder
1457+
if "add_offset" in encoding or "scale_factor" in encoding:
1458+
dtype = data.dtype if data.dtype.kind == "f" else "float64"
14331459

1434-
data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype)
1460+
else:
1461+
resolution, _ = np.datetime_data(variable.dtype)
1462+
dtype = np.int64
1463+
attrs_dtype = f"timedelta64[{resolution}]"
1464+
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
1465+
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
1466+
# Remove dtype encoding if it exists to prevent it from
1467+
# interfering downstream in NonStringCoder.
1468+
encoding.pop("dtype", None)
1469+
1470+
if any(
1471+
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1472+
):
1473+
raise ValueError(
1474+
f"Specifying 'add_offset' or 'scale_factor' is not "
1475+
f"supported when encoding the timedelta64 values of "
1476+
f"variable {name!r} with xarray's new default "
1477+
f"timedelta64 encoding approach. To encode {name!r} "
1478+
f"with xarray's previous timedelta64 encoding "
1479+
f"approach, which supports the 'add_offset' and "
1480+
f"'scale_factor' parameters, additionally set "
1481+
f"encoding['units'] to a unit of time, e.g. "
1482+
f"'seconds'. To proceed with encoding of {name!r} "
1483+
f"via xarray's new approach, remove any encoding "
1484+
f"entries for 'add_offset' or 'scale_factor'."
1485+
)
1486+
if "_FillValue" not in encoding and "missing_value" not in encoding:
1487+
encoding["_FillValue"] = np.iinfo(np.int64).min
14351488

1489+
data, units = encode_cf_timedelta(data, units, dtype)
14361490
safe_setitem(attrs, "units", units, name=name)
1437-
14381491
return Variable(dims, data, attrs, encoding, fastpath=True)
14391492
else:
14401493
return variable
14411494

14421495
def decode(self, variable: Variable, name: T_Name = None) -> Variable:
14431496
units = variable.attrs.get("units", None)
1444-
if isinstance(units, str) and units in TIME_UNITS:
1445-
if self._emit_decode_timedelta_future_warning:
1446-
emit_user_level_warning(
1447-
"In a future version of xarray decode_timedelta will "
1448-
"default to False rather than None. To silence this "
1449-
"warning, set decode_timedelta to True, False, or a "
1450-
"'CFTimedeltaCoder' instance.",
1451-
FutureWarning,
1452-
)
1497+
has_timedelta_units = isinstance(units, str) and units in TIME_UNITS
1498+
has_timedelta_dtype = has_timedelta64_encoding_dtype(variable.attrs)
1499+
is_dtype_decodable = has_timedelta_units and has_timedelta_dtype
1500+
is_units_decodable = has_timedelta_units
1501+
if (is_dtype_decodable and self.decode_via_dtype) or (
1502+
is_units_decodable and self.decode_via_units
1503+
):
14531504
dims, data, attrs, encoding = unpack_for_decoding(variable)
1454-
14551505
units = pop_to(attrs, encoding, "units")
1456-
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
1457-
transform = partial(
1458-
decode_cf_timedelta, units=units, time_unit=self.time_unit
1459-
)
1506+
if is_dtype_decodable and self.decode_via_dtype:
1507+
if any(
1508+
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1509+
):
1510+
raise ValueError(
1511+
f"Decoding timedelta64 values via dtype is not "
1512+
f"supported when 'add_offset', or 'scale_factor' are "
1513+
f"present in encoding. Check the encoding parameters "
1514+
f"of variable {name!r}."
1515+
)
1516+
dtype = pop_to(attrs, encoding, "dtype", name=name)
1517+
dtype = np.dtype(dtype)
1518+
resolution, _ = np.datetime_data(dtype)
1519+
resolution = cast(NPDatetimeUnitOptions, resolution)
1520+
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1521+
time_unit = cast(PDDatetimeUnitOptions, "s")
1522+
dtype = np.dtype("timedelta64[s]")
1523+
message = (
1524+
f"Following pandas, xarray only supports decoding to "
1525+
f"timedelta64 values with a resolution of 's', 'ms', "
1526+
f"'us', or 'ns'. Encoded values for variable {name!r} "
1527+
f"have a resolution of {resolution!r}. Attempting to "
1528+
f"decode to a resolution of 's'. Note, depending on "
1529+
f"the encoded values, this may lead to an "
1530+
f"OverflowError. Additionally, data will not be "
1531+
f"identically round tripped; xarray will choose an "
1532+
f"encoding dtype of 'timedelta64[s]' when re-encoding."
1533+
)
1534+
emit_user_level_warning(message)
1535+
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1536+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1537+
dtype = np.dtype("timedelta64[ns]")
1538+
message = (
1539+
f"Following pandas, xarray only supports decoding to "
1540+
f"timedelta64 values with a resolution of 's', 'ms', "
1541+
f"'us', or 'ns'. Encoded values for variable {name!r} "
1542+
f"have a resolution of {resolution!r}. Attempting to "
1543+
f"decode to a resolution of 'ns'. Note, depending on "
1544+
f"the encoded values, this may lead to loss of "
1545+
f"precision. Additionally, data will not be "
1546+
f"identically round tripped; xarray will choose an "
1547+
f"encoding dtype of 'timedelta64[ns]' "
1548+
f"when re-encoding."
1549+
)
1550+
emit_user_level_warning(message)
1551+
else:
1552+
time_unit = cast(PDDatetimeUnitOptions, resolution)
1553+
elif self.decode_via_units:
1554+
if self._emit_decode_timedelta_future_warning:
1555+
emit_user_level_warning(
1556+
"In a future version, xarray will not decode "
1557+
"timedelta values based on the presence of a "
1558+
"timedelta-like units attribute by default. Instead "
1559+
"it will rely on the presence of a timedelta64 dtype "
1560+
"attribute, which is now xarray's default way of "
1561+
"encoding timedelta64 values. To continue decoding "
1562+
"timedeltas based on the presence of a timedelta-like "
1563+
"units attribute, users will need to explicitly "
1564+
"opt-in by passing True or "
1565+
"CFTimedeltaCoder(decode_via_units=True) to "
1566+
"decode_timedelta. To silence this warning, set "
1567+
"decode_timedelta to True, False, or a "
1568+
"'CFTimedeltaCoder' instance.",
1569+
FutureWarning,
1570+
)
1571+
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
1572+
time_unit = self.time_unit
1573+
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
14601574
data = lazy_elemwise_func(data, transform, dtype=dtype)
1461-
14621575
return Variable(dims, data, attrs, encoding, fastpath=True)
14631576
else:
14641577
return variable

xarray/conventions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,10 @@ def decode_cf_variable(
204204
var = coder.decode(var, name=name)
205205

206206
if decode_timedelta:
207-
if not isinstance(decode_timedelta, CFTimedeltaCoder):
208-
decode_timedelta = CFTimedeltaCoder()
207+
if isinstance(decode_timedelta, bool):
208+
decode_timedelta = CFTimedeltaCoder(
209+
decode_via_units=decode_timedelta, decode_via_dtype=decode_timedelta
210+
)
209211
decode_timedelta._emit_decode_timedelta_future_warning = (
210212
decode_timedelta_was_none
211213
)

xarray/tests/test_backends.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,10 @@ def test_roundtrip_timedelta_data(self) -> None:
635635
# though we cannot test that until we fix the timedelta decoding
636636
# to support large ranges
637637
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
638+
encoding = {"units": "seconds"}
638639
expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]})
640+
expected["td"].encoding = encoding
641+
expected["td0"].encoding = encoding
639642
with self.roundtrip(
640643
expected, open_kwargs={"decode_timedelta": CFTimedeltaCoder(time_unit="ns")}
641644
) as actual:

0 commit comments

Comments
 (0)