Skip to content

Commit 4ab0679

Browse files
djhoese, kmuehlbauer, dcherian
authored
Combine UnsignedIntegerCoder and CFMaskCoder (pydata#9274)
* Fix small typo in docstring
* Combine CF Unsigned and Mask handling
* Replace UnsignedIntegerCoder tests with CFMaskCoder usage
* Fix dtype type annotation
* Fix when unsigned serialization warning is expected in tests
* Small refactor of CFMaskCoder decoding
* Add CF encoder tests for _Unsigned=false cases
* Remove UnsignedIntegerCoder from api docs

---------

Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
1 parent 40291ad commit 4ab0679

File tree

5 files changed

+226
-120
lines changed

5 files changed

+226
-120
lines changed

doc/api-hidden.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,6 @@
684684

685685
conventions.decode_cf_variables
686686

687-
coding.variables.UnsignedIntegerCoder
688687
coding.variables.CFMaskCoder
689688
coding.variables.CFScaleOffsetCoder
690689

xarray/coding/variables.py

Lines changed: 131 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ def _is_time_like(units):
261261

262262

263263
def _check_fill_values(attrs, name, dtype):
264-
""" "Check _FillValue and missing_value if available.
264+
"""Check _FillValue and missing_value if available.
265265
266266
Return dictionary with raw fill values and set with encoded fill values.
267267
@@ -298,18 +298,87 @@ def _check_fill_values(attrs, name, dtype):
298298
return raw_fill_dict, encoded_fill_values
299299

300300

301+
def _convert_unsigned_fill_value(
302+
name: T_Name,
303+
data: Any,
304+
unsigned: str,
305+
raw_fill_value: Any,
306+
encoded_fill_values: set,
307+
) -> Any:
308+
if data.dtype.kind == "i":
309+
if unsigned == "true":
310+
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
311+
transform = partial(np.asarray, dtype=unsigned_dtype)
312+
if raw_fill_value is not None:
313+
new_fill = np.array(raw_fill_value, dtype=data.dtype)
314+
encoded_fill_values.remove(raw_fill_value)
315+
# use view here to prevent OverflowError
316+
encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
317+
data = lazy_elemwise_func(data, transform, unsigned_dtype)
318+
elif data.dtype.kind == "u":
319+
if unsigned == "false":
320+
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
321+
transform = partial(np.asarray, dtype=signed_dtype)
322+
data = lazy_elemwise_func(data, transform, signed_dtype)
323+
if raw_fill_value is not None:
324+
new_fill = signed_dtype.type(raw_fill_value)
325+
encoded_fill_values.remove(raw_fill_value)
326+
encoded_fill_values.add(new_fill)
327+
else:
328+
warnings.warn(
329+
f"variable {name!r} has _Unsigned attribute but is not "
330+
"of integer type. Ignoring attribute.",
331+
SerializationWarning,
332+
stacklevel=3,
333+
)
334+
return data
335+
336+
337+
def _encode_unsigned_fill_value(
338+
name: T_Name,
339+
fill_value: Any,
340+
encoded_dtype: np.dtype,
341+
) -> Any:
342+
try:
343+
if hasattr(fill_value, "item"):
344+
# if numpy type, convert to python native integer to determine overflow
345+
# otherwise numpy unsigned ints will silently cast to the signed counterpart
346+
fill_value = fill_value.item()
347+
# passes if provided fill value fits in encoded on-disk type
348+
new_fill = encoded_dtype.type(fill_value)
349+
except OverflowError:
350+
encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned"
351+
warnings.warn(
352+
f"variable {name!r} will be stored as {encoded_kind_str} integers "
353+
f"but _FillValue attribute can't be represented as a "
354+
f"{encoded_kind_str} integer.",
355+
SerializationWarning,
356+
stacklevel=3,
357+
)
358+
# user probably provided the fill as the in-memory dtype,
359+
# convert to on-disk type to match CF standard
360+
orig_kind = "u" if encoded_dtype.kind == "i" else "i"
361+
orig_dtype = np.dtype(f"{orig_kind}{encoded_dtype.itemsize}")
362+
# use view here to prevent OverflowError
363+
new_fill = np.array(fill_value, dtype=orig_dtype).view(encoded_dtype).item()
364+
return new_fill
365+
366+
301367
class CFMaskCoder(VariableCoder):
302368
"""Mask or unmask fill values according to CF conventions."""
303369

304370
def encode(self, variable: Variable, name: T_Name = None):
305371
dims, data, attrs, encoding = unpack_for_encoding(variable)
306372

307373
dtype = np.dtype(encoding.get("dtype", data.dtype))
374+
# from netCDF best practices
375+
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
376+
# "_Unsigned = "true" to indicate that
377+
# integer data should be treated as unsigned"
378+
has_unsigned = encoding.get("_Unsigned") is not None
308379
fv = encoding.get("_FillValue")
309380
mv = encoding.get("missing_value")
310-
# to properly handle _FillValue/missing_value below [a], [b]
311-
# we need to check if unsigned data is written as signed data
312-
unsigned = encoding.get("_Unsigned") is not None
381+
fill_value = None
313382

314383
fv_exists = fv is not None
315384
mv_exists = mv is not None
@@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None):
324393

325394
if fv_exists:
326395
# Ensure _FillValue is cast to same dtype as data's
327-
# [a] need to skip this if _Unsigned is available
328-
if not unsigned:
329-
encoding["_FillValue"] = dtype.type(fv)
396+
encoding["_FillValue"] = (
397+
_encode_unsigned_fill_value(name, fv, dtype)
398+
if has_unsigned
399+
else dtype.type(fv)
400+
)
330401
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
331402

332403
if mv_exists:
333404
# try to use _FillValue, if it exists to align both values
334405
# or use missing_value and ensure it's cast to same dtype as data's
335-
# [b] need to provide mv verbatim if _Unsigned is available
336406
encoding["missing_value"] = attrs.get(
337407
"_FillValue",
338-
(dtype.type(mv) if not unsigned else mv),
408+
(
409+
_encode_unsigned_fill_value(name, mv, dtype)
410+
if has_unsigned
411+
else dtype.type(mv)
412+
),
339413
)
340414
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
341415

342416
# apply fillna
343-
if not pd.isnull(fill_value):
417+
if fill_value is not None and not pd.isnull(fill_value):
344418
# special case DateTime to properly handle NaT
345419
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
346420
data = duck_array_ops.where(
@@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None):
349423
else:
350424
data = duck_array_ops.fillna(data, fill_value)
351425

426+
if fill_value is not None and has_unsigned:
427+
pop_to(encoding, attrs, "_Unsigned")
428+
# XXX: Is this actually needed? Doesn't the backend handle this?
429+
data = duck_array_ops.astype(duck_array_ops.around(data), dtype)
430+
attrs["_FillValue"] = fill_value
431+
352432
return Variable(dims, data, attrs, encoding, fastpath=True)
353433

354434
def decode(self, variable: Variable, name: T_Name = None):
355435
raw_fill_dict, encoded_fill_values = _check_fill_values(
356436
variable.attrs, name, variable.dtype
357437
)
438+
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
439+
return variable
358440

359-
if raw_fill_dict:
360-
dims, data, attrs, encoding = unpack_for_decoding(variable)
361-
[
362-
safe_setitem(encoding, attr, value, name=name)
363-
for attr, value in raw_fill_dict.items()
364-
]
365-
366-
if encoded_fill_values:
367-
# special case DateTime to properly handle NaT
368-
dtype: np.typing.DTypeLike
369-
decoded_fill_value: Any
370-
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
371-
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
441+
dims, data, attrs, encoding = unpack_for_decoding(variable)
442+
443+
# Even if _Unsigned is used, retain the on-disk _FillValue
444+
[
445+
safe_setitem(encoding, attr, value, name=name)
446+
for attr, value in raw_fill_dict.items()
447+
]
448+
449+
if "_Unsigned" in attrs:
450+
unsigned = pop_to(attrs, encoding, "_Unsigned")
451+
data = _convert_unsigned_fill_value(
452+
name,
453+
data,
454+
unsigned,
455+
raw_fill_dict.get("_FillValue"),
456+
encoded_fill_values,
457+
)
458+
459+
if encoded_fill_values:
460+
# special case DateTime to properly handle NaT
461+
dtype: np.typing.DTypeLike
462+
decoded_fill_value: Any
463+
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
464+
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
465+
else:
466+
if "scale_factor" not in attrs and "add_offset" not in attrs:
467+
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
372468
else:
373-
if "scale_factor" not in attrs and "add_offset" not in attrs:
374-
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
375-
else:
376-
dtype, decoded_fill_value = (
377-
_choose_float_dtype(data.dtype, attrs),
378-
np.nan,
379-
)
469+
dtype, decoded_fill_value = (
470+
_choose_float_dtype(data.dtype, attrs),
471+
np.nan,
472+
)
380473

381-
transform = partial(
382-
_apply_mask,
383-
encoded_fill_values=encoded_fill_values,
384-
decoded_fill_value=decoded_fill_value,
385-
dtype=dtype,
386-
)
387-
data = lazy_elemwise_func(data, transform, dtype)
474+
transform = partial(
475+
_apply_mask,
476+
encoded_fill_values=encoded_fill_values,
477+
decoded_fill_value=decoded_fill_value,
478+
dtype=dtype,
479+
)
480+
data = lazy_elemwise_func(data, transform, dtype)
388481

389-
return Variable(dims, data, attrs, encoding, fastpath=True)
390-
else:
391-
return variable
482+
return Variable(dims, data, attrs, encoding, fastpath=True)
392483

393484

394485
def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
@@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
506597
return variable
507598

508599

509-
class UnsignedIntegerCoder(VariableCoder):
510-
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
511-
# from netCDF best practices
512-
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
513-
# "_Unsigned = "true" to indicate that
514-
# integer data should be treated as unsigned"
515-
if variable.encoding.get("_Unsigned", "false") == "true":
516-
dims, data, attrs, encoding = unpack_for_encoding(variable)
517-
518-
pop_to(encoding, attrs, "_Unsigned")
519-
# we need the on-disk type here
520-
# trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
521-
signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
522-
if "_FillValue" in attrs:
523-
try:
524-
# user provided the on-disk signed fill
525-
new_fill = signed_dtype.type(attrs["_FillValue"])
526-
except OverflowError:
527-
# user provided the in-memory unsigned fill, convert to signed type
528-
unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
529-
# use view here to prevent OverflowError
530-
new_fill = (
531-
np.array(attrs["_FillValue"], dtype=unsigned_dtype)
532-
.view(signed_dtype)
533-
.item()
534-
)
535-
attrs["_FillValue"] = new_fill
536-
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
537-
538-
return Variable(dims, data, attrs, encoding, fastpath=True)
539-
else:
540-
return variable
541-
542-
def decode(self, variable: Variable, name: T_Name = None) -> Variable:
543-
if "_Unsigned" in variable.attrs:
544-
dims, data, attrs, encoding = unpack_for_decoding(variable)
545-
unsigned = pop_to(attrs, encoding, "_Unsigned")
546-
547-
if data.dtype.kind == "i":
548-
if unsigned == "true":
549-
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
550-
transform = partial(np.asarray, dtype=unsigned_dtype)
551-
if "_FillValue" in attrs:
552-
new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
553-
# use view here to prevent OverflowError
554-
attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
555-
data = lazy_elemwise_func(data, transform, unsigned_dtype)
556-
elif data.dtype.kind == "u":
557-
if unsigned == "false":
558-
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
559-
transform = partial(np.asarray, dtype=signed_dtype)
560-
data = lazy_elemwise_func(data, transform, signed_dtype)
561-
if "_FillValue" in attrs:
562-
new_fill = signed_dtype.type(attrs["_FillValue"])
563-
attrs["_FillValue"] = new_fill
564-
else:
565-
warnings.warn(
566-
f"variable {name!r} has _Unsigned attribute but is not "
567-
"of integer type. Ignoring attribute.",
568-
SerializationWarning,
569-
stacklevel=3,
570-
)
571-
572-
return Variable(dims, data, attrs, encoding, fastpath=True)
573-
else:
574-
return variable
575-
576-
577600
class DefaultFillvalueCoder(VariableCoder):
578601
"""Encode default _FillValue if needed."""
579602

xarray/conventions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,6 @@ def encode_cf_variable(
187187
times.CFTimedeltaCoder(),
188188
variables.CFScaleOffsetCoder(),
189189
variables.CFMaskCoder(),
190-
variables.UnsignedIntegerCoder(),
191190
variables.NativeEnumCoder(),
192191
variables.NonStringCoder(),
193192
variables.DefaultFillvalueCoder(),
@@ -279,7 +278,6 @@ def decode_cf_variable(
279278

280279
if mask_and_scale:
281280
for coder in [
282-
variables.UnsignedIntegerCoder(),
283281
variables.CFMaskCoder(),
284282
variables.CFScaleOffsetCoder(),
285283
]:

0 commit comments

Comments
 (0)