Skip to content

Commit 0c876e4

Browse files
tomwhite, pre-commit-ci[bot], kmuehlbauer, and dcherian
authored
CF encoding should preserve vlen dtype for empty arrays (#7862)
* CF encoding should preserve vlen dtype for empty arrays
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* preserve vlen string dtype in netcdf4 and zarr backends
* check for h5py-variant ("vlen") in coding.strings.check_vlen_dtype
* add test to check preserving vlen dtype for empty vlen string arrays
* ignore call_overload error for np.dtype("O", metadata={"vlen": str})
* use filter.codec_id instead of private filter._meta as suggested in review
* update comment and add whats-new.rst entry
* fix whats-new.rst
* fix whats-new.rst (missing dot)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
Co-authored-by: Kai Mühlbauer <kmuehlbauer@wradlib.org>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
1 parent 99f9559 commit 0c876e4

File tree

8 files changed

+56
-6
lines changed

8 files changed

+56
-6
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ Bug fixes
5050
By `Mattia Almansi <https://github.com/malmans2>`_.
5151
- Don't call ``CachingFileManager.__del__`` on interpreter shutdown (:issue:`7814`, :pull:`7880`).
5252
By `Justus Magin <https://github.com/keewis>`_.
53+
- Preserve vlen dtype for empty string arrays (:issue:`7328`, :pull:`7862`).
54+
By `Tom White <https://github.com/tomwhite>`_ and `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
5355
- Ensure dtype of reindex result matches dtype of the original DataArray (:issue:`7299`, :pull:`7917`)
5456
By `Anderson Banihirwe <https://github.com/andersy005>`_.
5557

xarray/backends/netCDF4_.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,12 @@ def __init__(self, variable_name, datastore):
6565

6666
dtype = array.dtype
6767
if dtype is str:
68-
# use object dtype because that's the only way in numpy to
69-
# represent variable length strings; it also prevents automatic
70-
# string concatenation via conventions.decode_cf_variable
71-
dtype = np.dtype("O")
68+
# use object dtype (with additional vlen string metadata) because that's
69+
# the only way in numpy to represent variable length strings and to
70+
# check vlen string dtype in further steps
71+
# it also prevents automatic string concatenation via
72+
# conventions.decode_cf_variable
73+
dtype = coding.strings.create_vlen_dtype(str)
7274
self.dtype = dtype
7375

7476
def __setitem__(self, key, value):

xarray/backends/zarr.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,14 @@ def __init__(self, variable_name, datastore):
7070
array = self.get_array()
7171
self.shape = array.shape
7272

73-
dtype = array.dtype
73+
# preserve vlen string object dtype (GH 7328)
74+
if array.filters is not None and any(
75+
[filt.codec_id == "vlen-utf8" for filt in array.filters]
76+
):
77+
dtype = coding.strings.create_vlen_dtype(str)
78+
else:
79+
dtype = array.dtype
80+
7481
self.dtype = dtype
7582

7683
def get_array(self):

xarray/coding/strings.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ def check_vlen_dtype(dtype):
2929
if dtype.kind != "O" or dtype.metadata is None:
3030
return None
3131
else:
32-
return dtype.metadata.get("element_type")
32+
# check xarray (element_type) as well as h5py (vlen)
33+
return dtype.metadata.get("element_type", dtype.metadata.get("vlen"))
3334

3435

3536
def is_unicode_dtype(dtype):

xarray/conventions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable:
108108
if var.dtype.kind == "O":
109109
dims, data, attrs, encoding = _var_as_tuple(var)
110110

111+
# leave vlen dtypes unchanged
112+
if strings.check_vlen_dtype(data.dtype) is not None:
113+
return var
114+
111115
if is_duck_dask_array(data):
112116
warnings.warn(
113117
"variable {} has data in the form of a dask array with "

xarray/tests/test_backends.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
)
4747
from xarray.backends.pydap_ import PydapDataStore
4848
from xarray.backends.scipy_ import ScipyBackendEntrypoint
49+
from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype
4950
from xarray.coding.variables import SerializationWarning
5051
from xarray.conventions import encode_dataset_coordinates
5152
from xarray.core import indexing
@@ -859,6 +860,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
859860
with self.roundtrip(original) as actual:
860861
assert_identical(expected, actual)
861862

863+
def test_roundtrip_empty_vlen_string_array(self) -> None:
864+
# checks preserving vlen dtype for empty arrays GH7862
865+
dtype = create_vlen_dtype(str)
866+
original = Dataset({"a": np.array([], dtype=dtype)})
867+
assert check_vlen_dtype(original["a"].dtype) == str
868+
with self.roundtrip(original) as actual:
869+
assert_identical(original, actual)
870+
assert object == actual["a"].dtype
871+
assert actual["a"].dtype == original["a"].dtype
872+
# only check metadata for capable backends
873+
# eg. NETCDF3 based backends do not roundtrip metadata
874+
if actual["a"].dtype.metadata is not None:
875+
assert check_vlen_dtype(actual["a"].dtype) == str
876+
862877
@pytest.mark.parametrize(
863878
"decoded_fn, encoded_fn",
864879
[

xarray/tests/test_coding_strings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ def test_vlen_dtype() -> None:
3232
assert strings.is_bytes_dtype(dtype)
3333
assert strings.check_vlen_dtype(dtype) is bytes
3434

35+
# check h5py variant ("vlen")
36+
dtype = np.dtype("O", metadata={"vlen": str}) # type: ignore[call-overload]
37+
assert strings.check_vlen_dtype(dtype) is str
38+
3539
assert strings.check_vlen_dtype(np.dtype(object)) is None
3640

3741

xarray/tests/test_conventions.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,3 +487,18 @@ def test_decode_cf_error_includes_variable_name():
487487
ds = Dataset({"invalid": ([], 1e36, {"units": "days since 2000-01-01"})})
488488
with pytest.raises(ValueError, match="Failed to decode variable 'invalid'"):
489489
decode_cf(ds)
490+
491+
492+
def test_encode_cf_variable_with_vlen_dtype() -> None:
493+
v = Variable(
494+
["x"], np.array(["a", "b"], dtype=coding.strings.create_vlen_dtype(str))
495+
)
496+
encoded_v = conventions.encode_cf_variable(v)
497+
assert encoded_v.data.dtype.kind == "O"
498+
assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str
499+
500+
# empty array
501+
v = Variable(["x"], np.array([], dtype=coding.strings.create_vlen_dtype(str)))
502+
encoded_v = conventions.encode_cf_variable(v)
503+
assert encoded_v.data.dtype.kind == "O"
504+
assert coding.strings.check_vlen_dtype(encoded_v.data.dtype) == str

0 commit comments

Comments
 (0)