-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Avoid auto creation of indexes in concat #8872
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
22995e9
7142c9f
7fb075a
285c1de
cc24757
90a2592
beb665a
22f361d
55166fc
322b76e
da6692b
35dfb67
fd3de2b
0a60172
21afbb1
6e9ead6
2534712
3e848eb
95d453c
ba5627e
3719ba7
018e74b
f680505
f10509a
8152c0a
aa813cf
e78de7d
b1329cc
a9f7e0c
2ce3dec
87a08b4
e0c6db1
214ed7d
78d2798
fc206b0
ce797f1
62e750f
f86c82f
4dd8d3c
d5d90fd
e00dbab
5bb88b8
1d471b1
766605d
971287f
10c0ed5
51eea5d
bde9f2b
d5241ce
7e8f895
ed85446
39571ba
206985b
86998e4
5894724
dad9433
b235c09
36a2223
e17c13f
e8fa857
648d5bc
deb292c
6dd57a9
b0e3612
b2f06a0
ff70fc7
2f97a5c
6d825e5
b88b5a6
30c7408
b243150
9e9e168
dca2fb9
c979672
ac27ce0
d73ac48
9ebbb33
d1b656d
ac998e9
0849b94
25764ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
|
||
from xarray.core import dtypes, utils | ||
from xarray.core.alignment import align, reindex_variables | ||
from xarray.core.coordinates import Coordinates | ||
from xarray.core.duck_array_ops import lazy_array_equiv | ||
from xarray.core.indexes import Index, PandasIndex | ||
from xarray.core.merge import ( | ||
|
@@ -646,14 +647,26 @@ def get_indexes(name): | |
# preserves original variable order | ||
result_vars[name] = result_vars.pop(name) | ||
|
||
result = type(datasets[0])(result_vars, attrs=result_attrs) | ||
|
||
absent_coord_names = coord_names - set(result.variables) | ||
absent_coord_names = coord_names - set(result_vars) | ||
if absent_coord_names: | ||
raise ValueError( | ||
f"Variables {absent_coord_names!r} are coordinates in some datasets but not others." | ||
) | ||
result = result.set_coords(coord_names) | ||
coord_vars = { | ||
name: result_var | ||
for name, result_var in result_vars.items() | ||
if name in coord_names | ||
} | ||
coords = Coordinates(coord_vars, indexes=result_indexes) | ||
|
||
# TODO: this is just the complement of the set of coord_vars | ||
result_data_vars = { | ||
name: result_var | ||
for name, result_var in result_vars.items() | ||
if name not in coord_names | ||
} | ||
|
||
result = type(datasets[0])(result_data_vars, coords=coords, attrs=result_attrs) | ||
result.encoding = result_encoding | ||
|
||
result = result.drop_vars(unlabeled_dims, errors="ignore") | ||
|
@@ -665,10 +678,6 @@ def get_indexes(name): | |
else: | ||
index_vars = index.create_variables() | ||
result[dim] = index_vars[dim] | ||
result_indexes[dim] = index | ||
|
||
# TODO: add indexes at Dataset creation (when it is supported) | ||
result = result._overwrite_indexes(result_indexes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing those lines doesn't break any existing test because the line above It would be better to explicitly add both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @benbovy. I think your comment might explain the behaviour I just noticed in https://github.com/TomNicholas/VirtualiZarr/issues/18#issuecomment-2023955860 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I've addressed your comment now @benbovy |
||
|
||
return result | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
from xarray.core.coordinates import Coordinates | ||
from xarray.core.indexes import PandasIndex | ||
from xarray.tests import ( | ||
ConcatenatableArray, | ||
InaccessibleArray, | ||
assert_array_equal, | ||
assert_equal, | ||
|
@@ -978,6 +979,34 @@ def test_concat_str_dtype(self, dtype, dim) -> None: | |
|
||
assert np.issubdtype(actual.x2.dtype, dtype) | ||
|
||
def test_concat_avoids_index_auto_creation(self) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reminder to myself: The reason this test didn't catch the problem described in https://github.com/TomNicholas/VirtualiZarr/issues/18#issuecomment-2023955860 is because this test checks that concatenating datasets that start without indexes stay without indexes, whereas that problem is from concatenating datasets with indexes but having the coordinate variables be silently replaced by |
||
# TODO once passing indexes={} directly to Dataset constructor is allowed then no need to create coords first | ||
coords = Coordinates( | ||
{"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} | ||
) | ||
datasets = [ | ||
Dataset( | ||
{"a": (["x", "y"], ConcatenatableArray(np.zeros((3, 3))))}, | ||
coords=coords, | ||
) | ||
for _ in range(2) | ||
] | ||
# should not raise on concat | ||
combined = concat(datasets, dim="x") | ||
assert combined["a"].shape == (6, 3) | ||
assert combined["a"].dims == ("x", "y") | ||
|
||
# nor have auto-created any indexes | ||
assert combined.indexes == {} | ||
|
||
# should not raise on stack | ||
combined = concat(datasets, dim="z") | ||
assert combined["a"].shape == (2, 3, 3) | ||
assert combined["a"].dims == ("z", "x", "y") | ||
|
||
# nor have auto-created any indexes | ||
assert combined.indexes == {} | ||
|
||
|
||
class TestConcatDataArray: | ||
def test_concat(self) -> None: | ||
|
@@ -1051,6 +1080,35 @@ def test_concat_lazy(self) -> None: | |
assert combined.shape == (2, 3, 3) | ||
assert combined.dims == ("z", "x", "y") | ||
|
||
def test_concat_avoids_index_auto_creation(self) -> None: | ||
# TODO once passing indexes={} directly to DataArray constructor is allowed then no need to create coords first | ||
coords = Coordinates( | ||
{"x": ConcatenatableArray(np.array([1, 2, 3]))}, indexes={} | ||
) | ||
arrays = [ | ||
DataArray( | ||
ConcatenatableArray(np.zeros((3, 3))), | ||
dims=["x", "y"], | ||
coords=coords, | ||
) | ||
for _ in range(2) | ||
] | ||
# should not raise on concat | ||
combined = concat(arrays, dim="x") | ||
assert combined.shape == (6, 3) | ||
assert combined.dims == ("x", "y") | ||
|
||
# nor have auto-created any indexes | ||
assert combined.indexes == {} | ||
|
||
# should not raise on stack | ||
combined = concat(arrays, dim="z") | ||
assert combined.shape == (2, 3, 3) | ||
assert combined.dims == ("z", "x", "y") | ||
|
||
# nor have auto-created any indexes | ||
assert combined.indexes == {} | ||
|
||
@pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) | ||
def test_concat_fill_value(self, fill_value) -> None: | ||
foo = DataArray([1, 2], coords=[("x", [1, 2])]) | ||
|
Uh oh!
There was an error while loading. Please reload this page.