From 61e5ebd13ee698d92324593ec8ef4393e205c8e0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 4 Sep 2023 11:55:29 +0200 Subject: [PATCH 1/4] add option to deactivate multi-index dim coord --- xarray/core/indexes.py | 9 ++++++++- xarray/core/options.py | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index b5e396963a1..f37d2aab5d9 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -15,6 +15,7 @@ PandasIndexingAdapter, PandasMultiIndexingAdapter, ) +from xarray.core.options import OPTIONS from xarray.core.utils import ( Frozen, emit_user_level_warning, @@ -1123,7 +1124,13 @@ def create_variables( variables = {} index_vars: IndexVars = {} - for name in (self.dim,) + self.index.names: + + if OPTIONS["future_no_mindex_dim_coord"]: + names = self.index.names + else: + names = (self.dim,) + self.index.names + + for name in names: if name == self.dim: level = None dtype = None diff --git a/xarray/core/options.py b/xarray/core/options.py index eb0c56c7ee0..c8bb6d530de 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -26,6 +26,7 @@ "display_default_indexes", "enable_cftimeindex", "file_cache_maxsize", + "future_no_mindex_dim_coord", "keep_attrs", "warn_for_unclosed_files", "use_bottleneck", @@ -48,6 +49,7 @@ class T_Options(TypedDict): display_default_indexes: Literal["default", True, False] enable_cftimeindex: bool file_cache_maxsize: int + future_no_mindex_dim_coord: bool keep_attrs: Literal["default", True, False] warn_for_unclosed_files: bool use_bottleneck: bool @@ -70,6 +72,7 @@ class T_Options(TypedDict): "display_default_indexes": False, "enable_cftimeindex": True, "file_cache_maxsize": 128, + "future_no_mindex_dim_coord": False, "keep_attrs": "default", "warn_for_unclosed_files": False, "use_bottleneck": True, @@ -98,6 +101,7 @@ def _positive_integer(value: int) -> bool: "display_default_indexes": lambda choice: choice in [True, False, "default"], "enable_cftimeindex": lambda value: isinstance(value, bool), "file_cache_maxsize": _positive_integer, + "future_no_mindex_dim_coord": lambda value: isinstance(value, bool), "keep_attrs": lambda choice: choice in [True, False, "default"], "use_bottleneck": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), @@ -218,6 +222,10 @@ class set_options: global least-recently-usage cached. This should be smaller than your system's per-process file descriptor limit, e.g., ``ulimit -n`` on Linux. + future_no_mindex_dim_coord : bool, default: False + If True, no dimension coordinate with tuple values is be created + for a pandas multi-index. This is currently deactivated but + it will be the default behavior in the future. keep_attrs : {"default", True, False} Whether to keep attributes on xarray Datasets/dataarrays after operations. Can be From ece9c21291b9bbde806cb1634d4c592afe725ed9 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 4 Sep 2023 14:04:54 +0200 Subject: [PATCH 2/4] adapt stack and drop_vars --- xarray/core/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f1a0cb9dc34..916d6656c8d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5132,7 +5132,8 @@ def _stack_once( if len(product_vars) == len(dims): idx = index_cls.stack(product_vars, new_dim) - new_indexes[new_dim] = idx + if not OPTIONS["future_no_mindex_dim_coord"]: + new_indexes[new_dim] = idx new_indexes.update({k: idx for k in product_vars}) idx_vars = idx.create_variables(product_vars) # keep consistent multi-index coordinate order @@ -5779,7 +5780,7 @@ def drop_vars( for var in names: maybe_midx = self._indexes.get(var, None) if isinstance(maybe_midx, PandasMultiIndex): - idx_coord_names = set(maybe_midx.index.names + [maybe_midx.dim]) + idx_coord_names = set(self.xindexes.get_all_coords(var)) idx_other_names = idx_coord_names - set(names) other_names.update(idx_other_names) if other_names: From 2058008f88c97d0fd1826c07a916918c252de37a Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 4 Sep 2023 14:07:20 +0200 Subject: [PATCH 3/4] Deprecate reorder_levels - This is rather an edge case: same result can be done with a couple of extra steps (update the pandas index directly and re-assign it) - API won't be compatible with possibly several multi-indexes along the same dimension with no dimension coordinate --- xarray/core/dataset.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 916d6656c8d..b73d7c9f646 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -5008,6 +5008,15 @@ def reorder_levels( Another dataset, with this dataset's data but replaced coordinates. """ + warnings.warn( + "The `reorder_levels` method will be removed in the future, " + "when dimension coordinates with a PandasMultiIndex are removed.\n" + "Instead of `ds.reorder_levels(x=['two', 'one'])`, you could do:\n" + "`new_midx = ds.xindexes['one'].index.reorder_levels(['two', 'one'])`\n" + "`ds.assign_coords(xr.Coordinates.from_pandas_multiindex(new_midx, 'x'))`", + FutureWarning, + stacklevel=2, + ) dim_order = either_dict_or_kwargs(dim_order, dim_order_kwargs, "reorder_levels") variables = self._variables.copy() indexes = dict(self._indexes) From 87d5bf72e766b101db32dc65e6a79957368812ee Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 4 Sep 2023 14:09:53 +0200 Subject: [PATCH 4/4] started updating the tests --- conftest.py | 9 +++ xarray/tests/test_dataset.py | 116 +++++++++++++++++++++-------------- 2 files changed, 79 insertions(+), 46 deletions(-) diff --git a/conftest.py b/conftest.py index 862a1a1d0bc..e620efb5f25 100644 --- a/conftest.py +++ b/conftest.py @@ -2,6 +2,8 @@ import pytest +from xarray import set_options + def pytest_addoption(parser): """Add command-line flags for pytest.""" @@ -39,3 +41,10 @@ def add_standard_imports(doctest_namespace, tmpdir): # always switch to the temporary directory, so files get written there tmpdir.chdir() + + +@pytest.fixture(scope="function", params=[True, False]) +def mindex_dim_coord(request): + set_options(future_no_mindex_dim_coord=not request.param) + yield request.param + set_options(future_no_mindex_dim_coord=request.param) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e119cfe9bc6..f7bbd93422d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,7 +8,7 @@ from copy import copy, deepcopy from io import StringIO from textwrap import dedent -from typing import Any, Literal +from typing import Any, Literal, cast import numpy as np import pandas as pd @@ -32,7 +32,7 @@ from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import Coordinates, DatasetCoordinates -from xarray.core.indexes import Index, PandasIndex +from xarray.core.indexes import Index, PandasIndex, PandasMultiIndex from xarray.core.pycompat import array_type, integer_types from xarray.core.utils import is_scalar from xarray.testing import _assert_internal_invariants @@ -335,43 +335,58 @@ def test_repr(self) -> None: data = Dataset(attrs={"foo": "bar" * 1000}) assert len(repr(data)) < 1000 - def test_repr_multiindex(self) -> None: + def test_repr_multiindex(self, mindex_dim_coord) -> None: data = create_test_multiindex() - expected = dedent( - """\ - - Dimensions: (x: 4) - Coordinates: - * x (x) object MultiIndex - * level_1 (x) object 'a' 'a' 'b' 'b' - * level_2 (x) int64 1 2 1 2 - Data variables: - *empty*""" - ) - actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) - print(actual) - assert expected == actual + if mindex_dim_coord: + expected = dedent( + """\ + + Dimensions: (x: 4) + Coordinates: + * x (x) object MultiIndex + * level_1 (x) object 'a' 'a' 'b' 'b' + * level_2 (x) int64 1 2 1 2 + Data variables: + *empty*""" + ) + else: + expected = dedent( + """\ + + Dimensions: (x: 4) + Coordinates: + * level_1 (x) object 'a' 'a' 'b' 'b' + * level_2 (x) int64 1 2 1 2 + Dimensions without coordinates: x + Data variables: + *empty*""" + ) - # verify that long level names are not truncated - mindex = pd.MultiIndex.from_product( - [["a", "b"], [1, 2]], names=("a_quite_long_level_name", "level_2") - ) - data = Dataset({}, {"x": mindex}) - expected = dedent( - """\ - - Dimensions: (x: 4) - Coordinates: - * x (x) object MultiIndex - * a_quite_long_level_name (x) object 'a' 'a' 'b' 'b' - * level_2 (x) int64 1 2 1 2 - Data variables: - *empty*""" - ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) assert expected == actual + if not mindex_dim_coord: + # verify that long level names are not truncated + mindex = pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=("a_quite_long_level_name", "level_2") + ) + data = Dataset({}, {"x": mindex}) + expected = dedent( + """\ + + Dimensions: (x: 4) + Coordinates: + * a_quite_long_level_name (x) object 'a' 'a' 'b' 'b' + * level_2 (x) int64 1 2 1 2 + Dimensions without coordinates: x + Data variables: + *empty*""" + ) + actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) + print(actual) + assert expected == actual + def test_repr_period_index(self) -> None: data = create_test_data(seed=456) data.coords["time"] = pd.period_range("2000-01-01", periods=20, freq="B") @@ -2644,8 +2659,9 @@ def test_drop_variables(self) -> None: assert_identical(expected, actual) def test_drop_multiindex_level(self) -> None: - data = create_test_multiindex() - expected = data.drop_vars(["x", "level_1", "level_2"]) + with set_options(future_no_mindex_dim_coord=True): + data = create_test_multiindex() + expected = data.drop_vars(["level_1", "level_2"]) with pytest.warns(DeprecationWarning): actual = data.drop_vars("level_1") assert_identical(expected, actual) @@ -3377,15 +3393,15 @@ def test_expand_dims_kwargs_python36plus(self) -> None: ) assert_identical(other_way_expected, other_way) - def test_set_index(self) -> None: + def test_set_index(self, mindex_dim_coord) -> None: expected = create_test_multiindex() - mindex = expected["x"].to_index() + mindex = cast(PandasMultiIndex, expected.xindexes["level_1"]).index indexes = [mindex.get_level_values(n) for n in mindex.names] coords = {idx.name: ("x", idx) for idx in indexes} ds = Dataset({}, coords=coords) obj = ds.set_index(x=mindex.names) - assert_identical(obj, expected) + assert_identical(obj, expected, check_default_indexes=False) # ensure pre-existing indexes involved are removed # (level_2 should be a coordinate with no index) @@ -3434,14 +3450,17 @@ def test_set_index_deindexed_coords(self) -> None: ) assert_identical(actual, expected) - def test_reset_index(self) -> None: + def test_reset_index(self, mindex_dim_coord) -> None: ds = create_test_multiindex() - mindex = ds["x"].to_index() + mindex = cast(PandasMultiIndex, ds.xindexes["level_1"]).index indexes = [mindex.get_level_values(n) for n in mindex.names] coords = {idx.name: ("x", idx) for idx in indexes} expected = Dataset({}, coords=coords) - obj = ds.reset_index("x") + if mindex_dim_coord: + obj = ds.reset_index("x") + else: + obj = ds.reset_index(["level_1", "level_2"]) assert_identical(obj, expected, check_default_indexes=False) assert len(obj.xindexes) == 0 @@ -3475,12 +3494,15 @@ def test_reset_index_drop_dims(self) -> None: ], ) def test_reset_index_drop_convert( - self, arg, drop, dropped, converted, renamed + self, arg, drop, dropped, converted, renamed, mindex_dim_coord ) -> None: # regressions https://github.com/pydata/xarray/issues/6946 and # https://github.com/pydata/xarray/issues/6989 # check that multi-index dimension or level coordinates are dropped, converted # from IndexVariable to Variable or renamed to dimension as expected + if arg == "x" or "x" in arg and not mindex_dim_coord: + return + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("foo", "bar")) ds = xr.Dataset(coords={"x": midx}) reset = ds.reset_index(arg, drop=drop) @@ -3494,7 +3516,7 @@ def test_reset_index_drop_convert( def test_reorder_levels(self) -> None: ds = create_test_multiindex() - mindex = ds["x"].to_index() + mindex = cast(PandasMultiIndex, ds.xindexes["level_1"]).index midx = mindex.reorder_levels(["level_2", "level_1"]) expected = Dataset({}, coords={"x": midx}) @@ -3502,12 +3524,14 @@ def test_reorder_levels(self) -> None: ds["level_1"].attrs["foo"] = "bar" expected["level_1"].attrs["foo"] = "bar" - reindexed = ds.reorder_levels(x=["level_2", "level_1"]) + with pytest.warns(FutureWarning, match=r"will be removed"): + reindexed = ds.reorder_levels(x=["level_2", "level_1"]) assert_identical(reindexed, expected) ds = Dataset({}, coords={"x": [1, 2]}) - with pytest.raises(ValueError, match=r"has no MultiIndex"): - ds.reorder_levels(x=["level_1", "level_2"]) + with pytest.warns(FutureWarning, match=r"will be removed"): + with pytest.raises(ValueError, match=r"has no MultiIndex"): + ds.reorder_levels(x=["level_1", "level_2"]) def test_set_xindex(self) -> None: ds = Dataset(