From bc94c6de33f0972716450e716121f4f33472b9f0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 11:19:31 +0200 Subject: [PATCH 01/25] clean-up indexing.PandasIndexingAdapter typing --- xarray/core/indexing.py | 55 +++++------------------------------------ 1 file changed, 6 insertions(+), 49 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c1b847202c7..d973eb12ac8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1846,29 +1846,13 @@ def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: return key - def _handle_result( - self, result: Any - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _handle_result(self, result: Any) -> PandasIndexingAdapter | np.ndarray: if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: key = self._prepare_key(indexer.tuple) if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional @@ -1881,13 +1865,7 @@ def _oindex_get( def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) key = self._prepare_key(indexer.tuple) @@ -1901,13 +1879,7 @@ def _vindex_get( def __getitem__( self, indexer: ExplicitIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: key = self._prepare_key(indexer.tuple) if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional @@ -1987,15 
+1959,7 @@ def _convert_scalar(self, item): item = item[idx] return super()._convert_scalar(item) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: result = super()._oindex_get(indexer) if isinstance(result, type(self)): result.level = self.level @@ -2003,13 +1967,7 @@ def _oindex_get( def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: result = super()._vindex_get(indexer) if isinstance(result, type(self)): result.level = self.level @@ -2019,7 +1977,6 @@ def __getitem__(self, indexer: ExplicitIndexer): result = super().__getitem__(indexer) if isinstance(result, type(self)): result.level = self.level - return result def __repr__(self) -> str: From 17ff7e9465ccc627bad2cf08e841deacf41f68e4 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 13:05:34 +0200 Subject: [PATCH 02/25] streamline PandasIndexingAdapter indexing logic Grouping the logic into one method will make it easier to override the behavior in subclasses (interval index) without affecting much readability. Also it yields more DRY code. --- xarray/core/indexing.py | 67 ++++++++++++----------------------------- 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d973eb12ac8..bb54f6b83df 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1838,57 +1838,42 @@ def _convert_scalar(self, item) -> np.ndarray: # a NumPy array. 
return to_0d_array(item) - def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: - if isinstance(key, tuple) and len(key) == 1: + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) (key,) = key - return key + # if multidimensional key, convert the index to numpy array and index the latter + if getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas index then re-wrap or convert the result + result = self.array[key] - def _handle_result(self, result: Any) -> PandasIndexingAdapter | np.ndarray: if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.oindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_oindex_get") def _vindex_get( self, indexer: VectorizedIndexer ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.vindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_vindex_get") def __getitem__( self, indexer: ExplicitIndexer ) -> PandasIndexingAdapter | np.ndarray: - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - 
indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "__getitem__") def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional @@ -1953,28 +1938,16 @@ def __array__( else: return super().__array__(dtype, copy=copy) - def _convert_scalar(self, item): + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) item = item[idx] return super()._convert_scalar(item) - def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: - result = super()._oindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def _vindex_get( - self, indexer: VectorizedIndexer + def _index_get( + self, indexer: ExplicitIndexer, func_name: str ) -> PandasIndexingAdapter | np.ndarray: - result = super()._vindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def __getitem__(self, indexer: ExplicitIndexer): - result = super().__getitem__(indexer) + result = super()._index_get(indexer, func_name) if isinstance(result, type(self)): result.level = self.level return result From 2b25155717519411e90a85fcf26679efc9787b0c Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:38:26 +0200 Subject: [PATCH 03/25] clean-up PandasIndexingAdapter dtype handling Prevent numpy.dtype conversions or castings implemented in various places, gather the logic into one method. 
--- xarray/core/indexing.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bb54f6b83df..83b0b484182 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1778,6 +1778,15 @@ def __init__( def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype + def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype: + if dtype is None: + if is_valid_numpy_dtype(self.dtype): + return cast(np.dtype, self.dtype) + else: + return get_valid_numpy_dtype(self.array) + else: + return np.dtype(dtype) + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -1785,11 +1794,9 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) array = self.array + if isinstance(array, pd.PeriodIndex): with suppress(AttributeError): # this might not be public API @@ -1829,10 +1836,8 @@ def _convert_scalar(self, item) -> np.ndarray: # numpy fails to convert pd.Timestamp to np.datetime64[ns] item = np.asarray(item.to_datetime64()) elif self.dtype != object: - dtype = self.dtype - if pd.api.types.is_extension_array_dtype(dtype): - dtype = get_valid_numpy_dtype(self.array) - item = np.asarray(item, dtype=cast(np.dtype, dtype)) + dtype = self._get_numpy_dtype() + item = np.asarray(item, dtype=dtype) # as for numpy.ndarray indexing, we always want the result to be # a NumPy array. 
@@ -1896,7 +1901,9 @@ def copy(self, deep: bool = True) -> Self: def nbytes(self) -> int: if pd.api.types.is_extension_array_dtype(self.dtype): return self.array.nbytes - return cast(np.dtype, self.dtype).itemsize * len(self.array) + + dtype = self._get_numpy_dtype() + return dtype.itemsize * len(self.array) class PandasMultiIndexingAdapter(PandasIndexingAdapter): From 99810788d76cd429ed3eea5dadbc290432e1f764 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:49:56 +0200 Subject: [PATCH 04/25] more clean-up --- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 83b0b484182..e8cd39b0ca0 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1936,8 +1936,8 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None: - dtype = cast(np.dtype, self.dtype) + dtype = self._get_numpy_dtype(dtype) + if self.level is not None: return np.asarray( self.array.get_level_values(self.level).values, dtype=dtype From 29098ac18c907deebdc13c9da4010f40d116cae5 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 26 May 2025 17:26:16 +0200 Subject: [PATCH 05/25] repr: prevent loading lazy variables into memory Maybe slice PandasIndexingAdapter / CoordinateTransformAdapter before formatting them as arrays. For PandasIndexingAdapter, this prevents converting a large pd.RangeIndex into an explicit index or array. 
--- xarray/core/formatting.py | 12 +++++++++++- xarray/core/indexing.py | 40 ++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 7aa333ffb2e..b47840e79db 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -19,7 +19,11 @@ from xarray.core.datatree_render import RenderDataTree from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ( + CoordinateTransformIndexingAdapter, + MemoryCachedArray, + PandasIndexingAdapter, +) from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.treenode import group_subtrees from xarray.core.utils import is_duck_array @@ -651,6 +655,12 @@ def short_array_repr(array): def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data + + if isinstance( + internal_data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter + ): + array = internal_data._get_array_subset() + if isinstance(array, np.ndarray): return short_array_repr(array) elif is_duck_array(internal_data): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e8cd39b0ca0..f5e841cad55 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,6 @@ from contextlib import suppress from dataclasses import dataclass, field from datetime import timedelta -from html import escape from typing import TYPE_CHECKING, Any, cast, overload import numpy as np @@ -1883,6 +1882,23 @@ def __getitem__( def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional + def _get_array_subset(self) -> np.ndarray: + # avoid converting a large pd.Index (especially pd.MultiIndex and pd.RangeIndex) + # into a numpy array for the array repr + threshold = max(100, 
OPTIONS["display_values_threshold"] + 2) + if self.size > threshold: + pos = threshold // 2 + subset_start = (self[OuterIndexer((slice(pos),))],) + subset_end = (self[OuterIndexer((slice(-pos, None),))],) + return np.concatenate([np.asarray(subset_start), np.asarray(subset_end)]) + else: + return np.asarray(self) + + def _repr_inline_(self, max_width: int) -> str: + from xarray.core.formatting import format_array_flat + + return format_array_flat(self._get_array_subset(), max_width) + def __repr__(self) -> str: return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})" @@ -1968,31 +1984,11 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _get_array_subset(self) -> np.ndarray: - # used to speed-up the repr for big multi-indexes - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)]) - subset = self[OuterIndexer((indices,))] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - from xarray.core.formatting import format_array_flat - if self.level is None: return "MultiIndex" else: - return format_array_flat(self._get_array_subset(), max_width) - - def _repr_html_(self) -> str: - from xarray.core.formatting import short_array_repr - - array_repr = short_array_repr(self._get_array_subset()) - return f"
{escape(array_repr)}
" + return super()._repr_inline_(max_width=max_width) def copy(self, deep: bool = True) -> Self: # see PandasIndexingAdapter.copy From 5f09354d37b336768f19183f34252eb08914fd00 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 26 May 2025 18:51:09 +0200 Subject: [PATCH 06/25] fix array (index) subsetting --- xarray/core/indexing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f5e841cad55..5835f8d3714 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1890,7 +1890,9 @@ def _get_array_subset(self) -> np.ndarray: pos = threshold // 2 subset_start = (self[OuterIndexer((slice(pos),))],) subset_end = (self[OuterIndexer((slice(-pos, None),))],) - return np.concatenate([np.asarray(subset_start), np.asarray(subset_end)]) + return np.concatenate( + [np.asarray(subset_start), np.asarray(subset_end)], axis=-1 + ) else: return np.asarray(self) From 0e5154c3324d079a09d6596622078716773fd386 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 4 Jul 2025 14:58:00 +0200 Subject: [PATCH 07/25] treat multi-index and coord-transform variables as lazy --- xarray/core/formatting.py | 13 +++-- xarray/core/indexing.py | 58 ++++++++++------------- xarray/core/variable.py | 10 +++- xarray/tests/test_coordinate_transform.py | 11 +++++ xarray/tests/test_formatting.py | 43 +++++++++++++++++ 5 files changed, 93 insertions(+), 42 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 734a0461b05..dcbc8167ca1 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -21,9 +21,9 @@ from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( - CoordinateTransformIndexingAdapter, + BasicIndexer, + ExplicitlyIndexed, MemoryCachedArray, - PandasIndexingAdapter, ) from xarray.core.options import OPTIONS, _get_boolean_with_default 
from xarray.core.treenode import group_subtrees @@ -91,6 +91,8 @@ def first_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -115,6 +117,8 @@ def last_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -664,11 +668,6 @@ def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data - if isinstance( - internal_data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter - ): - array = internal_data._get_array_subset() - if isinstance(array, np.ndarray): return short_array_repr(array) elif is_duck_array(internal_data): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bd8ba37b256..8e4458fb88f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -19,7 +19,6 @@ from xarray.core import duck_array_ops from xarray.core.coordinate_transform import CoordinateTransform from xarray.core.nputils import NumpyVIndexAdapter -from xarray.core.options import OPTIONS from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -1774,6 +1773,12 @@ def __init__( else: self._dtype = np.dtype(cast(DTypeLike, dtype)) + @property + def _in_memory(self) -> bool: + # prevent costly conversion of a memory-saving pd.RangeIndex into a + # large numpy array. 
+ return not isinstance(self.array, pd.RangeIndex) + @property def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype @@ -1887,24 +1892,13 @@ def __getitem__( def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional - def _get_array_subset(self) -> np.ndarray: - # avoid converting a large pd.Index (especially pd.MultiIndex and pd.RangeIndex) - # into a numpy array for the array repr - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - subset_start = (self[OuterIndexer((slice(pos),))],) - subset_end = (self[OuterIndexer((slice(-pos, None),))],) - return np.concatenate( - [np.asarray(subset_start), np.asarray(subset_end)], axis=-1 - ) - else: - return np.asarray(self) - def _repr_inline_(self, max_width: int) -> str: + # we want to display values in the inline repr for lazy coordinates too + # (pd.RangeIndex and pd.MultiIndex). `format_array_flat` prevents loading + # the whole array in memory. from xarray.core.formatting import format_array_flat - return format_array_flat(self._get_array_subset(), max_width) + return format_array_flat(self, max_width) def __repr__(self) -> str: return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})" @@ -1968,6 +1962,14 @@ def __array__( else: return super().__array__(dtype, copy=copy) + @property + def _in_memory(self) -> bool: + # The pd.MultiIndex's data is fully in memory, but it has a different + # layout than the level and dimension coordinate arrays. Marking this + # adapter class as a "lazy" array will prevent costly conversion when, + # e.g., formatting the Xarray reprs. 
+ return False + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) @@ -2032,6 +2034,10 @@ def dtype(self) -> np.dtype: def shape(self) -> tuple[int, ...]: return tuple(self._transform.dim_size.values()) + @property + def _in_memory(self) -> bool: + return False + def get_duck_array(self) -> np.ndarray: all_coords = self._transform.generate_coords(dims=self._dims) return np.asarray(all_coords[self._coord_name]) @@ -2092,23 +2098,9 @@ def transpose(self, order: Iterable[int]) -> Self: def __repr__(self: Any) -> str: return f"{type(self).__name__}(transform={self._transform!r})" - def _get_array_subset(self) -> np.ndarray: - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - flat_indices = np.concatenate( - [np.arange(0, pos), np.arange(self.size - pos, self.size)] - ) - subset = self.vindex[ - VectorizedIndexer(np.unravel_index(flat_indices, self.shape)) - ] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - """Good to see some labels even for a lazy coordinate.""" + # we want to display values in the inline repr for this lazy coordinate + # `format_array_flat` prevents loading the whole array in memory. 
from xarray.core.formatting import format_array_flat - return format_array_flat(self._get_array_subset(), max_width) + return format_array_flat(self, max_width) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 00d97e868c4..bcc2ca4e460 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -23,6 +23,7 @@ from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( BasicIndexer, + CoordinateTransformIndexingAdapter, OuterIndexer, PandasIndexingAdapter, VectorizedIndexer, @@ -403,10 +404,15 @@ def _new( return cls_(dims_, data, attrs_) @property - def _in_memory(self): + def _in_memory(self) -> bool: + if isinstance( + self._data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter + ): + return self._data._in_memory + return isinstance( self._data, - np.ndarray | np.number | PandasIndexingAdapter | PandasExtensionArray, + np.ndarray | np.number | PandasExtensionArray, ) or ( isinstance(self._data, indexing.MemoryCachedArray) and isinstance(self._data.array, indexing.NumpyIndexingAdapter) diff --git a/xarray/tests/test_coordinate_transform.py b/xarray/tests/test_coordinate_transform.py index 386ce426998..627063eb8cb 100644 --- a/xarray/tests/test_coordinate_transform.py +++ b/xarray/tests/test_coordinate_transform.py @@ -123,6 +123,17 @@ def test_coordinate_transform_variable_repr_inline() -> None: ) +def test_coordinate_transform_variable_repr() -> None: + var = create_coords(scale=2.0, shape=(2, 2))["x"].variable + + actual = repr(var) + expected = """ + Size: 32B +[4 values with dtype=float64] + """.strip() + assert actual == expected + + def test_coordinate_transform_variable_basic_outer_indexing() -> None: var = create_coords(scale=2.0, shape=(4, 4))["x"].variable diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 88c2c819405..c2ab1144e7b 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -1189,3 +1189,46 @@ def 
test_array_repr_dtypes(): Dimensions without coordinates: x """.strip() assert actual == expected + + +def test_repr_pandas_range_index() -> None: + # lazy data repr but values shown in inline repr + xidx = xr.indexes.PandasIndex(pd.RangeIndex(10), "x") + ds = xr.Dataset(coords=xr.Coordinates.from_xindex(xidx)) + actual = repr(ds.x) + expected = """ + Size: 80B +[10 values with dtype=int64] +Coordinates: + * x (x) int64 80B 0 1 2 3 4 5 6 7 8 9 + """.strip() + assert actual == expected + + +def test_repr_pandas_multi_index() -> None: + # lazy data repr but values shown in inline repr + midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["foo", "bar"]) + coords = xr.Coordinates.from_pandas_multiindex(midx, "x") + ds = xr.Dataset(coords=coords) + + actual = repr(ds.x) + expected = """ + Size: 32B +[4 values with dtype=object] +Coordinates: + * x (x) object 32B MultiIndex + * foo (x) object 32B 'a' 'a' 'b' 'b' + * bar (x) int64 32B 1 2 1 2 + """.strip() + assert actual == expected + + actual = repr(ds.foo) + expected = """ + Size: 32B +[4 values with dtype=object] +Coordinates: + * x (x) object 32B MultiIndex + * foo (x) object 32B 'a' 'a' 'b' 'b' + * bar (x) int64 32B 1 2 1 2 + """.strip() + assert actual == expected From 4efb1353a257c79967eb01337c5c4d682e5f2a19 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 4 Jul 2025 16:51:14 +0200 Subject: [PATCH 08/25] update whats new --- doc/whats-new.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ad83cfac531..99ddb88e691 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,13 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactored the ``PandasIndexingAdapter`` and + ``CoordinateTransformIndexingAdapter`` internal indexing classes. 
Coordinate + variables that wrap a :py:class:`pandas.RangeIndex`, a + :py:class:`pandas.MultiIndex` or a + :py:class:`xarray.indexes.CoordinateTransform` are now displayed as lazy variables + in the Xarray data reprs (:pull:`10355`). + By `Benoit Bovy `_. .. _whats-new.2025.07.0: From ef73a7ea93626521a725e033a71aac9595deea76 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 08:50:47 +0200 Subject: [PATCH 09/25] add benchmarks for pandas and xarray RangeIndex --- asv_bench/benchmarks/repr.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py index aa4b6cb7df1..68a082fcc4f 100644 --- a/asv_bench/benchmarks/repr.py +++ b/asv_bench/benchmarks/repr.py @@ -57,3 +57,31 @@ def time_repr(self): def time_repr_html(self): self.da._repr_html_() + + +class ReprPandasRangeIndex: + # display a memory-saving pandas.RangeIndex shouldn't trigger memory + # expensive conversion into a numpy array + def setup(self): + index = xr.indexes.PandasIndex(pd.RangeIndex(1_000_000), "x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() + + +class ReprXarrayRangeIndex: + # display an Xarray RangeIndex shouldn't trigger memory expensive conversion + # of its lazy coordinate into a numpy array + def setup(self): + index = xr.indexes.RangeIndex.arange(1_000_000, dim="x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() From a2ccb7da7ee01fe6c2de5c15d8f96a4c981d3e07 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 09:19:31 +0200 Subject: [PATCH 10/25] fix benchmark numba import error (numpy 2.3) Add a separate conda environment file for benchmarks. 
--- .github/workflows/benchmarks.yml | 4 ++-- ci/requirements/environment-benchmark.yml | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 ci/requirements/environment-benchmark.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e8d411ec927..b1c1a0828aa 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest env: ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment.yml + CONDA_ENV_FILE: ci/requirements/environment-benchmark.yml steps: # We need the full repo to avoid this issue @@ -29,7 +29,7 @@ jobs: with: micromamba-version: "1.5.10-0" environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests + environment-name: xarray-benchmark cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml new file mode 100644 index 00000000000..a5408f5796b --- /dev/null +++ b/ci/requirements/environment-benchmark.yml @@ -0,0 +1,21 @@ +name: xarray-benchmark +channels: + - conda-forge + - nodefaults +dependencies: + - bottleneck + - cftime + - dask-core + - distributed + - flox + - numba + - numbagg + - numexpr + - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105 + - opt_einsum + - packaging + - pandas + - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - sparse + - toolz + - zarr From 07f6cdb28b4adc05e347f24b2eb02108cdbd830e Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 09:27:55 +0200 Subject: [PATCH 11/25] benchmark: pin numpy in conf + consistent conda env Do we need to specify dependencies both in asv.conf and via a conda environment file? 
--- asv_bench/asv.conf.json | 2 +- ci/requirements/environment-benchmark.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 20c873540de..b377542e402 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -60,7 +60,7 @@ // }, "matrix": { "setuptools_scm": [""], // GH6609 - "numpy": [""], + "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], "scipy": [""], diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml index a5408f5796b..0e5c7f4b489 100644 --- a/ci/requirements/environment-benchmark.yml +++ b/ci/requirements/environment-benchmark.yml @@ -8,6 +8,7 @@ dependencies: - dask-core - distributed - flox + - netcdf4 - numba - numbagg - numexpr @@ -17,5 +18,6 @@ dependencies: - pandas - pyarrow # pandas raises a deprecation warning without this, breaking doctests - sparse + - scipy - toolz - zarr From a953b41e6b035f7f1b1b78d4db552d701ca3e98e Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 09:53:06 +0200 Subject: [PATCH 12/25] pyproject: bump setuptools(-scm) Should fix license format error. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5e5fd00328b..56ccc2ef1c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,7 +97,7 @@ dask = "xarray.namedarray.daskmanager:DaskManager" [build-system] build-backend = "setuptools.build_meta" -requires = ["setuptools>=42", "setuptools-scm>=7"] +requires = ["setuptools>=77.0.3", "setuptools-scm>=8"] [tool.setuptools.packages.find] include = ["xarray*"] From 2be275da285f7140da420790c092bcb60da0fb51 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:01:54 +0200 Subject: [PATCH 13/25] ci benchmarks: try fixing package install --- asv_bench/asv.conf.json | 3 ++- ci/requirements/environment-benchmark.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index b377542e402..c60eb451c39 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,6 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { + "setuptools": ["77.0.3"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], @@ -76,7 +77,7 @@ // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 "build_command": [ "python -m build", - "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" + "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" ], // Combinations of libraries/python versions can be excluded/included // from the set to test. 
Each entry is a dictionary containing additional diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml index 0e5c7f4b489..1422cc60cda 100644 --- a/ci/requirements/environment-benchmark.yml +++ b/ci/requirements/environment-benchmark.yml @@ -17,6 +17,7 @@ dependencies: - packaging - pandas - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - pip - sparse - scipy - toolz From 825cdb1614fda67205836dfe8c224b76042af747 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:07:22 +0200 Subject: [PATCH 14/25] next try --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c60eb451c39..a1bffdb4aea 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": ["77.0.3"], + "setuptools": [""], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From c890a6909f001d615553e6a6d321f8fe93e8b723 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:16:38 +0200 Subject: [PATCH 15/25] next try --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index a1bffdb4aea..cf4d6c7be45 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. 
// }, "matrix": { - "setuptools": [""], + "setuptools": ["80"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From 16fe98b4083e8b22e91a8f199b40f0b22287d829 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:27:00 +0200 Subject: [PATCH 16/25] next try --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cf4d6c7be45..dc075f04fe0 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": ["80"], + "setuptools": ["78"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From 8ae12f75e022ca697b7e2fb8ee431fb46b863aff Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:37:27 +0200 Subject: [PATCH 17/25] benchmarks: try disabling no build isolation --- asv_bench/asv.conf.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index dc075f04fe0..f1f1e69b8a1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,8 +59,6 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": ["78"], - "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], @@ -77,7 +75,7 @@ // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 "build_command": [ "python -m build", - "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" + "python -m pip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" ], // Combinations of libraries/python versions can be excluded/included // from the set to test. 
Each entry is a dictionary containing additional From f40f38cf5ffe553529322b7e7439995fd2d6c5c0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:01 +0200 Subject: [PATCH 18/25] Revert "benchmarks: try disabling no build isolation" This reverts commit 8ae12f75e022ca697b7e2fb8ee431fb46b863aff. --- asv_bench/asv.conf.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index f1f1e69b8a1..dc075f04fe0 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,6 +59,8 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { + "setuptools": ["78"], + "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], @@ -75,7 +77,7 @@ // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 "build_command": [ "python -m build", - "python -m pip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" ], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional From 0ecc2141cf1ae1af7eb43d8b577364cb7eb71cd5 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:12 +0200 Subject: [PATCH 19/25] Revert "next try" This reverts commit 16fe98b4083e8b22e91a8f199b40f0b22287d829. --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index dc075f04fe0..cf4d6c7be45 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. 
// }, "matrix": { - "setuptools": ["78"], + "setuptools": ["80"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From 74e993c43f9ac0571778cf5f44e28fa0e4a8df84 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:21 +0200 Subject: [PATCH 20/25] Revert "next try" This reverts commit c890a6909f001d615553e6a6d321f8fe93e8b723. --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cf4d6c7be45..a1bffdb4aea 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": ["80"], + "setuptools": [""], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From 3420fc9c302c2b5701af516269b17ece5ce8b3a4 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:23 +0200 Subject: [PATCH 21/25] Revert "next try" This reverts commit 825cdb1614fda67205836dfe8c224b76042af747. --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index a1bffdb4aea..c60eb451c39 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,7 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": [""], + "setuptools": ["77.0.3"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], From 97579f58a508d35be177f666c6a900841cdef876 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:25 +0200 Subject: [PATCH 22/25] Revert "ci benchmarks: try fixing package install" This reverts commit 2be275da285f7140da420790c092bcb60da0fb51. 
--- asv_bench/asv.conf.json | 3 +-- ci/requirements/environment-benchmark.yml | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c60eb451c39..b377542e402 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -59,7 +59,6 @@ // "pip+emcee": [""], // emcee is only available for install with pip. // }, "matrix": { - "setuptools": ["77.0.3"], "setuptools_scm": [""], // GH6609 "numpy": ["2.2"], "pandas": [""], @@ -77,7 +76,7 @@ // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185 "build_command": [ "python -m build", - "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" + "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}" ], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml index 1422cc60cda..0e5c7f4b489 100644 --- a/ci/requirements/environment-benchmark.yml +++ b/ci/requirements/environment-benchmark.yml @@ -17,7 +17,6 @@ dependencies: - packaging - pandas - pyarrow # pandas raises a deprecation warning without this, breaking doctests - - pip - sparse - scipy - toolz From 86df720b46c5739f324137cab01192f3acbf1d01 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:46:28 +0200 Subject: [PATCH 23/25] Revert "pyproject: bump setuptools(-scm)" This reverts commit a953b41e6b035f7f1b1b78d4db552d701ca3e98e. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 56ccc2ef1c3..5e5fd00328b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,7 +97,7 @@ dask = "xarray.namedarray.daskmanager:DaskManager" [build-system] build-backend = "setuptools.build_meta" -requires = ["setuptools>=77.0.3", "setuptools-scm>=8"] +requires = ["setuptools>=42", "setuptools-scm>=7"] [tool.setuptools.packages.find] include = ["xarray*"] From 0887a8ef89982165dcf9987db8ecbd6f26e9cbee Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:49:49 +0200 Subject: [PATCH 24/25] I'm tired of Python packaging pyproject: switch to old format (prior to PEP 639) for license metadata. Because new format is supported starting from setuptools 77.0.3, which seems too recent for numpy 2.2 (<2.3) that is required by numba (benchmarks). --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5e5fd00328b..5a208e4ffcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ classifiers = [ ] description = "N-D labeled arrays and datasets in Python" dynamic = ["version"] -license = "Apache-2.0" +license = { text = "Apache-2.0" } name = "xarray" readme = "README.md" requires-python = ">=3.11" From 8a76b468612b1fea11fc03a680c13fa5bd7a72d7 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 10:59:06 +0200 Subject: [PATCH 25/25] Let's fix all this later Revert "I'm tired of Python packaging" This reverts commit 0887a8ef89982165dcf9987db8ecbd6f26e9cbee. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5a208e4ffcc..5e5fd00328b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ classifiers = [ ] description = "N-D labeled arrays and datasets in Python" dynamic = ["version"] -license = { text = "Apache-2.0" } +license = "Apache-2.0" name = "xarray" readme = "README.md" requires-python = ">=3.11"