Skip to content

Add CFIntervalIndex #10296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9282fc4
add IntervalIndex
benbovy May 7, 2025
f71f767
add index description (docstrings)
benbovy May 7, 2025
f7041fd
add type annotations
benbovy May 7, 2025
9401774
expose IntervalIndex publicly via xarray.indexes
benbovy May 7, 2025
781d33f
add a few TODOs
benbovy May 7, 2025
48dc0bd
clean-up
benbovy May 8, 2025
b424b12
better docstrings
benbovy May 8, 2025
8d80e71
refactor: use two sub-indexes
benbovy May 8, 2025
e60a1a4
check consistent central values vs. intervals
benbovy May 8, 2025
8918fe8
fix mypy
benbovy May 8, 2025
c722a2e
implement join and reindex_like
benbovy May 8, 2025
23fb18b
add mid_index and bounds_index properties
benbovy May 8, 2025
de4f5d8
clean-up indexing.PandasIndexingAdapter typing
benbovy May 9, 2025
e1bf896
streamline PandasIndexingAdapter indexing logic
benbovy May 9, 2025
06a3b92
add xarray indexing adapter for pd.IntervalIndex
benbovy May 9, 2025
80f496f
clean-up PandasIndexingAdapter dtype handling
benbovy May 9, 2025
67d8f6c
fix mypy
benbovy May 9, 2025
a8015aa
IntervalIndex sel / isel: handle boundary dim & coord
benbovy May 9, 2025
5b5cbee
more clean-up
benbovy May 9, 2025
fdc1943
rename IntervalIndex -> CFIntervalIndex
benbovy Jul 3, 2025
3a8fd3c
Merge branch 'main' into add-interval-index
benbovy Jul 3, 2025
edfa435
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 3, 2025
bc20226
fix circular import
benbovy Jul 3, 2025
3ec2c65
Merge branch 'main' into add-interval-index
dcherian Jul 8, 2025
4cabb7c
Fix bad merge
dcherian Jul 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,9 +612,6 @@ def get_indexer_nd(index: pd.Index, labels, method=None, tolerance=None) -> np.n
return indexer


T_PandasIndex = TypeVar("T_PandasIndex", bound="PandasIndex")


class PandasIndex(Index):
"""Wrap a pandas.Index as an xarray compatible index."""

Expand Down Expand Up @@ -912,9 +909,7 @@ def rename(self, name_dict, dims_dict):
new_dim = dims_dict.get(self.dim, self.dim)
return self._replace(index, dim=new_dim)

def _copy(
self: T_PandasIndex, deep: bool = True, memo: dict[int, Any] | None = None
) -> T_PandasIndex:
def _copy(self, deep: bool = True, memo: dict[int, Any] | None = None) -> Self:
if deep:
# pandas is not using the memo
index = self.index.copy(deep=True)
Expand Down
3 changes: 2 additions & 1 deletion xarray/indexes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
PandasIndex,
PandasMultiIndex,
)
from xarray.indexes.interval_index import IntervalIndex
from xarray.indexes.range_index import RangeIndex

__all__ = ["Index", "PandasIndex", "PandasMultiIndex", "RangeIndex"]
__all__ = ["Index", "IntervalIndex", "PandasIndex", "PandasMultiIndex", "RangeIndex"]
185 changes: 185 additions & 0 deletions xarray/indexes/interval_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
from __future__ import annotations

from collections.abc import Hashable, Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, cast

import numpy as np
import pandas as pd

from xarray import Variable
from xarray.core.indexes import Index, PandasIndex
from xarray.core.indexing import IndexSelResult

if TYPE_CHECKING:
from xarray.core.types import Self


class IntervalIndex(Index):
"""Xarray index of 1-dimensional intervals.

This index is built on top of :py:class:`~xarray.indexes.PandasIndex` and
wraps a :py:class:`pandas.IntervalIndex`. It is associated with two
coordinate variables:

- a 1-dimensional coordinate where each label represents an interval that is
materialized by its midpoint (i.e., the average of its left and right
boundaries)

- a 2-dimensional coordinate that represents the left and right boundaries
of each interval. One of the two dimensions is shared with the
aforementioned coordinate and the other one has length 2.

"""

_index: PandasIndex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should save the central values too.

_bounds_name: Hashable
_bounds_dim: str

def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str):
    """Wrap an interval-based ``PandasIndex``.

    Parameters
    ----------
    index : PandasIndex
        Index whose ``index`` attribute is a :py:class:`pandas.IntervalIndex`.
    bounds_name : Hashable
        Name of the boundary coordinate.
    bounds_dim : str
        Name of the boundary dimension.

    Raises
    ------
    TypeError
        If the wrapped pandas index is not a :py:class:`pandas.IntervalIndex`.
    """
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with ``-O``, which would let an invalid index through.
    if not isinstance(index.index, pd.IntervalIndex):
        raise TypeError(
            "IntervalIndex requires a PandasIndex wrapping a pandas.IntervalIndex, "
            f"got {type(index.index).__name__!r}"
        )
    self._index = index
    self._bounds_name = bounds_name
    self._bounds_dim = bounds_dim

@classmethod
def from_variables(
    cls,
    variables: Mapping[Any, Variable],
    *,
    options: Mapping[str, Any],
) -> Self:
    """Build the index from a 1-dimensional and a 2-dimensional variable.

    Parameters
    ----------
    variables : Mapping
        Exactly two variables: a 1-dimensional one, whose name is used as the
        index dimension, and a 2-dimensional one holding the left and right
        interval boundaries along its other dimension (of length 2).
    options : Mapping
        Index build options (not used by this implementation).

    Returns
    -------
    Self
        A new index wrapping a :py:class:`pandas.IntervalIndex` built from the
        boundary values.

    Raises
    ------
    ValueError
        If ``variables`` does not consist of exactly one 1-dimensional and one
        2-dimensional variable, or if the boundary dimension does not have
        length 2.
    """
    # TODO: allow setting the index from one variable? Guess bounds like cf_xarray's add_bounds
    mid_names = [name for name, var in variables.items() if var.ndim == 1]
    bounds_items = [(name, var) for name, var in variables.items() if var.ndim == 2]
    if len(variables) != 2 or len(mid_names) != 1 or len(bounds_items) != 1:
        raise ValueError(
            "IntervalIndex requires exactly two variables: one 1-dimensional "
            "coordinate and one 2-dimensional boundary coordinate, got "
            f"{ {name: var.ndim for name, var in variables.items()} } "
            "(variable name -> number of dimensions)"
        )

    # The name of the 1-dimensional variable is used as the index dimension
    # (only dimension coordinates are supported for now, see the TODO below).
    dim = mid_names[0]
    bounds_name, bounds = bounds_items[0]

    # TODO: be flexible with dimension order? Check which dim has length 2
    # Move the index dimension last, so that the remaining (boundary)
    # dimension comes first.
    bounds = bounds.transpose(..., dim)
    # Take the boundary dimension by position. The previous
    # ``set(bounds.dims) - set(dim)`` split a string ``dim`` into characters
    # and could return the index dimension itself for multi-character names.
    bounds_dim = bounds.dims[0]
    if bounds.shape[0] != 2:
        raise ValueError(
            f"boundary dimension {bounds_dim!r} of variable {bounds_name!r} "
            f"must have length 2, got {bounds.shape[0]}"
        )
    left, right = bounds.data.tolist()

    # TODO: support non-dimension coordinates (pass variable name to pd.IntervalIndex.from_arrays)
    # TODO: propagate coordinate dtype (pass it to PandasIndex constructor)
    # TODO: add "closed" build option (maybe choose "closed='both'" as default here? to be consistent with
    # CF conventions: https://cfconventions.org/cf-conventions/cf-conventions.html#bounds-one-d)
    index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim)

    return cls(index, bounds_name, str(bounds_dim))

@classmethod
def concat(
    cls,
    indexes: Sequence[IntervalIndex],
    dim: Hashable,
    positions: Iterable[Iterable[int]] | None = None,
) -> IntervalIndex:
    """Concatenate several interval indexes along ``dim``.

    The wrapped ``PandasIndex`` objects are concatenated with
    ``PandasIndex.concat``. The boundary coordinate and dimension names are
    taken from the first index and must be the same for all of them.

    Raises
    ------
    ValueError
        If the indexes do not all share the same boundary coordinate name
        and boundary dimension name.
    """
    wrapped = [idx._index for idx in indexes]
    new_index = PandasIndex.concat(wrapped, dim, positions=positions)

    if not indexes:
        # nothing to inherit the names from: derive defaults
        bounds_name = new_index.index.name + "_bounds"
        bounds_dim = "bnd"
    else:
        first = indexes[0]
        bounds_name, bounds_dim = first._bounds_name, first._bounds_dim
        for idx in indexes:
            if idx._bounds_name != bounds_name or idx._bounds_dim != bounds_dim:
                raise ValueError(
                    f"Cannot concatenate along dimension {dim!r} indexes with different "
                    "boundary coordinate or dimension names"
                )

    return cls(new_index, bounds_name, bounds_dim)

@property
def _pd_index(self) -> pd.IntervalIndex:
    """The pandas index held by the wrapped ``PandasIndex``, typed as an interval index."""
    # For typing purpose only
    # TODO: cleaner to make PandasIndex a generic class, i.e., PandasIndex[pd.IntervalIndex]
    # will be easier once PEP 696 is fully supported (starting from Python 3.13)
    wrapped = self._index.index
    return cast(pd.IntervalIndex, wrapped)

def create_variables(
self, variables: Mapping[Any, Variable] | None = None
) -> dict[Any, Variable]:
if variables is None:
variables = {}
empty_var = Variable((), 0)
bounds_attrs = variables.get(self._bounds_name, empty_var).attrs
mid_attrs = variables.get(self._index.dim, empty_var).attrs

# TODO: create a PandasIndexingAdapter subclass for the boundary variable
# and wrap it here (avoid data copy)
bounds_var = Variable(
dims=(self._bounds_dim, self._index.dim),
data=np.stack([self._pd_index.left, self._pd_index.right], axis=0),
attrs=bounds_attrs,
)
# TODO: use PandasIndexingAdapter directly (avoid data copy)
# and/or maybe add an index build option to preserve original labels?
# (if those differ from interval midpoints as defined by pd.IntervalIndex)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should always save the central value and return that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. This is sub-optimal in the case where the central values exactly correspond to what is returned by pd.IntervalIndex.mid (e.g., when creating an IntervalIndex directly from a pd.IntervalIndex), but as I was trying to account for this special case I went into all sorts of small complications, so I think it'll be easier to always save the central values into a separate PandasIndex here.

mid_var = Variable(
dims=(self._index.dim,),
data=self._pd_index.mid,
attrs=mid_attrs,
)

return {self._index.dim: mid_var, self._bounds_name: bounds_var}

def should_add_coord_to_array(
    self,
    name: Hashable,
    var: Variable,
    dims: set[Hashable],
) -> bool:
    """Return whether the index dimension is one of the array dimensions.

    The answer does not depend on ``name`` or ``var``: both the mid and
    boundary coordinates are added whenever the index dimension is present
    in ``dims``.
    """
    return self._index.dim in dims

def to_pandas_index(self) -> pd.Index:
    """Return the wrapped pandas interval index."""
    pd_index = self._pd_index
    return pd_index

def equals(self, other: Index) -> bool:
    """Return whether ``other`` is an ``IntervalIndex`` wrapping an equal index.

    Only the wrapped ``PandasIndex`` objects are compared; an ``other`` of
    any different type is never equal.
    """
    return isinstance(other, IntervalIndex) and self._index.equals(other._index)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again need to check the central value here


def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult:
    """Delegate label-based selection to the wrapped ``PandasIndex``.

    ``labels`` and any extra keyword arguments are forwarded unchanged.
    """
    wrapped = self._index
    return wrapped.sel(labels, **kwargs)

def isel(
    self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
) -> Self | None:
    """Apply positional indexing to the wrapped index.

    Returns a new index of the same type that keeps the boundary coordinate
    and dimension names, or ``None`` when the wrapped index returns ``None``.
    """
    selected = self._index.isel(indexers)
    if selected is None:
        return None
    return type(self)(selected, self._bounds_name, self._bounds_dim)

def roll(self, shifts: Mapping[Any, int]) -> Self | None:
    """Roll the wrapped index by ``shifts``.

    Returns a new index of the same type that keeps the boundary coordinate
    and dimension names.
    """
    rolled = self._index.roll(shifts)
    index_cls = type(self)
    return index_cls(rolled, self._bounds_name, self._bounds_dim)

def rename(
    self,
    name_dict: Mapping[Any, Hashable],
    dims_dict: Mapping[Any, Hashable],
) -> Self:
    """Rename the index coordinates and/or dimensions.

    The wrapped index is renamed with both mappings. The boundary coordinate
    name is looked up in ``name_dict`` and the boundary dimension name in
    ``dims_dict``; names missing from a mapping are kept as they are.
    """
    renamed = self._index.rename(name_dict, dims_dict)

    old_name, old_dim = self._bounds_name, self._bounds_dim
    new_name = name_dict.get(old_name, old_name)
    # the boundary dimension is always stored as a string
    new_dim = str(dims_dict.get(old_dim, old_dim))

    return type(self)(renamed, new_name, new_dim)

def __repr__(self) -> str:
    """Return the representation of the wrapped ``PandasIndex``."""
    return repr(self._index)
Loading