Skip to content

Optional tolerance feature to isin per issue #5587 #5862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ v0.19.1 (unreleased)

New Features
~~~~~~~~~~~~
- Added an optional ``tolerance`` argument to :py:func:`isin` for numerically close datasets (:issue:`5587`, :pull:`5862`).
By `Shane Hazelquist <https://github.com/shazelquist>`_.
- Added a :py:func:`get_options` method to xarray's root namespace (:issue:`5698`, :pull:`5716`)
By `Pushkar Kopparla <https://github.com/pkopparla>`_.
- Xarray now does a better job rendering variable names that are long LaTeX sequences when plotting (:issue:`5681`, :pull:`5682`).
Expand Down
20 changes: 18 additions & 2 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1384,15 +1384,16 @@ def notnull(self, keep_attrs: bool = None):
keep_attrs=keep_attrs,
)

def isin(self, test_elements):
def isin(self, test_elements, tolerance=None):
"""Tests each value in the array for whether it is in test elements.

Parameters
----------
test_elements : array_like
The values against which to test each value of `element`.
This argument is flattened if an array or array_like.
See numpy notes for behavior with non-array-like parameters.
tolerance : scalar, optional
Absolute tolerance within which a value is considered equal to a test
element. If None or zero (default None), exact matching is used.

Returns
-------
Expand All @@ -1407,6 +1408,12 @@ def isin(self, test_elements):
array([ True, False, True])
Dimensions without coordinates: x

>>> array = xr.DataArray([1, 2, 3], dims="x")
>>> array.isin([1.1, 2.9], tolerance=0.2)
<xarray.DataArray (x: 3)>
array([ True, False, True])
Dimensions without coordinates: x

See Also
--------
numpy.isin
Expand All @@ -1427,6 +1434,15 @@ def isin(self, test_elements):
# second argument
test_elements = test_elements.data

if tolerance:
# non-zero tolerance arguments
return apply_ufunc(
duck_array_ops.isin_tolerance,
self,
kwargs=dict(test_elements=test_elements, tolerance=tolerance),
dask="allowed",
)

return apply_ufunc(
duck_array_ops.isin,
self,
Expand Down
33 changes: 33 additions & 0 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,39 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
return np.where(isnull(array), np.nan, array.astype(dtype))


def isin_tolerance(self, test_elements, tolerance):
    """Test whether each value of ``self`` is within ``tolerance`` of any test element.

    Parameters
    ----------
    self : array_like
        Values to test. An object that already exposes ``shape`` is used as
        it is; anything else (list, tuple, scalar) is converted with
        ``np.asarray``.
    test_elements : array_like
        Values to compare against. Always converted with ``np.asarray``.
    tolerance : scalar
        Allowed distance between a value and a test element. Only its
        absolute value is used, and the comparison is strict: a value
        matches when ``abs(value - element) < abs(tolerance)``.

    Returns
    -------
    array_like
        Boolean result with the same shape as ``self``; an entry is True
        when at least one test element is within the tolerance.

    Notes
    -----
    The comparison is fully vectorized, so an intermediate array of shape
    ``(*self.shape, *test_elements.shape)`` is created. This can require a
    very large amount of memory for big inputs.
    """
    if not hasattr(self, "shape"):
        # Plain sequences and scalars have no ``shape``; coerce them so the
        # documented array_like contract actually holds.
        self = np.asarray(self)
    test_elements = np.asarray(test_elements)

    # ``subtract.outer`` appends the axes of ``test_elements`` after those of
    # ``self``; these trailing axes are the ones reduced away by ``any``.
    own_ndim = len(self.shape)
    merge_axis = tuple(range(own_ndim, own_ndim + test_elements.ndim))

    distance = np.abs(np.subtract.outer(self, test_elements))
    return (distance < abs(tolerance)).any(merge_axis)


def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
"""Convert a timedelta-like object to numerical values.

Expand Down
49 changes: 48 additions & 1 deletion xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
import pytest
from numpy import array, nan

from xarray import DataArray, Dataset, cftime_range, concat
from xarray import DataArray, Dataset, cftime_range, concat, testing
from xarray.core import dtypes, duck_array_ops
from xarray.core.duck_array_ops import (
array_notnull_equiv,
concatenate,
count,
first,
gradient,
isin_tolerance,
last,
least_squares,
mean,
Expand Down Expand Up @@ -892,3 +893,49 @@ def test_push_dask():
dask.array.from_array(array, chunks=(1, 2, 3, 2, 2, 1, 1)), axis=0, n=None
)
np.testing.assert_equal(actual, expected)


@pytest.mark.parametrize("shape", [(200,), (10, 10, 2), (4, 50)])
@pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6])
@pytest.mark.parametrize("dask_for_A", [True, False])
@pytest.mark.parametrize("dask_for_B", [True, False])
def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B):
    """Check ``isin_tolerance`` against values offset by half the tolerance.

    Every even-indexed element of ``arrayA`` has a counterpart in ``arrayB``
    shifted by ``tolerance / 2`` (in both the positive and the negative
    direction), while every odd-indexed slot of ``arrayB`` holds a value
    near -99, far outside the range of ``arrayA``. Only the even-indexed
    elements are therefore expected to match.
    """
    uses_dask = dask_for_A or dask_for_B
    if uses_dask and not has_dask:
        pytest.skip("requires dask")

    in_margin = tolerance / 2  # measure within acceptable margin
    arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape)
    expected = (np.arange(arrayA.size) % 2 == 0).reshape(shape)

    if uses_dask:
        import dask.array

    if dask_for_A:
        arrayA = dask.array.from_array(arrayA)
        expected = dask.array.from_array(expected)

    # The result follows the array type of the first argument only.
    expected_type = dask_array_type if dask_for_A else np.ndarray

    for offset_direction in (1, -1):
        # generate test set, shifted above and then below the true values
        arrayB = -99 * (~expected) + (
            offset_direction * in_margin + arrayA * expected
        )
        if dask_for_B:
            arrayB = dask.array.from_array([arrayB])

        # test function
        actual = isin_tolerance(arrayA, arrayB, tolerance)

        testing.assert_duckarray_equal(actual, expected)
        assert isinstance(actual, expected_type)
5 changes: 5 additions & 0 deletions xarray/tests/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,11 @@ def test_dataarray_property(prop):
False,
marks=xfail(reason="Missing implementation for np.isin"),
),
param(
do("isin_tolerance", [1 - 1e-7, 2, 3 + 1e-7], tolerance=1e-6),
False,
marks=xfail(reason="Missing implementation for isin_tolerance"),
),
param(
do("item", (1, 1)),
False,
Expand Down