Skip to content

Optional tolerance feature to isin per issue #5587 #5862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
20 changes: 18 additions & 2 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1384,15 +1384,16 @@ def notnull(self, keep_attrs: bool = None):
keep_attrs=keep_attrs,
)

def isin(self, test_elements):
def isin(self, test_elements, tolerance=None):
"""Tests each value in the array for whether it is in test elements.

Parameters
----------
test_elements : array_like
The values against which to test each value of `element`.
This argument is flattened if an array or array_like.
See numpy notes for behavior with non-array-like parameters.
tolerance : dtype
Optional parameter for acceptable equal tolerance.

Returns
-------
Expand All @@ -1407,6 +1408,12 @@ def isin(self, test_elements):
array([ True, False, True])
Dimensions without coordinates: x

>>> array = xr.DataArray([1, 2, 3], dims="x")
>>> array.isin([1.1, 2.9], tolerance=0.2)
<xarray.DataArray (x: 3)>
array([ True, False, True])
Dimensions without coordinates: x

See Also
--------
numpy.isin
Expand All @@ -1427,6 +1434,15 @@ def isin(self, test_elements):
# second argument
test_elements = test_elements.data

if tolerance:
# non-zero tolerance arguments
return apply_ufunc(
duck_array_ops.isin_tolerance,
self,
kwargs=dict(test_elements=test_elements, tolerance=tolerance),
dask="allowed",
)

return apply_ufunc(
duck_array_ops.isin,
self,
Expand Down
33 changes: 33 additions & 0 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,39 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
return np.where(isnull(array), np.nan, array.astype(dtype))


def isin_tolerance(self, test_elements, tolerance):
    """Test whether each value is within ``tolerance`` of any test element.

    Parameters
    ----------
    self : array_like
        Values to test. Array objects are passed to ``np.subtract.outer``
        as they are; plain sequences and scalars are accepted as well.
    test_elements : array_like
        Values to compare against. Converted with ``np.asarray``.
    tolerance : number
        Absolute difference below which two values count as a match. Only
        the magnitude is used (``abs(tolerance)``) and the comparison is
        strict: a difference exactly equal to the tolerance is not a match.

    Returns
    -------
    array_like
        Boolean values with the same shape as ``self``; an entry is True
        when at least one test element lies within the tolerance.

    Notes
    -----
    The comparison is fully vectorized, so it builds an intermediate array
    of shape ``(*self.shape, *test_elements.shape)``. This can require a
    very large amount of memory for big inputs.
    """
    test_elements = np.asarray(test_elements)
    # np.ndim reads ``.ndim`` when the object has one and only falls back to
    # converting the input otherwise, so array objects are left untouched.
    self_ndim = np.ndim(self)
    # The outer difference appends the axes of test_elements after those of
    # self; reducing over exactly those trailing axes restores self's shape.
    merge_axis = tuple(range(self_ndim, self_ndim + test_elements.ndim))
    distance = np.abs(np.subtract.outer(self, test_elements))
    return (distance < abs(tolerance)).any(merge_axis)


def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
"""Convert a timedelta-like object to numerical values.

Expand Down
26 changes: 26 additions & 0 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
count,
first,
gradient,
isin_tolerance,
last,
least_squares,
mean,
Expand Down Expand Up @@ -892,3 +893,28 @@ def test_push_dask():
dask.array.from_array(array, chunks=(1, 2, 3, 2, 2, 1, 1)), axis=0, n=None
)
np.testing.assert_equal(actual, expected)


@pytest.mark.parametrize("shape", [(200, 1), (10, 10, 2), (4, 50)])
@pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6])
@pytest.mark.parametrize("dask", [False, True])
def test_isin_tolerance(shape, tolerance, dask):
    """Check isin_tolerance against test elements offset by half the tolerance.

    Every even-indexed value gets a matching test element shifted by
    ``tolerance / 2`` (once above, once below); every odd-indexed value gets
    only a far-away sentinel, so it must not match.
    """
    if dask and not has_dask:
        pytest.skip("requires dask")

    in_margin = tolerance / 2  # within acceptable margin
    values = np.arange(-10.0, 10.0, 0.1).reshape(shape)

    # True at even flat indices, False at odd ones.
    expected = (np.arange(values.size) % 2 == 0).reshape(shape)
    for offset_direction in [1, -1]:
        # Expected matches become ``value +/- in_margin``; all other slots
        # become a sentinel near -99, far outside the value range.
        test_elements = -99 * (~expected) + (
            offset_direction * in_margin + values * expected
        )
        if dask:
            # Aliased so the import does not rebind the ``dask`` parameter.
            import dask.array as da

            test_elements = da.from_array(test_elements)

        actual = isin_tolerance(values, test_elements, tolerance)
        np.testing.assert_equal(actual, expected)
5 changes: 5 additions & 0 deletions xarray/tests/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,11 @@ def test_dataarray_property(prop):
False,
marks=xfail(reason="Missing implementation for np.isin"),
),
param(
do("isin_tolerance", [1 - 1e-7, 2, 3 + 1e-7], tolerance=1e-6),
False,
marks=xfail(reason="Missing implementation for isin_tolerance"),
),
param(
do("item", (1, 1)),
False,
Expand Down