Skip to content

Optional tolerance feature to isin per issue #5587 #5862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
20 changes: 18 additions & 2 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1384,15 +1384,16 @@ def notnull(self, keep_attrs: bool = None):
keep_attrs=keep_attrs,
)

def isin(self, test_elements):
def isin(self, test_elements, tolerance=None):
"""Tests each value in the array for whether it is in test elements.

Parameters
----------
test_elements : array_like
The values against which to test each value of `element`.
This argument is flattened if an array or array_like.
See numpy notes for behavior with non-array-like parameters.
tolerance : dtype
Optional absolute tolerance within which two values are considered equal.

Returns
-------
Expand All @@ -1407,6 +1408,12 @@ def isin(self, test_elements):
array([ True, False, True])
Dimensions without coordinates: x

>>> array = xr.DataArray([1, 2, 3], dims="x")
>>> array.isin([1.1, 2.9], tolerance=0.2)
<xarray.DataArray (x: 3)>
array([ True, False, True])
Dimensions without coordinates: x

See Also
--------
numpy.isin
Expand All @@ -1427,6 +1434,15 @@ def isin(self, test_elements):
# second argument
test_elements = test_elements.data

if tolerance:
# non-zero tolerance arguments
return apply_ufunc(
duck_array_ops.isin_tolerance,
self,
kwargs=dict(test_elements=test_elements, tolerance=tolerance),
dask="allowed",
)

return apply_ufunc(
duck_array_ops.isin,
self,
Expand Down
33 changes: 33 additions & 0 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,39 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
return np.where(isnull(array), np.nan, array.astype(dtype))


def isin_tolerance(self, test_elements, tolerance):
    """Test whether each value is within ``tolerance`` of any test element.

    Parameters
    ----------
    self : array_like
        Values to test. Passed to ``np.subtract.outer`` without conversion,
        so it may be an array, a nested list or a scalar.
    test_elements : array_like
        Values to compare against. Converted with ``np.asarray`` and
        flattened before comparison.
    tolerance : scalar
        Maximum absolute difference for two values to count as a match.
        The sign is ignored (``abs(tolerance)`` is used) and the comparison
        is strict: a difference exactly equal to the tolerance is not a match.

    Returns
    -------
    array_like
        Boolean array with the same shape as ``self``; an entry is True when
        at least one test element lies within the tolerance of that value.

    Notes
    -----
    Every pairwise difference is materialized at once, so the intermediate
    array holds ``self.size * test_elements.size`` values. This requires a
    large amount of memory for big inputs.
    """
    # Flattening puts every candidate on one trailing axis, so the reduction
    # below does not need to know the dimensionality of ``self`` (which is why
    # plain lists and scalars are accepted, not only objects with ``.shape``).
    candidates = np.asarray(test_elements).ravel()
    distances = np.abs(np.subtract.outer(self, candidates))
    return (distances < abs(tolerance)).any(axis=-1)


def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
"""Convert a timedelta-like object to numerical values.

Expand Down
17 changes: 17 additions & 0 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
count,
first,
gradient,
isin_tolerance,
last,
least_squares,
mean,
Expand Down Expand Up @@ -892,3 +893,19 @@ def test_push_dask():
dask.array.from_array(array, chunks=(1, 2, 3, 2, 2, 1, 1)), axis=0, n=None
)
np.testing.assert_equal(actual, expected)


@pytest.mark.parametrize("shape", [(200, 1), (10, 10, 2), (4, 50)])
@pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6])
def test_isin_tolerance(shape, tolerance):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a test with dask arrays as well?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! Does the implementation in commit b79620a cover dask arrays?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have to change the expected value as well, because if I pass in a dask array I would expect to get a dask array back from a duck-array operation.

There's also an assert_duckarray_equal that you could use instead of numpy's version:

def assert_duckarray_equal(x, y, err_msg="", verbose=True):

I'm probably missing something but it is kind of surprising it hasn't already been imported in this test module.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, that makes sense. Thanks for the clarification!

in_margin = tolerance / 2 # in margin
arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape)
expected = np.array([item % 2 == 0 for item in range(0, arrayA.size)]).reshape(
shape
)
for c in range(1, 5):
# generate test set
arrayB = -99 * (~expected.flatten()) + (in_margin + arrayA * expected).flatten()
with raise_if_dask_computes():
actual = isin_tolerance(arrayA, arrayB, tolerance)
np.testing.assert_equal(actual, expected)
5 changes: 5 additions & 0 deletions xarray/tests/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,11 @@ def test_dataarray_property(prop):
False,
marks=xfail(reason="Missing implementation for np.isin"),
),
param(
do("isin_tolerance", [1 - 1e-7, 2, 3 + 1e-7], tolerance=1e-6),
False,
marks=xfail(reason="Missing implementation for isin_tolerance"),
),
param(
do("item", (1, 1)),
False,
Expand Down