diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 87c85d45454..420433413e3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,8 @@ v0.19.1 (unreleased) New Features ~~~~~~~~~~~~ +- Added an optional ``tolerance`` argument to :py:func:`isin` for numerically close datasets (:issue:`5587`, :pull:`5862`). + By `Shane Hazelquist `_. - Added a :py:func:`get_options` method to xarray's root namespace (:issue:`5698`, :pull:`5716`) By `Pushkar Kopparla `_. - Xarray now does a better job rendering variable names that are long LaTeX sequences when plotting (:issue:`5681`, :pull:`5682`). diff --git a/xarray/core/common.py b/xarray/core/common.py index 2c5d7900ef8..178ac9dafbf 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1384,15 +1384,16 @@ def notnull(self, keep_attrs: bool = None): keep_attrs=keep_attrs, ) - def isin(self, test_elements): + def isin(self, test_elements, tolerance=None): """Tests each value in the array for whether it is in test elements. Parameters ---------- test_elements : array_like The values against which to test each value of `element`. - This argument is flattened if an array or array_like. See numpy notes for behavior with non-array-like parameters. + tolerance : dtype + Optional parameter for acceptable equal tolerance. 
def isin_tolerance(self, test_elements, tolerance):
    """Test whether each value is within a tolerance of any test element.

    Parameters
    ----------
    self : array_like
        Values to test. Objects that expose ``ndim`` (for example numpy
        arrays) are used as they are; plain sequences and scalars are
        converted with ``np.asarray`` first.
    test_elements : array_like
        Candidate values. Always converted with ``np.asarray``; every
        element is considered regardless of the array's shape.
    tolerance : scalar
        Largest absolute difference at which two values still count as a
        match. Only the magnitude is used (``abs(tolerance)``) and the
        comparison is strict, so a difference exactly equal to the
        tolerance is not a match.

    Returns
    -------
    array_like of bool
        Same shape as `self`. An entry is True when at least one value in
        `test_elements` differs from the corresponding value of `self` by
        less than ``abs(tolerance)``.

    Notes
    -----
    The comparison is fully vectorized: it builds an intermediate array of
    shape ``(*self.shape, *test_elements.shape)``, so memory use grows with
    the product of both sizes.
    """
    test_elements = np.asarray(test_elements)

    ndim = getattr(self, "ndim", None)
    if ndim is None:
        # Plain sequences and scalars have no ``ndim``/``shape``; convert
        # them so the axis arithmetic below works. Objects that already
        # expose ``ndim`` are left untouched.
        self = np.asarray(self)
        ndim = self.ndim

    # The outer difference appends the axes of ``test_elements`` after those
    # of ``self``; those trailing axes are the ones reduced away with any().
    merge_axis = tuple(range(ndim, ndim + test_elements.ndim))
    distance = np.abs(np.subtract.outer(self, test_elements))
    return (distance < abs(tolerance)).any(merge_axis)
@pytest.mark.parametrize("shape", [(200,), (10, 10, 2), (4, 50)])
@pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6])
@pytest.mark.parametrize("dask_for_A", [True, False])
@pytest.mark.parametrize("dask_for_B", [True, False])
def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B):
    """Check isin_tolerance against values nudged to either side of a match.

    Every even-indexed element of the tested array gets a counterpart in the
    test set, shifted by half the tolerance (first upwards, then downwards).
    Odd-indexed elements get no counterpart and must come back False. The
    numpy/dask combinations of both arguments are covered by the parameters.
    """
    if (dask_for_A or dask_for_B) and not has_dask:
        pytest.skip("requires dask")

    in_margin = tolerance / 2  # shift that stays inside the accepted margin
    values = np.arange(-10.0, 10.0, 0.1).reshape(shape)
    # Only even flat positions are expected to find a match.
    expected = (np.arange(values.size) % 2 == 0).reshape(shape)

    if dask_for_A or dask_for_B:
        import dask.array

    arrayA = dask.array.from_array(values) if dask_for_A else values

    for offset_direction in [1, -1]:
        # Matching entries sit just above/below the tested value; all other
        # entries are a sentinel far outside the range of ``values``.
        offset = offset_direction * in_margin
        test_values = np.where(expected, values + offset, -99.0)
        arrayB = dask.array.from_array(test_values) if dask_for_B else test_values

        actual = isin_tolerance(arrayA, arrayB, tolerance)

        testing.assert_duckarray_equal(actual, expected)
        # The result type follows the first argument.
        if dask_for_A:
            assert isinstance(actual, dask_array_type)
        else:
            assert isinstance(actual, np.ndarray)