From 41e8caeaacc186d711a560bc1b40b2840797cfbd Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Wed, 13 Oct 2021 16:46:30 -0700 Subject: [PATCH 1/7] Optional tolerance feature to isin per issue #5587 --- xarray/core/common.py | 18 +++++++++++++++- xarray/core/duck_array_ops.py | 32 +++++++++++++++++++++++++++++ xarray/tests/test_duck_array_ops.py | 17 +++++++++++++++ xarray/tests/test_sparse.py | 5 +++++ 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 2c5d7900ef8..c40b70366c9 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1384,12 +1384,13 @@ def notnull(self, keep_attrs: bool = None): keep_attrs=keep_attrs, ) - def isin(self, test_elements): + def isin(self, test_elements, tolerance=None): """Tests each value in the array for whether it is in test elements. Parameters ---------- test_elements : array_like + tolerance : dtype The values against which to test each value of `element`. This argument is flattened if an array or array_like. See numpy notes for behavior with non-array-like parameters. 
@@ -1407,6 +1408,12 @@ def isin(self, test_elements): array([ True, False, True]) Dimensions without coordinates: x + >>> array = xr.DataArray([1, 2, 3], dims="x") + >>> array.isin([1.1, 2.9], tolerance = 0.2) + + array([ True, False, True]) + Dimensions without coordinates: x + See Also -------- numpy.isin @@ -1427,6 +1434,15 @@ def isin(self, test_elements): # second argument test_elements = test_elements.data + if tolerance: + # non-zero & None arguments + return apply_ufunc( + duck_array_ops.isin_tolerance, + self, + kwargs=dict(test_elements=test_elements, tolerance=tolerance), + dask="allowed", + ) + return apply_ufunc( duck_array_ops.isin, self, diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 00c92c030c8..a7d411a9ad9 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -467,6 +467,38 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): return np.where(isnull(array), np.nan, array.astype(dtype)) +def isin_tolerance(self, test_elements, tolerance): + """Compare self.values to test_elements elementwise. + Parameters + ---------- + self : numpy.array_like + test_elements : numpy.array_like + tolerance : dtype + Absolute value of accemptable range between self and test_elements. + + Returns + ------- + array_like : Same shape as self, but contains bool values. + + Notes + ----- + Vectorized comparasons elementwise require immense memory for larger datasets + because it generates np.array with shape (self.shape, test_elements.shape) + + """ + merge_axis = ( + *[ + mergeaxis + for mergeaxis in range( + len(self.shape), len(self.shape) + len(test_elements.shape) + ) + ], + ) + return (np.abs(np.subtract.outer(self, test_elements)) < abs(tolerance)).any( + merge_axis + ) + + def timedelta_to_numeric(value, datetime_unit="ns", dtype=float): """Convert a timedelta-like object to numerical values. 
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 6d49e20909d..56246d744f2 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -23,6 +23,7 @@ push, py_timedelta_to_float, stack, + isin_tolerance, timedelta_to_numeric, where, ) @@ -892,3 +893,19 @@ def test_push_dask(): dask.array.from_array(array, chunks=(1, 2, 3, 2, 2, 1, 1)), axis=0, n=None ) np.testing.assert_equal(actual, expected) + + +@pytest.mark.parametrize("shape", [(200, 1), (10, 10, 2), (4, 50)]) +@pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6]) +def test_isin_tolerance(shape, tolerance): + in_margin = tolerance / 2 # in margin + arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape) + expected = np.array([item % 2 == 0 for item in range(0, arrayA.size)]).reshape( + shape + ) + for c in range(1, 5): + # generate test set + arrayB = -99 * (~expected.flatten()) + (in_margin + arrayA * expected).flatten() + with raise_if_dask_computes(): + actual = isin_tolerance(arrayA, arrayB, tolerance) + np.testing.assert_equal(actual, expected) diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 3d57d3dc961..b1f0a0b087b 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -454,6 +454,11 @@ def test_dataarray_property(prop): False, marks=xfail(reason="Missing implementation for np.isin"), ), + param( + do("isin_tolerance", [1 - 1e-7, 2, 3 + 1e-7], tolerance=1e-6), + False, + marks=xfail(reason="Missing implementation for isin_tolerance"), + ), param( do("item", (1, 1)), False, From 549a58fd041943fcf70e9275f2db3328fcb8e57d Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Thu, 14 Oct 2021 21:00:54 -0700 Subject: [PATCH 2/7] Fixed bug in 41e8cae running doctest & format issue for docs --- xarray/core/common.py | 8 ++++---- xarray/core/duck_array_ops.py | 17 +++++++++-------- xarray/tests/test_duck_array_ops.py | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git 
a/xarray/core/common.py b/xarray/core/common.py index c40b70366c9..178ac9dafbf 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1390,10 +1390,10 @@ def isin(self, test_elements, tolerance=None): Parameters ---------- test_elements : array_like - tolerance : dtype The values against which to test each value of `element`. - This argument is flattened if an array or array_like. See numpy notes for behavior with non-array-like parameters. + tolerance : dtype + Optional parameter for acceptable equal tolerance. Returns ------- @@ -1409,7 +1409,7 @@ def isin(self, test_elements, tolerance=None): Dimensions without coordinates: x >>> array = xr.DataArray([1, 2, 3], dims="x") - >>> array.isin([1.1, 2.9], tolerance = 0.2) + >>> array.isin([1.1, 2.9], tolerance=0.2) array([ True, False, True]) Dimensions without coordinates: x @@ -1435,7 +1435,7 @@ def isin(self, test_elements, tolerance=None): test_elements = test_elements.data if tolerance: - # non-zero & None arguments + # non-zero tolerance arguments return apply_ufunc( duck_array_ops.isin_tolerance, self, diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index a7d411a9ad9..ecc5f39d1c2 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,12 +13,12 @@ import pandas as pd from numpy import all as array_all # noqa from numpy import any as array_any # noqa -from numpy import zeros_like # noqa -from numpy import around, broadcast_to # noqa +from numpy import zeros_like # noqa; noqa +from numpy import around, broadcast_to from numpy import concatenate as _concatenate -from numpy import einsum, isclose, isin, isnan, isnat, pad # noqa +from numpy import einsum, isclose, isin, isnan, isnat, pad from numpy import stack as _stack -from numpy import take, tensordot, transpose, unravel_index # noqa +from numpy import take, tensordot, transpose, unravel_index from numpy import where as _where from . 
import dask_array_compat, dask_array_ops, dtypes, npcompat, nputils @@ -472,9 +472,9 @@ def isin_tolerance(self, test_elements, tolerance): Parameters ---------- self : numpy.array_like - test_elements : numpy.array_like + test_elements : array_like tolerance : dtype - Absolute value of accemptable range between self and test_elements. + Absolute value of acceptable range between self and test_elements. Returns ------- @@ -482,10 +482,11 @@ def isin_tolerance(self, test_elements, tolerance): Notes ----- - Vectorized comparasons elementwise require immense memory for larger datasets - because it generates np.array with shape (self.shape, test_elements.shape) + Vectorized comparisons elementwise require immense memory for larger datasets + because it generates np.array with shape (*self.shape, *test_elements.shape) """ + test_elements = np.asarray(test_elements) merge_axis = ( *[ mergeaxis diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 56246d744f2..a6f1679ea1e 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -15,6 +15,7 @@ count, first, gradient, + isin_tolerance, last, least_squares, mean, @@ -23,7 +24,6 @@ push, py_timedelta_to_float, stack, - isin_tolerance, timedelta_to_numeric, where, ) From d67d18ad7620e9f9ea6f8e6987116cf53459650b Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Thu, 14 Oct 2021 21:13:32 -0700 Subject: [PATCH 3/7] fixed flake8 error, missed the # noqa --- xarray/core/duck_array_ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index ecc5f39d1c2..f1222253bd1 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,12 +13,12 @@ import pandas as pd from numpy import all as array_all # noqa from numpy import any as array_any # noqa -from numpy import zeros_like # noqa; noqa -from numpy import around, broadcast_to +from numpy import 
zeros_like # noqa +from numpy import around, broadcast_to # noqa from numpy import concatenate as _concatenate -from numpy import einsum, isclose, isin, isnan, isnat, pad +from numpy import einsum, isclose, isin, isnan, isnat, pad # noqa from numpy import stack as _stack -from numpy import take, tensordot, transpose, unravel_index +from numpy import take, tensordot, transpose, unravel_index # noqa from numpy import where as _where from . import dask_array_compat, dask_array_ops, dtypes, npcompat, nputils From b79620a94c0bfd76639efa7666aebf1206eeae52 Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Fri, 15 Oct 2021 15:42:16 -0700 Subject: [PATCH 4/7] Added dask array to test parameters & removed vestigial loop. --- xarray/tests/test_duck_array_ops.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index a6f1679ea1e..fd261a761d1 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -897,15 +897,24 @@ def test_push_dask(): @pytest.mark.parametrize("shape", [(200, 1), (10, 10, 2), (4, 50)]) @pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6]) -def test_isin_tolerance(shape, tolerance): - in_margin = tolerance / 2 # in margin +@pytest.mark.parametrize("dask", [False, True]) +def test_isin_tolerance(shape, tolerance, dask): + if dask and not has_dask: + pytest.skip("requires dask") + + in_margin = tolerance / 2 # within acceptable margin arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape) + expected = np.array([item % 2 == 0 for item in range(0, arrayA.size)]).reshape( shape ) - for c in range(1, 5): + for offset_direction in [1, -1]: # generate test set - arrayB = -99 * (~expected.flatten()) + (in_margin + arrayA * expected).flatten() - with raise_if_dask_computes(): - actual = isin_tolerance(arrayA, arrayB, tolerance) - np.testing.assert_equal(actual, expected) + arrayB = -99 * (~expected) + (in_margin + 
arrayA * expected) + if dask: + import dask.array + + arrayB = dask.array.from_array(arrayB) + + actual = isin_tolerance(arrayA, arrayB, tolerance) + np.testing.assert_equal(actual, expected) From e2a26750dfd34df2e025866c1995757d0958027a Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Sat, 16 Oct 2021 21:45:18 -0700 Subject: [PATCH 5/7] Using proper import of testing module instead of local --- xarray/tests/test_duck_array_ops.py | 52 +++++++++++++++++++---------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index fd261a761d1..7563c34c790 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -7,7 +7,7 @@ import pytest from numpy import array, nan -from xarray import DataArray, Dataset, cftime_range, concat +from xarray import DataArray, Dataset, cftime_range, concat, testing from xarray.core import dtypes, duck_array_ops from xarray.core.duck_array_ops import ( array_notnull_equiv, @@ -895,26 +895,44 @@ def test_push_dask(): np.testing.assert_equal(actual, expected) -@pytest.mark.parametrize("shape", [(200, 1), (10, 10, 2), (4, 50)]) +@pytest.mark.parametrize("shape", [(200), (10, 10, 2), (4, 50)]) @pytest.mark.parametrize("tolerance", [1e-2, 1e-4, 1e-6]) -@pytest.mark.parametrize("dask", [False, True]) -def test_isin_tolerance(shape, tolerance, dask): - if dask and not has_dask: +@pytest.mark.parametrize("dask_for_A", [True, False]) +@pytest.mark.parametrize("dask_for_B", [True, False]) +def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): + if (dask_for_A or dask_for_B) and not has_dask: pytest.skip("requires dask") - in_margin = tolerance / 2 # within acceptable margin + in_margin = tolerance / 2 # measure within acceptable margin arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape) - expected = np.array([item % 2 == 0 for item in range(0, arrayA.size)]).reshape( shape ) - for offset_direction in [1, -1]: - # 
generate test set - arrayB = -99 * (~expected) + (in_margin + arrayA * expected) - if dask: - import dask.array - - arrayB = dask.array.from_array(arrayB) - - actual = isin_tolerance(arrayA, arrayB, tolerance) - np.testing.assert_equal(actual, expected) + if dask_for_A or dask_for_B: # tests including dask arrays + import dask.array + + if dask_for_A: + arrayA = dask.array.from_array(arrayA) + expected = dask.array.from_array(expected) + for offset_direction in [1, -1]: + # generate test set + if dask_for_B: + arrayB = dask.array.from_array( + [-99 * (~expected) + (in_margin + arrayA * expected)] + ) + else: + arrayB = -99 * (~expected) + (in_margin + arrayA * expected) + # test function + actual = isin_tolerance(arrayA, arrayB, tolerance) + + testing.assert_duckarray_equal(actual, expected) + assert type(actual) == type(expected) + else: # test only using numpy + for offset_direction in [1, -1]: + # generate test set + arrayB = -99 * (~expected) + (in_margin + arrayA * expected) + # test function + actual = isin_tolerance(arrayA, arrayB, tolerance) + + testing.assert_duckarray_equal(actual, expected) + assert type(actual) == type(expected) From 0cc1b6aa0e93b031fbdad4d95114dcb76f0d2c2b Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Sun, 17 Oct 2021 14:36:49 -0700 Subject: [PATCH 6/7] Using local assert_duckarray_equal & array type with isinstance. 
--- xarray/tests/test_duck_array_ops.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 7563c34c790..19a6ac783fa 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -7,7 +7,7 @@ import pytest from numpy import array, nan -from xarray import DataArray, Dataset, cftime_range, concat, testing +from xarray import DataArray, Dataset, cftime_range, concat from xarray.core import dtypes, duck_array_ops from xarray.core.duck_array_ops import ( array_notnull_equiv, @@ -902,6 +902,7 @@ def test_push_dask(): def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): if (dask_for_A or dask_for_B) and not has_dask: pytest.skip("requires dask") + from xarray.testing import assert_duckarray_equal in_margin = tolerance / 2 # measure within acceptable margin arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape) @@ -925,8 +926,11 @@ def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): # test function actual = isin_tolerance(arrayA, arrayB, tolerance) - testing.assert_duckarray_equal(actual, expected) - assert type(actual) == type(expected) + assert_duckarray_equal(actual, expected) + if dask_for_A: + assert isinstance(actual, dask_array_type) + else: + assert isinstance(actual, np.ndarray) else: # test only using numpy for offset_direction in [1, -1]: # generate test set @@ -934,5 +938,5 @@ def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): # test function actual = isin_tolerance(arrayA, arrayB, tolerance) - testing.assert_duckarray_equal(actual, expected) - assert type(actual) == type(expected) + assert_duckarray_equal(actual, expected) + assert isinstance(actual, np.ndarray) From 130f951c65f6a6da959b17ffbfb5af8d472c4023 Mon Sep 17 00:00:00 2001 From: Shane Hazelquist Date: Fri, 22 Oct 2021 17:15:27 -0700 Subject: [PATCH 7/7] Doc changes in whats-new.rst & cleaned up testing import permutation. 
--- doc/whats-new.rst | 2 ++ xarray/tests/test_duck_array_ops.py | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 87c85d45454..420433413e3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,8 @@ v0.19.1 (unreleased) New Features ~~~~~~~~~~~~ +- Added an optional ``tolerance`` argument to :py:func:`isin` for numerically close datasets (:issue:`5587`, :pull:`5862`). + By `Shane Hazelquist `. - Added a :py:func:`get_options` method to xarray's root namespace (:issue:`5698`, :pull:`5716`) By `Pushkar Kopparla `_. - Xarray now does a better job rendering variable names that are long LaTeX sequences when plotting (:issue:`5681`, :pull:`5682`). diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 19a6ac783fa..fcd1687924d 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -7,7 +7,7 @@ import pytest from numpy import array, nan -from xarray import DataArray, Dataset, cftime_range, concat +from xarray import DataArray, Dataset, cftime_range, concat, testing from xarray.core import dtypes, duck_array_ops from xarray.core.duck_array_ops import ( array_notnull_equiv, @@ -902,7 +902,6 @@ def test_push_dask(): def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): if (dask_for_A or dask_for_B) and not has_dask: pytest.skip("requires dask") - from xarray.testing import assert_duckarray_equal in_margin = tolerance / 2 # measure within acceptable margin arrayA = np.arange(-10.0, 10.0, 0.1).reshape(shape) @@ -926,7 +925,7 @@ def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): # test function actual = isin_tolerance(arrayA, arrayB, tolerance) - assert_duckarray_equal(actual, expected) + testing.assert_duckarray_equal(actual, expected) if dask_for_A: assert isinstance(actual, dask_array_type) else: @@ -938,5 +937,5 @@ def test_isin_tolerance(shape, tolerance, dask_for_A, dask_for_B): # test 
function actual = isin_tolerance(arrayA, arrayB, tolerance) - assert_duckarray_equal(actual, expected) + testing.assert_duckarray_equal(actual, expected) assert isinstance(actual, np.ndarray)