Skip to content

Commit 21d8645

Browse files
headtr1ck and dcherian
authored
Support complex arrays in xr.corr (#7392)
* complex cov * fix mypy * update whatsa-new * Update xarray/core/computation.py * slight improvements to tests * bugfix in corr_cov for multiple dims * fix whats-new * allow refreshing of backends * Revert "allow refreshing of backends" This reverts commit 576692b. --------- Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
1 parent cd90184 commit 21d8645

File tree

3 files changed

+82
-60
lines changed

3 files changed

+82
-60
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ v2023.03.0 (unreleased)
2323
New Features
2424
~~~~~~~~~~~~
2525

26+
- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support complex-valued arrays (:issue:`7340`, :pull:`7392`).
27+
By `Michael Niklas <https://github.com/headtr1ck>`_.
2628

2729
Breaking changes
2830
~~~~~~~~~~~~~~~~

xarray/core/computation.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,16 @@
99
import warnings
1010
from collections import Counter
1111
from collections.abc import Hashable, Iterable, Mapping, Sequence
12-
from typing import TYPE_CHECKING, AbstractSet, Any, Callable, TypeVar, Union, overload
12+
from typing import (
13+
TYPE_CHECKING,
14+
AbstractSet,
15+
Any,
16+
Callable,
17+
Literal,
18+
TypeVar,
19+
Union,
20+
overload,
21+
)
1322

1423
import numpy as np
1524

@@ -21,7 +30,7 @@
2130
from xarray.core.merge import merge_attrs, merge_coordinates_without_align
2231
from xarray.core.options import OPTIONS, _get_keep_attrs
2332
from xarray.core.pycompat import is_duck_dask_array
24-
from xarray.core.types import T_DataArray
33+
from xarray.core.types import Dims, T_DataArray
2534
from xarray.core.utils import is_dict_like, is_scalar
2635
from xarray.core.variable import Variable
2736

@@ -1209,7 +1218,9 @@ def apply_ufunc(
12091218
return apply_array_ufunc(func, *args, dask=dask)
12101219

12111220

1212-
def cov(da_a, da_b, dim=None, ddof=1):
1221+
def cov(
1222+
da_a: T_DataArray, da_b: T_DataArray, dim: Dims = None, ddof: int = 1
1223+
) -> T_DataArray:
12131224
"""
12141225
Compute covariance between two DataArray objects along a shared dimension.
12151226
@@ -1219,9 +1230,9 @@ def cov(da_a, da_b, dim=None, ddof=1):
12191230
Array to compute.
12201231
da_b : DataArray
12211232
Array to compute.
1222-
dim : str, optional
1233+
dim : str, iterable of hashable, "..." or None, optional
12231234
The dimension along which the covariance will be computed
1224-
ddof : int, optional
1235+
ddof : int, default: 1
12251236
If ddof=1, covariance is normalized by N-1, giving an unbiased estimate,
12261237
else normalization is by N.
12271238
@@ -1289,7 +1300,7 @@ def cov(da_a, da_b, dim=None, ddof=1):
12891300
return _cov_corr(da_a, da_b, dim=dim, ddof=ddof, method="cov")
12901301

12911302

1292-
def corr(da_a, da_b, dim=None):
1303+
def corr(da_a: T_DataArray, da_b: T_DataArray, dim: Dims = None) -> T_DataArray:
12931304
"""
12941305
Compute the Pearson correlation coefficient between
12951306
two DataArray objects along a shared dimension.
@@ -1300,7 +1311,7 @@ def corr(da_a, da_b, dim=None):
13001311
Array to compute.
13011312
da_b : DataArray
13021313
Array to compute.
1303-
dim : str, optional
1314+
dim : str, iterable of hashable, "..." or None, optional
13041315
The dimension along which the correlation will be computed
13051316
13061317
Returns
@@ -1368,7 +1379,11 @@ def corr(da_a, da_b, dim=None):
13681379

13691380

13701381
def _cov_corr(
1371-
da_a: T_DataArray, da_b: T_DataArray, dim=None, ddof=0, method=None
1382+
da_a: T_DataArray,
1383+
da_b: T_DataArray,
1384+
dim: Dims = None,
1385+
ddof: int = 0,
1386+
method: Literal["cov", "corr", None] = None,
13721387
) -> T_DataArray:
13731388
"""
13741389
Internal method for xr.cov() and xr.corr() so only have to
@@ -1388,22 +1403,21 @@ def _cov_corr(
13881403
demeaned_da_b = da_b - da_b.mean(dim=dim)
13891404

13901405
# 4. Compute covariance along the given dim
1391-
#
13921406
# N.B. `skipna=True` is required or auto-covariance is computed incorrectly. E.g.
13931407
# Try xr.cov(da,da) for da = xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"])
1394-
cov = (demeaned_da_a * demeaned_da_b).sum(dim=dim, skipna=True, min_count=1) / (
1395-
valid_count
1396-
)
1408+
cov = (demeaned_da_a.conj() * demeaned_da_b).sum(
1409+
dim=dim, skipna=True, min_count=1
1410+
) / (valid_count)
13971411

13981412
if method == "cov":
1399-
return cov
1413+
return cov # type: ignore[return-value]
14001414

14011415
else:
14021416
# compute std + corr
14031417
da_a_std = da_a.std(dim=dim)
14041418
da_b_std = da_b.std(dim=dim)
14051419
corr = cov / (da_a_std * da_b_std)
1406-
return corr
1420+
return corr # type: ignore[return-value]
14071421

14081422

14091423
def cross(
@@ -1616,7 +1630,7 @@ def cross(
16161630

16171631
def dot(
16181632
*arrays,
1619-
dims: str | Iterable[Hashable] | ellipsis | None = None,
1633+
dims: Dims = None,
16201634
**kwargs: Any,
16211635
):
16221636
"""Generalized dot product for xarray objects. Like np.einsum, but
@@ -1626,7 +1640,7 @@ def dot(
16261640
----------
16271641
*arrays : DataArray or Variable
16281642
Arrays to compute.
1629-
dims : ..., str or tuple of str, optional
1643+
dims : str, iterable of hashable, "..." or None, optional
16301644
Which dimensions to sum over. Ellipsis ('...') sums over all dimensions.
16311645
If not specified, then all the common dimensions are summed over.
16321646
**kwargs : dict

xarray/tests/test_computation.py

Lines changed: 50 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,25 +1387,29 @@ def test_vectorize_exclude_dims_dask() -> None:
13871387

13881388
def test_corr_only_dataarray() -> None:
13891389
with pytest.raises(TypeError, match="Only xr.DataArray is supported"):
1390-
xr.corr(xr.Dataset(), xr.Dataset())
1390+
xr.corr(xr.Dataset(), xr.Dataset()) # type: ignore[type-var]
13911391

13921392

1393-
def arrays_w_tuples():
1393+
@pytest.fixture(scope="module")
1394+
def arrays():
13941395
da = xr.DataArray(
13951396
np.random.random((3, 21, 4)),
13961397
coords={"time": pd.date_range("2000-01-01", freq="1D", periods=21)},
13971398
dims=("a", "time", "x"),
13981399
)
13991400

1400-
arrays = [
1401+
return [
14011402
da.isel(time=range(0, 18)),
14021403
da.isel(time=range(2, 20)).rolling(time=3, center=True).mean(),
14031404
xr.DataArray([[1, 2], [1, np.nan]], dims=["x", "time"]),
14041405
xr.DataArray([[1, 2], [np.nan, np.nan]], dims=["x", "time"]),
14051406
xr.DataArray([[1, 2], [2, 1]], dims=["x", "time"]),
14061407
]
14071408

1408-
array_tuples = [
1409+
1410+
@pytest.fixture(scope="module")
1411+
def array_tuples(arrays):
1412+
return [
14091413
(arrays[0], arrays[0]),
14101414
(arrays[0], arrays[1]),
14111415
(arrays[1], arrays[1]),
@@ -1417,27 +1421,19 @@ def arrays_w_tuples():
14171421
(arrays[4], arrays[4]),
14181422
]
14191423

1420-
return arrays, array_tuples
1421-
14221424

14231425
@pytest.mark.parametrize("ddof", [0, 1])
1424-
@pytest.mark.parametrize(
1425-
"da_a, da_b",
1426-
[
1427-
arrays_w_tuples()[1][3],
1428-
arrays_w_tuples()[1][4],
1429-
arrays_w_tuples()[1][5],
1430-
arrays_w_tuples()[1][6],
1431-
arrays_w_tuples()[1][7],
1432-
arrays_w_tuples()[1][8],
1433-
],
1434-
)
1426+
@pytest.mark.parametrize("n", [3, 4, 5, 6, 7, 8])
14351427
@pytest.mark.parametrize("dim", [None, "x", "time"])
14361428
@requires_dask
1437-
def test_lazy_corrcov(da_a, da_b, dim, ddof) -> None:
1429+
def test_lazy_corrcov(
1430+
n: int, dim: str | None, ddof: int, array_tuples: tuple[xr.DataArray, xr.DataArray]
1431+
) -> None:
14381432
# GH 5284
14391433
from dask import is_dask_collection
14401434

1435+
da_a, da_b = array_tuples[n]
1436+
14411437
with raise_if_dask_computes():
14421438
cov = xr.cov(da_a.chunk(), da_b.chunk(), dim=dim, ddof=ddof)
14431439
assert is_dask_collection(cov)
@@ -1447,12 +1443,13 @@ def test_lazy_corrcov(da_a, da_b, dim, ddof) -> None:
14471443

14481444

14491445
@pytest.mark.parametrize("ddof", [0, 1])
1450-
@pytest.mark.parametrize(
1451-
"da_a, da_b",
1452-
[arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
1453-
)
1446+
@pytest.mark.parametrize("n", [0, 1, 2])
14541447
@pytest.mark.parametrize("dim", [None, "time"])
1455-
def test_cov(da_a, da_b, dim, ddof) -> None:
1448+
def test_cov(
1449+
n: int, dim: str | None, ddof: int, array_tuples: tuple[xr.DataArray, xr.DataArray]
1450+
) -> None:
1451+
da_a, da_b = array_tuples[n]
1452+
14561453
if dim is not None:
14571454

14581455
def np_cov_ind(ts1, ts2, a, x):
@@ -1499,12 +1496,13 @@ def np_cov(ts1, ts2):
14991496
assert_allclose(actual, expected)
15001497

15011498

1502-
@pytest.mark.parametrize(
1503-
"da_a, da_b",
1504-
[arrays_w_tuples()[1][0], arrays_w_tuples()[1][1], arrays_w_tuples()[1][2]],
1505-
)
1499+
@pytest.mark.parametrize("n", [0, 1, 2])
15061500
@pytest.mark.parametrize("dim", [None, "time"])
1507-
def test_corr(da_a, da_b, dim) -> None:
1501+
def test_corr(
1502+
n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
1503+
) -> None:
1504+
da_a, da_b = array_tuples[n]
1505+
15081506
if dim is not None:
15091507

15101508
def np_corr_ind(ts1, ts2, a, x):
@@ -1547,12 +1545,12 @@ def np_corr(ts1, ts2):
15471545
assert_allclose(actual, expected)
15481546

15491547

1550-
@pytest.mark.parametrize(
1551-
"da_a, da_b",
1552-
arrays_w_tuples()[1],
1553-
)
1548+
@pytest.mark.parametrize("n", range(9))
15541549
@pytest.mark.parametrize("dim", [None, "time", "x"])
1555-
def test_covcorr_consistency(da_a, da_b, dim) -> None:
1550+
def test_covcorr_consistency(
1551+
n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
1552+
) -> None:
1553+
da_a, da_b = array_tuples[n]
15561554
# Testing that xr.corr and xr.cov are consistent with each other
15571555
# 1. Broadcast the two arrays
15581556
da_a, da_b = broadcast(da_a, da_b)
@@ -1569,10 +1567,13 @@ def test_covcorr_consistency(da_a, da_b, dim) -> None:
15691567

15701568

15711569
@requires_dask
1572-
@pytest.mark.parametrize("da_a, da_b", arrays_w_tuples()[1])
1570+
@pytest.mark.parametrize("n", range(9))
15731571
@pytest.mark.parametrize("dim", [None, "time", "x"])
15741572
@pytest.mark.filterwarnings("ignore:invalid value encountered in .*divide")
1575-
def test_corr_lazycorr_consistency(da_a, da_b, dim) -> None:
1573+
def test_corr_lazycorr_consistency(
1574+
n: int, dim: str | None, array_tuples: tuple[xr.DataArray, xr.DataArray]
1575+
) -> None:
1576+
da_a, da_b = array_tuples[n]
15761577
da_al = da_a.chunk()
15771578
da_bl = da_b.chunk()
15781579
c_abl = xr.corr(da_al, da_bl, dim=dim)
@@ -1591,22 +1592,27 @@ def test_corr_dtype_error():
15911592
xr.testing.assert_equal(xr.corr(da_a, da_b), xr.corr(da_a, da_b.chunk()))
15921593

15931594

1594-
@pytest.mark.parametrize(
1595-
"da_a",
1596-
arrays_w_tuples()[0],
1597-
)
1595+
@pytest.mark.parametrize("n", range(5))
15981596
@pytest.mark.parametrize("dim", [None, "time", "x", ["time", "x"]])
1599-
def test_autocov(da_a, dim) -> None:
1597+
def test_autocov(n: int, dim: str | None, arrays) -> None:
1598+
da = arrays[n]
1599+
16001600
# Testing that the autocovariance*(N-1) is ~=~ to the variance matrix
16011601
# 1. Ignore the nans
1602-
valid_values = da_a.notnull()
1602+
valid_values = da.notnull()
16031603
# Because we're using ddof=1, this requires > 1 value in each sample
1604-
da_a = da_a.where(valid_values.sum(dim=dim) > 1)
1605-
expected = ((da_a - da_a.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
1606-
actual = xr.cov(da_a, da_a, dim=dim) * (valid_values.sum(dim) - 1)
1604+
da = da.where(valid_values.sum(dim=dim) > 1)
1605+
expected = ((da - da.mean(dim=dim)) ** 2).sum(dim=dim, skipna=True, min_count=1)
1606+
actual = xr.cov(da, da, dim=dim) * (valid_values.sum(dim) - 1)
16071607
assert_allclose(actual, expected)
16081608

16091609

1610+
def test_complex_cov() -> None:
1611+
da = xr.DataArray([1j, -1j])
1612+
actual = xr.cov(da, da)
1613+
assert abs(actual.item()) == 2
1614+
1615+
16101616
@requires_dask
16111617
def test_vectorize_dask_new_output_dims() -> None:
16121618
# regression test for GH3574

0 commit comments

Comments
 (0)