Add tests

dcherian · dcherian · commit 5e2fdfb77802 · 2024-08-30T13:38:57.000-06:00
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -566,7 +566,7 @@ def sizes(self) -> Mapping[Hashable, int]:
             self._sizes = self._obj.isel({self._group_dim: index}).sizes
         return self._sizes
 
-    def shuffle(self, chunks: T_Chunks = None) -> DataArrayGroupBy | DatasetGroupBy:
+    def shuffle(self, chunks: T_Chunks = None):
         """
         Sort or "shuffle" the underlying object.
 
@@ -610,7 +610,10 @@ def shuffle(self, chunks: T_Chunks = None) -> DataArrayGroupBy | DatasetGroupBy:
         """
         (grouper,) = self.groupers
         return self._shuffle_obj(chunks).groupby(
-            {grouper.name: grouper.grouper.reset()},
+            # Using group.name handles the BinGrouper case
+            # It does *not* handle the TimeResampler case,
+            # so we just override this method in Resample
+            {grouper.group.name: grouper.grouper.reset()},
             restore_coord_dims=self._restore_coord_dims,
         )
 
@@ -624,11 +627,11 @@ def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray:
         as_dataset = self._obj._to_temp_dataset() if was_array else self._obj
         no_slices: list[list[int]] = [
             list(range(*idx.indices(size))) if isinstance(idx, slice) else idx
-            for idx in self._group_indices
+            for idx in self.encoded.group_indices
         ]
 
         if grouper.name not in as_dataset._variables:
-            as_dataset.coords[grouper.name] = grouper.group1d
+            as_dataset.coords[grouper.name] = grouper.group
 
         # Shuffling is only different from `isel` for chunked arrays.
         # Extract them out, and treat them specially. The rest, we route through isel.
@@ -644,7 +647,7 @@ def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray:
         shuffled = subset.isel({dim: np.concatenate(no_slices)})
         for name, var in is_chunked.items():
             shuffled[name] = var._shuffle(
-                indices=list(self._group_indices), dim=dim, chunks=chunks
+                indices=list(self.encoded.group_indices), dim=dim, chunks=chunks
             )
         shuffled = self._maybe_unstack(shuffled)
         new_obj = self._obj._from_temp_dataset(shuffled) if was_array else shuffled
diff --git a/xarray/core/resample.py b/xarray/core/resample.py
@@ -2,7 +2,7 @@
 
 import warnings
 from collections.abc import Callable, Hashable, Iterable, Sequence
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 from xarray.core._aggregations import (
     DataArrayResampleAggregations,
@@ -14,6 +14,8 @@
 if TYPE_CHECKING:
     from xarray.core.dataarray import DataArray
     from xarray.core.dataset import Dataset
+    from xarray.core.types import T_Chunks
+    from xarray.groupers import Resampler
 
 from xarray.groupers import RESAMPLE_DIM
 
@@ -58,6 +60,60 @@ def _flox_reduce(
         result = result.rename({RESAMPLE_DIM: self._group_dim})
         return result
 
+    def shuffle(self, chunks: T_Chunks = None):
+        """
+        Sort or "shuffle" the underlying object.
+
+        "Shuffle" means the object is sorted so that all group members occur sequentially,
+        in the same chunk. Multiple groups may occur in the same chunk.
+        This method is particularly useful for chunked arrays (e.g. dask, cubed),
+        especially when you need to map a function that requires all members of a group
+        to be present in a single chunk. For chunked array types, the order of appearance
+        is not guaranteed, but will depend on the input chunking.
+
+        .. warning::
+
+           With resampling it is a lot better to use ``.chunk`` instead of ``.shuffle``,
+           since one can only resample a sorted time coordinate.
+
+        Parameters
+        ----------
+        chunks : int, tuple of int, "auto" or mapping of hashable to int or tuple of int, optional
+            How to adjust chunks along dimensions not present in the array being grouped by.
+
+        Returns
+        -------
+        DataArrayResample or DatasetResample
+
+        Examples
+        --------
+        >>> import dask.array
+        >>> da = xr.DataArray(
+        ...     dims="x",
+        ...     data=dask.array.arange(10, chunks=3),
+        ...     coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]},
+        ...     name="a",
+        ... )
+        >>> shuffled = da.groupby("x").shuffle()
+        >>> shuffled.quantile(q=0.5).compute()
+        <xarray.DataArray 'a' (x: 4)> Size: 32B
+        array([9., 3., 4., 5.])
+        Coordinates:
+            quantile  float64 8B 0.5
+          * x         (x) int64 32B 0 1 2 3
+
+        See Also
+        --------
+        dask.dataframe.DataFrame.shuffle
+        dask.array.shuffle
+        """
+        (grouper,) = self.groupers  # unpacking requires exactly one grouper
+        shuffled = self._shuffle_obj(chunks).drop_vars(RESAMPLE_DIM)
+        return shuffled.resample(
+            {self._group_dim: cast("Resampler", grouper.grouper.reset())},
+            restore_coord_dims=self._restore_coord_dims,
+        )
+
     def _drop_coords(self) -> T_Xarray:
         """Drop non-dimension coordinates along the resampled dimension."""
         obj = self._obj
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -1659,13 +1659,14 @@ def test_groupby_bins(
         )
 
         with xr.set_options(use_flox=use_flox):
-            actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).sum()
+            gb = array.groupby_bins("dim_0", bins=bins, **cut_kwargs)
+            actual = gb.sum()
             assert_identical(expected, actual)
+            assert_identical(expected, gb.shuffle().sum())
 
-            actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).map(
-                lambda x: x.sum()
-            )
+            actual = gb.map(lambda x: x.sum())
             assert_identical(expected, actual)
+            assert_identical(expected, gb.shuffle().map(lambda x: x.sum()))
 
             # make sure original array dims are unchanged
             assert len(array.dim_0) == 4
@@ -1810,8 +1811,9 @@ def test_groupby_fastpath_for_monotonic(self, use_flox: bool) -> None:
 
 
 class TestDataArrayResample:
+    @pytest.mark.parametrize("shuffle", [True, False])
     @pytest.mark.parametrize("use_cftime", [True, False])
-    def test_resample(self, use_cftime: bool) -> None:
+    def test_resample(self, use_cftime: bool, shuffle: bool) -> None:
         if use_cftime and not has_cftime:
             pytest.skip()
         times = xr.date_range(
@@ -1833,16 +1835,22 @@ def resample_as_pandas(array, *args, **kwargs):
 
         array = DataArray(np.arange(10), [("time", times)])
 
-        actual = array.resample(time="24h").mean()
+        rs = array.resample(time="24h")
+
+        actual = rs.mean()
         expected = resample_as_pandas(array, "24h")
         assert_identical(expected, actual)
+        assert_identical(expected, rs.shuffle().mean())
 
-        actual = array.resample(time="24h").reduce(np.mean)
-        assert_identical(expected, actual)
+        assert_identical(expected, rs.reduce(np.mean))
+        assert_identical(expected, rs.shuffle().reduce(np.mean))
 
-        actual = array.resample(time="24h", closed="right").mean()
+        rs = array.resample(time="24h", closed="right")
+        actual = rs.mean()
+        shuffled = rs.shuffle().mean()
         expected = resample_as_pandas(array, "24h", closed="right")
         assert_identical(expected, actual)
+        assert_identical(expected, shuffled)
 
         with pytest.raises(ValueError, match=r"Index must be monotonic"):
             array[[2, 0, 1]].resample(time="1D")
@@ -2795,6 +2803,27 @@ def test_multiple_groupers_mixed(use_flox) -> None:
     # ------
 
 
+@requires_dask
+def test_groupby_shuffle() -> None:
+    """Reducing a shuffled GroupBy must give the same result as the original."""
+    import dask.array
+
+    da = DataArray(
+        dask.array.from_array(np.array([1, 2, 3, 0, 2, np.nan]), chunks=2),
+        dims="d",
+        coords=dict(
+            labels1=("d", np.array(["a", "b", "c", "c", "b", "a"])),
+            labels2=("d", np.array(["x", "y", "z", "z", "y", "x"])),
+        ),
+        name="foo",
+    )
+
+    gb = da.groupby("labels1")
+    shuffled = gb.shuffle()
+    with xr.set_options(use_flox=False):
+        xr.testing.assert_identical(gb.mean(), shuffled.mean())
+
+
 # Possible property tests
 # 1. lambda x: x
 # 2. grouped-reduce on unique coords is identical to array