Support shuffling with multiple groupers

dcherian · dcherian · commit a22c7ed0166a · 2024-08-30T14:01:37.000-06:00
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -608,19 +608,21 @@ def shuffle(self, chunks: T_Chunks = None):
         dask.dataframe.DataFrame.shuffle
         dask.array.shuffle
         """
-        (grouper,) = self.groupers
-        return self._shuffle_obj(chunks).groupby(
+        new_groupers = {
             # Using group.name handles the BinGrouper case
             # It does *not* handle the TimeResampler case,
             # so we just override this method in Resample
-            {grouper.group.name: grouper.grouper.reset()},
+            grouper.group.name: grouper.grouper.reset()
+            for grouper in self.groupers
+        }
+        return self._shuffle_obj(chunks).groupby(
+            new_groupers,
             restore_coord_dims=self._restore_coord_dims,
         )
 
     def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray:
         from xarray.core.dataarray import DataArray
 
-        (grouper,) = self.groupers
         dim = self._group_dim
         size = self._obj.sizes[dim]
         was_array = isinstance(self._obj, DataArray)
@@ -629,9 +631,11 @@ def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray:
             list(range(*idx.indices(size))) if isinstance(idx, slice) else idx
             for idx in self.encoded.group_indices
         ]
+        no_slices = [idx for idx in no_slices if idx]
 
-        if grouper.name not in as_dataset._variables:
-            as_dataset.coords[grouper.name] = grouper.group
+        for grouper in self.groupers:
+            if grouper.name not in as_dataset._variables:
+                as_dataset.coords[grouper.name] = grouper.group
 
         # Shuffling is only different from `isel` for chunked arrays.
         # Extract them out, and treat them specially. The rest, we route through isel.
@@ -644,10 +648,13 @@ def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray:
         subset = as_dataset[
             [name for name in as_dataset._variables if name not in is_chunked]
         ]
+
         shuffled = subset.isel({dim: np.concatenate(no_slices)})
         for name, var in is_chunked.items():
             shuffled[name] = var._shuffle(
-                indices=list(self.encoded.group_indices), dim=dim, chunks=chunks
+                indices=list(idx for idx in self.encoded.group_indices if idx),
+                dim=dim,
+                chunks=chunks,
             )
         shuffled = self._maybe_unstack(shuffled)
         new_obj = self._obj._from_temp_dataset(shuffled) if was_array else shuffled
@@ -861,7 +868,9 @@ def _maybe_unstack(self, obj):
             #       and `inserted_dims`
             # if multiple groupers all share the same single dimension, then
             # we don't stack/unstack. Do that manually now.
-            obj = obj.unstack(*self.encoded.unique_coord.dims)
+            dims_to_unstack = self.encoded.unique_coord.dims
+            if all(dim in obj.dims for dim in dims_to_unstack):
+                obj = obj.unstack(*dims_to_unstack)
             to_drop = [
                 grouper.name
                 for grouper in self.groupers
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -2684,8 +2684,9 @@ def test_weather_data_resample(use_flox):
     assert expected.location.attrs == ds.location.attrs
 
 
+@pytest.mark.parametrize("shuffle", [True, False])
 @pytest.mark.parametrize("use_flox", [True, False])
-def test_multiple_groupers(use_flox) -> None:
+def test_multiple_groupers(use_flox: bool, shuffle: bool) -> None:
     da = DataArray(
         np.array([1, 2, 3, 0, 2, np.nan]),
         dims="d",
@@ -2697,6 +2698,8 @@ def test_multiple_groupers(use_flox) -> None:
     )
 
     gb = da.groupby(labels1=UniqueGrouper(), labels2=UniqueGrouper())
+    if shuffle:
+        gb = gb.shuffle()
     repr(gb)
 
     expected = DataArray(
@@ -2716,6 +2719,8 @@ def test_multiple_groupers(use_flox) -> None:
     coords = {"a": ("x", [0, 0, 1, 1]), "b": ("y", [0, 0, 1, 1])}
     square = DataArray(np.arange(16).reshape(4, 4), coords=coords, dims=["x", "y"])
     gb = square.groupby(a=UniqueGrouper(), b=UniqueGrouper())
+    if shuffle:
+        gb = gb.shuffle()
     repr(gb)
     with xr.set_options(use_flox=use_flox):
         actual = gb.mean()
@@ -2739,11 +2744,15 @@ def test_multiple_groupers(use_flox) -> None:
         dims=["x", "y", "z"],
     )
     gb = b.groupby(x=UniqueGrouper(), y=UniqueGrouper())
+    if shuffle:
+        gb = gb.shuffle()
     repr(gb)
     with xr.set_options(use_flox=use_flox):
         assert_identical(gb.mean("z"), b.mean("z"))
 
     gb = b.groupby(x=UniqueGrouper(), xy=UniqueGrouper())
+    if shuffle:
+        gb = gb.shuffle()
     repr(gb)
     with xr.set_options(use_flox=use_flox):
         actual = gb.mean()
@@ -2758,13 +2767,16 @@ def test_multiple_groupers(use_flox) -> None:
 
 
 @pytest.mark.parametrize("use_flox", [True, False])
-def test_multiple_groupers_mixed(use_flox) -> None:
+@pytest.mark.parametrize("shuffle", [True, False])
+def test_multiple_groupers_mixed(use_flox: bool, shuffle: bool) -> None:
     # This groupby has missing groups
     ds = xr.Dataset(
         {"foo": (("x", "y"), np.arange(12).reshape((4, 3)))},
         coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))},
     )
     gb = ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper())
+    if shuffle:
+        gb = gb.shuffle()
     expected_data = np.array(
         [
             [[0.0, np.nan], [np.nan, 3.0]],
@@ -2803,27 +2815,6 @@ def test_multiple_groupers_mixed(use_flox) -> None:
     # ------
 
 
-@requires_dask
-def test_groupby_shuffle():
-    import dask
-
-    da = DataArray(
-        dask.array.from_array(np.array([1, 2, 3, 0, 2, np.nan]), chunks=2),
-        dims="d",
-        coords=dict(
-            labels1=("d", np.array(["a", "b", "c", "c", "b", "a"])),
-            labels2=("d", np.array(["x", "y", "z", "z", "y", "x"])),
-        ),
-        name="foo",
-    )
-
-    gb = da.groupby("labels1")
-    shuffled = gb.shuffle()
-    shuffled_obj = shuffled._obj
-    with xr.set_options(use_flox=False):
-        xr.testing.assert_identical(gb.mean(), shuffled.mean())
-
-
 # Possible property tests
 # 1. lambda x: x
 # 2. grouped-reduce on unique coords is identical to array