Fix GroupBy first, last with flox (#10173)

dcherian · web-flow · commit ec88c2885ffb · 2025-03-25T20:24:58.000-06:00
* Fix GroupBy first, last with flox. Closes #10169
* fix test
* parallelize upstream tests
diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml
@@ -84,7 +84,7 @@ jobs:
         if: success()
         id: status
         run: |
-          python -m pytest --timeout=60 -rf \
+          python -m pytest --timeout=60 -rf -nauto \
             --report-log output-${{ matrix.python-version }}-log.jsonl
       - name: Generate and publish the report
         if: |
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -41,6 +41,9 @@ Bug fixes
   This fixes the issue where using the ``zarr_version`` parameter would raise a deprecation warning telling the user to use
   a non-existent ``zarr_format`` parameter instead. (:issue:`10163`, :pull:`10164`)
   By `Karl Krauth <https://github.com/Karl-Krauth>`_.
+- Fix grouped and resampled ``first``, ``last`` with datetimes (:issue:`10169`, :pull:`10173`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+
 
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -1401,8 +1401,14 @@ def _first_or_last(
             and OPTIONS["use_flox"]
             and contains_only_chunked_or_numpy(self._obj)
         ):
+            import flox.xrdtypes
+
             result = self._flox_reduce(
-                dim=None, func=op, skipna=skipna, keep_attrs=keep_attrs
+                dim=None,
+                func=op,
+                skipna=skipna,
+                keep_attrs=keep_attrs,
+                fill_value=flox.xrdtypes.NA,
             )
         else:
             result = self.reduce(
diff --git a/xarray/tests/conftest.py b/xarray/tests/conftest.py
@@ -6,7 +6,7 @@
 
 import xarray as xr
 from xarray import DataArray, Dataset, DataTree
-from xarray.tests import create_test_data, requires_dask
+from xarray.tests import create_test_data, has_cftime, requires_dask
 
 
 @pytest.fixture(params=["numpy", pytest.param("dask", marks=requires_dask)])
@@ -97,6 +97,18 @@ def da(request, backend):
         raise ValueError
 
 
+@pytest.fixture(
+    params=[
+        False,
+        pytest.param(
+            True, marks=pytest.mark.skipif(not has_cftime, reason="no cftime")
+        ),
+    ]
+)
+def use_cftime(request):
+    return request.param
+
+
 @pytest.fixture(params=[Dataset, DataArray])
 def type(request):
     return request.param
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -1845,7 +1845,6 @@ def test_groupby_fastpath_for_monotonic(self, use_flox: bool) -> None:
 
 class TestDataArrayResample:
     @pytest.mark.parametrize("shuffle", [True, False])
-    @pytest.mark.parametrize("use_cftime", [True, False])
     @pytest.mark.parametrize(
         "resample_freq",
         [
@@ -1906,12 +1905,8 @@ def resample_as_pandas(array, *args, **kwargs):
         with pytest.raises(ValueError):
             reverse.resample(time=resample_freq).mean()
 
-    @pytest.mark.parametrize("use_cftime", [True, False])
     def test_resample_doctest(self, use_cftime: bool) -> None:
         # run the doctest example here so we are not surprised
-        if use_cftime and not has_cftime:
-            pytest.skip()
-
         da = xr.DataArray(
             np.array([1, 2, 3, 1, 2, np.nan]),
             dims="time",
@@ -1947,8 +1942,10 @@ def func(arg1, arg2, arg3=0.0):
         actual = da.resample(time="D").map(func, args=(1.0,), arg3=1.0)
         assert_identical(actual, expected)
 
-    def test_resample_first(self) -> None:
-        times = pd.date_range("2000-01-01", freq="6h", periods=10)
+    def test_resample_first_last(self, use_cftime) -> None:
+        times = xr.date_range(
+            "2000-01-01", freq="6h", periods=10, use_cftime=use_cftime
+        )
         array = DataArray(np.arange(10), [("time", times)])
 
         # resample to same frequency
@@ -1961,7 +1958,7 @@ def test_resample_first(self) -> None:
 
         # verify that labels don't use the first value
         actual = array.resample(time="24h").first()
-        expected = DataArray(array.to_series().resample("24h").first())
+        expected = array.isel(time=[0, 4, 8])
         assert_identical(expected, actual)
 
         # missing values
@@ -1978,10 +1975,17 @@ def test_resample_first(self) -> None:
         # regression test for https://stackoverflow.com/questions/33158558/
         array = Dataset({"time": times})["time"]
         actual = array.resample(time="1D").last()
-        expected_times = pd.to_datetime(
-            ["2000-01-01T18", "2000-01-02T18", "2000-01-03T06"], unit="ns"
+        expected = array.isel(time=[3, 7, 9]).assign_coords(time=times[::4])
+        assert_identical(expected, actual)
+
+        # missing periods, GH10169
+        actual = array.isel(time=[0, 1, 2, 3, 8, 9]).resample(time="1D").last()
+        expected = DataArray(
+            np.array([times[3], np.datetime64("NaT"), times[9]]),
+            dims="time",
+            coords={"time": times[::4]},
+            name="time",
         )
-        expected = DataArray(expected_times, [("time", times[::4])], name="time")
         assert_identical(expected, actual)
 
     def test_resample_bad_resample_dim(self) -> None:
@@ -2298,7 +2302,6 @@ def test_resample_origin(self) -> None:
 
 
 class TestDatasetResample:
-    @pytest.mark.parametrize("use_cftime", [True, False])
     @pytest.mark.parametrize(
         "resample_freq",
         [