From 4940c51ba56ee65653d95c83570aba895a22bc77 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Sat, 19 Apr 2025 11:22:37 -0400
Subject: [PATCH 1/2] BUG(string dtype): groupby/resampler.min/max returns
 float on all NA strings (#60985)

* BUG(string dtype): groupby/resampler.min/max returns float on all NA strings

* Merge cleanup

* whatsnew

* Add type-ignore

* Remove condition
---
 pandas/core/groupby/groupby.py          |  8 ++-
 pandas/tests/groupby/test_reductions.py | 93 +++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c8e2ccc7bdaeb..3a28de5222d15 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -85,6 +85,7 @@ class providing the base-class of operations.
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
+    is_string_dtype,
     needs_i8_conversion,
     pandas_dtype,
 )
@@ -1945,8 +1946,13 @@ def _agg_py_fallback(
             # preserve the kind of exception that raised
             raise type(err)(msg) from err
 
-        if ser.dtype == object:
+        dtype = ser.dtype
+        if dtype == object:
             res_values = res_values.astype(object, copy=False)
+        elif is_string_dtype(dtype):
+            # mypy doesn't infer dtype is an ExtensionDtype
+            string_array_cls = dtype.construct_array_type()  # type: ignore[union-attr]
+            res_values = string_array_cls._from_sequence(res_values, dtype=dtype)
 
         # If we are DataFrameGroupBy and went through a SeriesGroupByPath
         # then we need to reshape
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 599b0aabf85d5..6bd46c5b2088f 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -20,6 +20,7 @@
     isna,
 )
 import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
 from pandas.util import _test_decorators as td
 
 
@@ -710,6 +711,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+@pytest.mark.parametrize("test_series", [True, False])
+def test_string_dtype_all_na(
+    string_dtype_no_object, reduction_func, skipna, min_count, test_series
+):
+    # https://github.com/pandas-dev/pandas/issues/60985
+    if reduction_func == "corrwith":
+        # corrwith is deprecated.
+        return
+
+    dtype = string_dtype_no_object
+
+    if reduction_func in [
+        "any",
+        "all",
+        "idxmin",
+        "idxmax",
+        "mean",
+        "median",
+        "std",
+        "var",
+    ]:
+        kwargs = {"skipna": skipna}
+    elif reduction_func in ["kurt"]:
+        kwargs = {"min_count": min_count}
+    elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
+        kwargs = {}
+    else:
+        kwargs = {"skipna": skipna, "min_count": min_count}
+
+    expected_dtype, expected_value = dtype, pd.NA
+    if reduction_func in ["all", "any"]:
+        expected_dtype = "bool"
+        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
+        expected_value = not skipna if reduction_func == "any" else True
+    elif reduction_func in ["count", "nunique", "size"]:
+        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
+        if (
+            test_series
+            and reduction_func == "size"
+            and dtype.storage == "pyarrow"
+            and dtype.na_value is pd.NA
+        ):
+            expected_dtype = "Int64"
+        else:
+            expected_dtype = "int64"
+        expected_value = 1 if reduction_func == "size" else 0
+    elif reduction_func in ["idxmin", "idxmax"]:
+        expected_dtype, expected_value = "float64", np.nan
+    elif not skipna or min_count > 0:
+        expected_value = pd.NA
+    elif reduction_func == "sum":
+        # https://github.com/pandas-dev/pandas/pull/60936
+        expected_value = ""
+
+    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
+    obj = df["b"] if test_series else df
+    args = get_groupby_method_args(reduction_func, obj)
+    gb = obj.groupby(df["a"])
+    method = getattr(gb, reduction_func)
+
+    if reduction_func in [
+        "mean",
+        "median",
+        "kurt",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ]:
+        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+        return
+    elif reduction_func in ["idxmin", "idxmax"] and not skipna:
+        msg = f"{reduction_func} with skipna=False encountered an NA value."
+        with pytest.raises(ValueError, match=msg):
+            method(*args, **kwargs)
+        return
+
+    result = method(*args, **kwargs)
+    index = pd.Index(["x"], name="a", dtype=dtype)
+    if test_series or reduction_func == "size":
+        name = None if not test_series and reduction_func == "size" else "b"
+        expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
+    else:
+        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
+    tm.assert_equal(result, expected)
+
+
 def test_max_nan_bug():
     df = DataFrame(
         {

From 3ba6011548de77a0d97f7be80d51c2cd72da13c5 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 11 Jun 2025 18:06:00 +0200
Subject: [PATCH 2/2] skipna not yet available on 2.3 for groupby

---
 pandas/tests/groupby/test_reductions.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 6bd46c5b2088f..55408a485c816 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -714,7 +714,7 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
 @pytest.mark.parametrize("min_count", [0, 1])
 @pytest.mark.parametrize("test_series", [True, False])
 def test_string_dtype_all_na(
-    string_dtype_no_object, reduction_func, skipna, min_count, test_series
+    string_dtype_no_object, reduction_func, min_count, test_series
 ):
     # https://github.com/pandas-dev/pandas/issues/60985
     if reduction_func == "corrwith":
@@ -733,19 +733,19 @@ def test_string_dtype_all_na(
         "std",
         "var",
     ]:
-        kwargs = {"skipna": skipna}
+        kwargs = {}
     elif reduction_func in ["kurt"]:
         kwargs = {"min_count": min_count}
     elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
         kwargs = {}
     else:
-        kwargs = {"skipna": skipna, "min_count": min_count}
+        kwargs = {"min_count": min_count}
 
     expected_dtype, expected_value = dtype, pd.NA
     if reduction_func in ["all", "any"]:
         expected_dtype = "bool"
         # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
-        expected_value = not skipna if reduction_func == "any" else True
+        expected_value = False if reduction_func == "any" else True
     elif reduction_func in ["count", "nunique", "size"]:
         # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
         if (
@@ -760,7 +760,7 @@ def test_string_dtype_all_na(
         expected_value = 1 if reduction_func == "size" else 0
     elif reduction_func in ["idxmin", "idxmax"]:
         expected_dtype, expected_value = "float64", np.nan
-    elif not skipna or min_count > 0:
+    elif min_count > 0:
         expected_value = pd.NA
     elif reduction_func == "sum":
         # https://github.com/pandas-dev/pandas/pull/60936
@@ -787,11 +787,6 @@ def test_string_dtype_all_na(
         with pytest.raises(TypeError, match=msg):
             method(*args, **kwargs)
         return
-    elif reduction_func in ["idxmin", "idxmax"] and not skipna:
-        msg = f"{reduction_func} with skipna=False encountered an NA value."
-        with pytest.raises(ValueError, match=msg):
-            method(*args, **kwargs)
-        return
 
     result = method(*args, **kwargs)
     index = pd.Index(["x"], name="a", dtype=dtype)