From 4940c51ba56ee65653d95c83570aba895a22bc77 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 19 Apr 2025 11:22:37 -0400 Subject: [PATCH 1/2] BUG(string dtype): groupby/resampler.min/max returns float on all NA strings (#60985) * BUG(string dtype): groupby/resampler.min/max returns float on all NA strings * Merge cleanup * whatsnew * Add type-ignore * Remove condition --- pandas/core/groupby/groupby.py | 8 ++- pandas/tests/groupby/test_reductions.py | 93 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c8e2ccc7bdaeb..3a28de5222d15 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -85,6 +85,7 @@ class providing the base-class of operations. is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, needs_i8_conversion, pandas_dtype, ) @@ -1945,8 +1946,13 @@ def _agg_py_fallback( # preserve the kind of exception that raised raise type(err)(msg) from err - if ser.dtype == object: + dtype = ser.dtype + if dtype == object: res_values = res_values.astype(object, copy=False) + elif is_string_dtype(dtype): + # mypy doesn't infer dtype is an ExtensionDtype + string_array_cls = dtype.construct_array_type() # type: ignore[union-attr] + res_values = string_array_cls._from_sequence(res_values, dtype=dtype) # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 599b0aabf85d5..6bd46c5b2088f 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -20,6 +20,7 @@ isna, ) import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args from pandas.util import _test_decorators as td @@ -710,6 +711,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +@pytest.mark.parametrize("test_series", [True, False]) +def test_string_dtype_all_na( + string_dtype_no_object, reduction_func, skipna, min_count, test_series +): + # https://github.com/pandas-dev/pandas/issues/60985 + if reduction_func == "corrwith": + # corrwith is deprecated. + return + + dtype = string_dtype_no_object + + if reduction_func in [ + "any", + "all", + "idxmin", + "idxmax", + "mean", + "median", + "std", + "var", + ]: + kwargs = {"skipna": skipna} + elif reduction_func in ["kurt"]: + kwargs = {"min_count": min_count} + elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]: + kwargs = {} + else: + kwargs = {"skipna": skipna, "min_count": min_count} + + expected_dtype, expected_value = dtype, pd.NA + if reduction_func in ["all", "any"]: + expected_dtype = "bool" + # TODO: For skipna=False, bool(pd.NA) raises; should groupby? + expected_value = not skipna if reduction_func == "any" else True + elif reduction_func in ["count", "nunique", "size"]: + # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA? + if ( + test_series + and reduction_func == "size" + and dtype.storage == "pyarrow" + and dtype.na_value is pd.NA + ): + expected_dtype = "Int64" + else: + expected_dtype = "int64" + expected_value = 1 if reduction_func == "size" else 0 + elif reduction_func in ["idxmin", "idxmax"]: + expected_dtype, expected_value = "float64", np.nan + elif not skipna or min_count > 0: + expected_value = pd.NA + elif reduction_func == "sum": + # https://github.com/pandas-dev/pandas/pull/60936 + expected_value = "" + + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) + obj = df["b"] if test_series else df + args = get_groupby_method_args(reduction_func, obj) + gb = obj.groupby(df["a"]) + method = getattr(gb, reduction_func) + + if reduction_func in [ + "mean", + "median", + "kurt", + "prod", + "quantile", + "sem", + "skew", + "std", + "var", + ]: + msg = f"dtype '{dtype}' does not support operation '{reduction_func}'" + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + return + elif reduction_func in ["idxmin", "idxmax"] and not skipna: + msg = f"{reduction_func} with skipna=False encountered an NA value." + with pytest.raises(ValueError, match=msg): + method(*args, **kwargs) + return + + result = method(*args, **kwargs) + index = pd.Index(["x"], name="a", dtype=dtype) + if test_series or reduction_func == "size": + name = None if not test_series and reduction_func == "size" else "b" + expected = Series(expected_value, index=index, dtype=expected_dtype, name=name) + else: + expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype) + tm.assert_equal(result, expected) + + def test_max_nan_bug(): df = DataFrame( { From 3ba6011548de77a0d97f7be80d51c2cd72da13c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Jun 2025 18:06:00 +0200 Subject: [PATCH 2/2] skipna not yet available on 2.3 for groupby --- pandas/tests/groupby/test_reductions.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 6bd46c5b2088f..55408a485c816 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -714,7 +714,7 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): @pytest.mark.parametrize("min_count", [0, 1]) @pytest.mark.parametrize("test_series", [True, False]) def test_string_dtype_all_na( - string_dtype_no_object, reduction_func, skipna, min_count, test_series + string_dtype_no_object, reduction_func, min_count, test_series ): # https://github.com/pandas-dev/pandas/issues/60985 if reduction_func == "corrwith": @@ -733,19 +733,19 @@ def test_string_dtype_all_na( "std", "var", ]: - kwargs = {"skipna": skipna} + kwargs = {} elif reduction_func in ["kurt"]: kwargs = {"min_count": min_count} elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]: kwargs = {} else: - kwargs = {"skipna": skipna, "min_count": min_count} + kwargs = {"min_count": min_count} expected_dtype, expected_value = dtype, pd.NA if reduction_func in ["all", "any"]: expected_dtype = "bool" # TODO: For skipna=False, bool(pd.NA) raises; should groupby? - expected_value = not skipna if reduction_func == "any" else True + expected_value = False if reduction_func == "any" else True elif reduction_func in ["count", "nunique", "size"]: # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA? if ( @@ -760,7 +760,7 @@ def test_string_dtype_all_na( expected_value = 1 if reduction_func == "size" else 0 elif reduction_func in ["idxmin", "idxmax"]: expected_dtype, expected_value = "float64", np.nan - elif not skipna or min_count > 0: + elif min_count > 0: expected_value = pd.NA elif reduction_func == "sum": # https://github.com/pandas-dev/pandas/pull/60936 @@ -787,11 +787,6 @@ def test_string_dtype_all_na( with pytest.raises(TypeError, match=msg): method(*args, **kwargs) return - elif reduction_func in ["idxmin", "idxmax"] and not skipna: - msg = f"{reduction_func} with skipna=False encountered an NA value." - with pytest.raises(ValueError, match=msg): - method(*args, **kwargs) - return result = method(*args, **kwargs) index = pd.Index(["x"], name="a", dtype=dtype)