Skip to content

[backport 2.3.x] BUG(string dtype): groupby/resampler.min/max returns float on all NA strings (#60985) #61633

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class providing the base-class of operations.
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
needs_i8_conversion,
pandas_dtype,
)
Expand Down Expand Up @@ -1945,8 +1946,13 @@ def _agg_py_fallback(
# preserve the kind of exception that raised
raise type(err)(msg) from err

if ser.dtype == object:
dtype = ser.dtype
if dtype == object:
res_values = res_values.astype(object, copy=False)
elif is_string_dtype(dtype):
# mypy doesn't infer dtype is an ExtensionDtype
string_array_cls = dtype.construct_array_type() # type: ignore[union-attr]
res_values = string_array_cls._from_sequence(res_values, dtype=dtype)

# If we are DataFrameGroupBy and went through a SeriesGroupByPath
# then we need to reshape
Expand Down
88 changes: 88 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
isna,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
from pandas.util import _test_decorators as td


Expand Down Expand Up @@ -710,6 +711,93 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count", [0, 1])
@pytest.mark.parametrize("test_series", [True, False])
def test_string_dtype_all_na(
    string_dtype_no_object, reduction_func, min_count, test_series
):
    """Run each groupby reduction on a single group whose only value is NA.

    The frame has one row: key column ``a`` holds ``"x"`` and value column
    ``b`` holds ``pd.NA``, both built with the string dtype under test.
    Reductions listed as unsupported must raise ``TypeError``; every other
    reduction is compared against an expected Series/DataFrame indexed by
    the single group key.
    """
    # https://github.com/pandas-dev/pandas/issues/60985
    if reduction_func == "corrwith":
        # corrwith is deprecated.
        return

    dtype = string_dtype_no_object
    is_size = reduction_func == "size"

    # These reductions are invoked without ``min_count``; every other
    # reduction (including "kurt") receives it.
    called_without_min_count = (
        "any",
        "all",
        "idxmin",
        "idxmax",
        "mean",
        "median",
        "std",
        "var",
        "count",
        "nunique",
        "quantile",
        "sem",
        "size",
    )
    if reduction_func in called_without_min_count:
        kwargs = {}
    else:
        kwargs = {"min_count": min_count}

    # Default expectation: the result keeps the string dtype and is NA.
    expected_dtype, expected_value = dtype, pd.NA
    if reduction_func in ("idxmin", "idxmax"):
        expected_dtype, expected_value = "float64", np.nan
    elif reduction_func in ("all", "any"):
        expected_dtype = "bool"
        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
        expected_value = reduction_func != "any"
    elif reduction_func in ("count", "nunique", "size"):
        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
        nullable_size = (
            test_series
            and is_size
            and dtype.storage == "pyarrow"
            and dtype.na_value is pd.NA
        )
        expected_dtype = "Int64" if nullable_size else "int64"
        expected_value = 1 if is_size else 0
    elif reduction_func == "sum" and min_count <= 0:
        # https://github.com/pandas-dev/pandas/pull/60936
        expected_value = ""

    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
    obj = df["b"] if test_series else df
    args = get_groupby_method_args(reduction_func, obj)
    method = getattr(obj.groupby(df["a"]), reduction_func)

    # Reductions expected to raise for string dtypes.
    unsupported = (
        "mean",
        "median",
        "kurt",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "var",
    )
    if reduction_func in unsupported:
        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    result = method(*args, **kwargs)
    index = pd.Index(["x"], name="a", dtype=dtype)
    if not test_series and not is_size:
        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
    else:
        # ``size`` on a DataFrame groupby gives an unnamed Series; the
        # Series path keeps the column name.
        name = "b" if test_series else None
        expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
    tm.assert_equal(result, expected)


@pytest.mark.parametrize("min_count", [0, 1])
def test_string_dtype_empty_sum(string_dtype_no_object, min_count):
# https://github.com/pandas-dev/pandas/issues/60229
Expand Down
Loading