From f64275e4c9bb7556bfdf3f32fa256b974c4cdfa5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 10 Mar 2025 12:06:53 -0400 Subject: [PATCH 1/4] BUG(string dtype): Empty sum produces incorrect result (#60936) --- pandas/core/arrays/base.py | 10 +++++++- pandas/tests/frame/test_reductions.py | 10 ++++++++ pandas/tests/groupby/test_reductions.py | 14 +++++++++++ pandas/tests/resample/test_base.py | 25 +++++++++++++++++++ .../tests/resample/test_resampler_grouper.py | 20 +++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 62ca2a45fb941..ffcd6ebc4ae11 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2386,7 +2386,15 @@ def _groupby_op( if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) - npvalues = self.to_numpy(object, na_value=np.nan) + + arr = self + if op.how == "sum": + # https://github.com/pandas-dev/pandas/issues/60229 + # All NA should result in the empty string. + assert "skipna" in kwargs + if kwargs["skipna"] and min_count == 0: + arr = arr.fillna("") + npvalues = arr.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( f"function is not implemented for this dtype: {self.dtype}" diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1b2e55c978071..8b450cecfca00 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -846,6 +846,16 @@ def test_axis_1_empty(self, all_reductions, index): expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 1]) + def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": [pd.NA]}, dtype=dtype) + result = df.sum(axis=1, skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = Series([value], dtype=dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 599b0aabf85d5..8701b8d86651e 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -710,6 +710,20 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) + gb = df.groupby("a") + result = gb.sum(skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = DataFrame( + {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype + ) + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): df = DataFrame( { diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index dcf6c6099abab..27d87729e2075 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -150,6 +150,31 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): assert result.index.freq == expected.index.freq +@pytest.mark.parametrize("min_count", [0, 1]) +def test_resample_empty_sum_string(string_dtype_no_object, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + ser = Series( + pd.NA, + index=DatetimeIndex( + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:10", + "2000-01-01 00:00:20", + "2000-01-01 00:00:30", + ] + ), + dtype=dtype, + ) + rs = ser.resample("20s") + result = rs.sum(min_count=min_count) + + value = "" if min_count == 0 else pd.NA + index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s") + expected = Series(value, index=index, dtype=dtype) + tm.assert_series_equal(result, expected) + + @all_ts @pytest.mark.parametrize( "freq", diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index e2d456fea2b23..278a19bdc8348 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -526,6 +526,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_groupby_resample_empty_sum_string( + string_dtype_no_object, test_frame, min_count +): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype)) + gbrs = test_frame.groupby("A").resample("40s") + result = gbrs.sum(min_count=min_count) + + index = pd.MultiIndex( + levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]], + codes=[[0, 1, 2], [0, 0, 0]], + names=["A", None], + ) + value = "" if min_count == 0 else pd.NA + expected = DataFrame({"B": value}, index=index, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_groupby_resample_with_list_of_keys(): # GH 47362 df = DataFrame( From e2f988dc1d895b8c0116ae757541c940f67e5d37 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Jun 2025 12:01:39 +0200 Subject: [PATCH 2/4] skipna keyword not yet available in groupby for 2.3 --- pandas/core/arrays/base.py | 3 +-- pandas/tests/groupby/test_reductions.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ffcd6ebc4ae11..28a95ce1784a2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2391,8 +2391,7 @@ def _groupby_op( if op.how == "sum": # https://github.com/pandas-dev/pandas/issues/60229 # All NA should result in the empty string. - assert "skipna" in kwargs - if kwargs["skipna"] and min_count == 0: + if min_count == 0: arr = arr.fillna("") npvalues = arr.to_numpy(object, na_value=np.nan) else: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8701b8d86651e..896cd223c3ff3 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -711,13 +711,13 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): @pytest.mark.parametrize("min_count", [0, 1]) -def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count): +def test_string_dtype_empty_sum(string_dtype_no_object, min_count): # https://github.com/pandas-dev/pandas/issues/60229 dtype = string_dtype_no_object df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) gb = df.groupby("a") - result = gb.sum(skipna=skipna, min_count=min_count) - value = "" if skipna and min_count == 0 else pd.NA + result = gb.sum(min_count=min_count) + value = "" if min_count == 0 else pd.NA expected = DataFrame( {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype ) From f3f54114e25dca12978307a06d56e8920056348e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Jun 2025 15:37:41 +0200 Subject: [PATCH 3/4] avoid depr warning about include_groups --- pandas/tests/resample/test_resampler_grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 278a19bdc8348..3eba4b4f23bd4 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -533,7 +533,7 @@ def test_groupby_resample_empty_sum_string( # https://github.com/pandas-dev/pandas/issues/60229 dtype = string_dtype_no_object test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype)) - gbrs = test_frame.groupby("A").resample("40s") + gbrs = test_frame.groupby("A").resample("40s", include_groups=False) result = gbrs.sum(min_count=min_count) index = pd.MultiIndex( From 914d062c9f906ed7c116df86140fd4d8062c2d80 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 Jun 2025 15:38:48 +0200 Subject: [PATCH 4/4] changed default resolution --- pandas/tests/resample/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 27d87729e2075..1d9e9124db2b0 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -170,7 +170,7 @@ def test_resample_empty_sum_string(string_dtype_no_object, min_count): result = rs.sum(min_count=min_count) value = "" if min_count == 0 else pd.NA - index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s") + index = date_range(start="2000-01-01", freq="20s", periods=2) expected = Series(value, index=index, dtype=dtype) tm.assert_series_equal(result, expected)