Skip to content

Commit 45c47c7

Browse files
authored
fix: Support Pandas future.infer_string=True in report generation (#1674)
* fix: Support Pandas future.infer_string=True in report generation

  Previously, report generation ran into problems when `future.infer_string=True` was set: it emitted multiple warnings ("FutureWarning: Dtype inference on a pandas object is deprecated") and failed with "AttributeError: 'StringDtype' object has no attribute 'pyarrow_dtype'" when a string column contained only empty strings. This change resolves the issue by explicitly setting the dtype to "object" for the relevant operations.

* refactor: Refactor pandas option usage for backward compatibility

  - Introduce the `optional_option_context` helper to replace the direct use of `pd.option_context("future.no_silent_downcasting", True)`, ensuring compatibility with older pandas versions that lack this option.
  - Update the `future.infer_string` test to run only on pandas >= 2.1, where it is applicable.

* Add type annotations for optional_option_context

* Make the silent downcasting explicit
1 parent 81d7deb commit 45c47c7

File tree

4 files changed

+49
-5
lines changed

4 files changed

+49
-5
lines changed

src/ydata_profiling/model/pandas/describe_categorical_pandas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121

2222
def get_character_counts_vc(vc: pd.Series) -> pd.Series:
23-
series = pd.Series(vc.index, index=vc)
23+
series = pd.Series(vc.index, index=vc, dtype=object)
2424
characters = series[series != ""].apply(list)
2525
characters = characters.explode()
2626

@@ -170,7 +170,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
170170
# TODO: configurable lowercase/punctuation etc.
171171
# TODO: remove punctuation in words
172172

173-
series = pd.Series(vc.index, index=vc)
173+
series = pd.Series(vc.index, index=vc, dtype=object)
174174
word_lists = series.str.lower().str.split()
175175
words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
176176
word_counts = pd.Series(words.index, index=words)
@@ -188,7 +188,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
188188

189189

190190
def length_summary_vc(vc: pd.Series) -> dict:
191-
series = pd.Series(vc.index, index=vc)
191+
series = pd.Series(vc.index, index=vc, dtype=object)
192192
length = series.str.len()
193193
length_counts = pd.Series(length.index, index=length)
194194
length_counts = length_counts.groupby(level=0, sort=False).sum()

src/ydata_profiling/model/pandas/summary_pandas.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from ydata_profiling.config import Settings
1212
from ydata_profiling.model.typeset import ProfilingTypeSet
13+
from ydata_profiling.utils.compat import optional_option_context
1314
from ydata_profiling.utils.dataframe import sort_column_names
1415

1516
BaseSummarizer: Any = "BaseSummarizer" # type: ignore
@@ -38,7 +39,8 @@ def pandas_describe_1d(
3839
"""
3940

4041
# Make sure pd.NA is not in the series
41-
series = series.fillna(np.nan)
42+
with optional_option_context("future.no_silent_downcasting", True):
43+
series = series.fillna(np.nan).infer_objects(copy=False)
4244

4345
has_cast_type = _is_cast_type_defined(typeset, series.name) # type:ignore
4446
cast_type = (

src/ydata_profiling/utils/compat.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Utility functions for (version) compatibility"""
2+
3+
from contextlib import contextmanager
24
from functools import lru_cache
3-
from typing import Tuple
5+
from typing import Generator, Tuple
46

57
import pandas as pd
68

@@ -12,3 +14,18 @@ def pandas_version_info() -> Tuple[int, ...]:
1214
akin to `sys.version_info` for the Python version.
1315
"""
1416
return tuple(int(s) for s in pd.__version__.split("."))
17+
18+
19+
@contextmanager
def optional_option_context(
    option_key: str, value: object
) -> Generator[None, None, None]:
    """
    Run a ``with`` body with a pandas option set, if pandas accepts the option.

    If pandas raises ``OptionError`` while entering the option context (for
    example because the installed version does not know ``option_key``), the
    body still runs, with the pandas options left untouched.

    Exceptions raised by the ``with`` body itself, including ``OptionError``,
    propagate to the caller unchanged.

    Args:
        option_key: Name of the pandas option to set.
        value: Value the option should have inside the ``with`` body.
    """
    # Imported locally so this helper does not depend on a module-level import.
    from contextlib import ExitStack

    with ExitStack() as stack:
        try:
            # Only *entering* the option context is guarded. Guarding the
            # ``yield`` as well would make the generator yield a second time
            # when the body raises OptionError, which contextlib reports as
            # "RuntimeError: generator didn't stop after throw()".
            stack.enter_context(pd.option_context(option_key, value))
        except pd.errors.OptionError:
            # Option unavailable: fall through and run the body as a no-op.
            pass
        yield
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import pandas as pd
2+
import pytest
3+
4+
from ydata_profiling import ProfileReport
5+
from ydata_profiling.utils.compat import pandas_version_info
6+
7+
8+
@pytest.fixture()
def df():
    """Small frame with an integer column and a column of only empty strings."""
    columns = {
        "foo": [1, 2, 3],
        "bar": ["", "", ""],
    }
    return pd.DataFrame(columns)
17+
18+
19+
@pytest.mark.skipif(
    pandas_version_info() < (2, 1, 0), reason="requires pandas 2.1 or higher"
)
def test_pd_future_infer_string(df: pd.DataFrame):
    """Render an HTML report while `future.infer_string` is switched on."""
    with pd.option_context("future.infer_string", True):
        report = ProfileReport(df, title="Test Report", progress_bar=False)
        # Rendering stays inside the option context so the report is produced
        # while the option is active.
        html = report.to_html()
        assert len(html) > 0

0 commit comments

Comments
 (0)