Skip to content

Commit fc8dfe9

Browse files
feat: add new alert description (#1724)
* fix: add new alert description * fix(linting): code formatting * feat: add new dirty categories config * feat: informative banner for end-users * fix(linting): code formatting * fix: rename the alert for Dirty category * fix(linting): code formatting * fix: linter fixes * fix(linting): code formatting * chore: set dirty categories config to False * chore: disable linter for expected code design * fix(linting): code formatting --------- Co-authored-by: Azory YData Bot <azory@ydata.ai>
1 parent 79c1c9d commit fc8dfe9

File tree

6 files changed

+112
-17
lines changed

6 files changed

+112
-17
lines changed

src/ydata_profiling/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ class CatVars(BaseModel):
6868
redact: bool = False
6969
histogram_largest: int = 50
7070
stop_words: List[str] = []
71+
dirty_categories: bool = False
72+
dirty_categories_threshold: float = 0.85
7173

7274

7375
class BoolVars(BaseModel):

src/ydata_profiling/model/alerts.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class AlertType(Enum):
5252
DUPLICATES = auto()
5353
"""This variable contains duplicates."""
5454

55+
NEAR_DUPLICATES = auto()
56+
"""This variable contains near duplicates."""
57+
5558
SKEWED = auto()
5659
"""This variable is highly skewed."""
5760

@@ -70,6 +73,9 @@ class AlertType(Enum):
7073
UNIQUE = auto()
7174
"""This variable has unique values."""
7275

76+
DIRTY_CATEGORY = auto()
77+
"""This variable is a category with potentially fuzzy values and, for that reason, might have consistency issues."""
78+
7379
CONSTANT_LENGTH = auto()
7480
"""This variable has a constant length."""
7581

@@ -205,7 +211,29 @@ def _get_description(self) -> str:
205211
if self.values is not None:
206212
return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows"
207213
else:
208-
return "Dataset has duplicated values"
214+
return "Dataset has no duplicated rows"
215+
216+
217+
class NearDuplicatesAlert(Alert):
    """Alert whose description reports near duplicate rows in the dataset."""

    def __init__(
        self,
        values: Optional[Dict] = None,
        column_name: Optional[str] = None,
        is_empty: bool = False,
    ):
        """Build the alert, tagging it with ``AlertType.NEAR_DUPLICATES``.

        Args:
            values: Optional statistics for the alert. The description reads
                the ``n_near_dups`` and ``p_near_dups`` keys when present.
            column_name: Optional column name, forwarded to the base class.
            is_empty: Forwarded unchanged to the base class.
        """
        # All arguments are forwarded by keyword; ``fields`` is fixed for
        # this alert type.
        super().__init__(
            is_empty=is_empty,
            fields={"n_near_dups"},
            column_name=column_name,
            values=values,
            alert_type=AlertType.NEAR_DUPLICATES,
        )

    def _get_description(self) -> str:
        """Return the human-readable message for this alert."""
        stats = self.values
        if stats is None:
            return "Dataset has no near duplicated rows"

        count = stats["n_near_dups"]
        share = fmt_percent(stats["p_near_dups"])
        return f"Dataset has {count} ({share}) near duplicate rows"
209237

210238

211239
class EmptyAlert(Alert):
@@ -249,6 +277,28 @@ def _get_description(self) -> str:
249277
return f"[{self.column_name}] has a high cardinality"
250278

251279

280+
class DirtyCategoryAlert(Alert):
    """Alert whose description reports fuzzy values in a categorical column."""

    def __init__(
        self,
        values: Optional[Dict] = None,
        column_name: Optional[str] = None,
        is_empty: bool = False,
    ):
        """Build the alert, tagging it with ``AlertType.DIRTY_CATEGORY``.

        Args:
            values: Optional statistics for the alert. The description reads
                the ``n_fuzzy_vals`` and ``p_fuzzy_vals`` keys when present.
            column_name: Optional column name, used in the description.
            is_empty: Forwarded unchanged to the base class.
        """
        # All arguments are forwarded by keyword; ``fields`` is fixed for
        # this alert type.
        super().__init__(
            is_empty=is_empty,
            fields={"n_fuzzy_vals"},
            column_name=column_name,
            values=values,
            alert_type=AlertType.DIRTY_CATEGORY,
        )

    def _get_description(self) -> str:
        """Return the human-readable message for this alert."""
        stats = self.values
        if stats is None:
            return f"[{self.column_name}] no dirty categories values."

        column = self.column_name
        count = stats["n_fuzzy_vals"]
        share = fmt_percent(stats["p_fuzzy_vals"])
        return f"[{column}] has {count} fuzzy values: {share} per category"
300+
301+
252302
class HighCorrelationAlert(Alert):
253303
def __init__(
254304
self,

src/ydata_profiling/model/pandas/describe_categorical_pandas.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
series_handle_nulls,
1717
series_hashable,
1818
)
19+
from ydata_profiling.utils.information import DisplayInfo
1920

2021

2122
def get_character_counts_vc(vc: pd.Series) -> pd.Series:
@@ -210,6 +211,9 @@ def length_summary_vc(vc: pd.Series) -> dict:
210211
return summary
211212

212213

214+
_displayed_catvar_banner = False
215+
216+
213217
@describe_categorical_1d.register
214218
@series_hashable
215219
@series_handle_nulls
@@ -226,6 +230,8 @@ def pandas_describe_categorical_1d(
226230
Returns:
227231
A dict containing calculated series description values.
228232
"""
233+
# Global info banner
234+
global _displayed_catvar_banner
229235

230236
# Make sure we deal with strings (Issue #100)
231237
series = series.astype(str)
@@ -262,4 +268,13 @@ def pandas_describe_categorical_1d(
262268
if config.vars.cat.words:
263269
summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))
264270

271+
if config.vars.cat.dirty_categories: # noqa: SIM102
272+
if not _displayed_catvar_banner:
273+
display_info = DisplayInfo(
274+
title="Identify dirty categories with ydata-sdk",
275+
info_text="This feature is only available for ydata-sdk users. Register to give try it.",
276+
)
277+
display_info.display_message()
278+
_displayed_catvar_banner = True
279+
265280
return config, series, summary
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<a href="#pp_var_{{ alert.anchor_id }}"><code>{{ alert.column_name }}</code></a> has dirty categories: {{ alert.values['n_fuzzy_vals'] }} distinct values
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Dataset has {{ alert.values['n_near_dups'] }} ({{ alert.values['p_near_dups'] | fmt_percent }}) <a href="#near_duplicate">near duplicate rows</a>

src/ydata_profiling/utils/information.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,48 @@ def in_jupyter_notebook() -> bool:
2121
return isiPython
2222

2323

24+
class DisplayInfo:
    """Informational message with a title, body text and a registration link.

    The message is rendered as HTML inside a Jupyter notebook and as plain
    text (with an ANSI-coloured title) otherwise.
    """

    def __init__(
        self,
        title: str,
        info_text: str,
        link: str = "https://ydata.ai/register",
    ):
        """Store the pieces of the message.

        Args:
            title: Heading of the message; used as the anchor text in HTML.
            info_text: Body text shown below the title.
            link: URL the title points to and that is printed after
                "Register at". The default previously lacked the leading
                "h" of "https", which produced an invalid URL.
        """
        self.title = title
        self.link = link
        self.info_text = info_text

    def display_message(self) -> None:
        """
        Display an HTML message in case the user is in a Jupyter Notebook,
        otherwise print the same information to standard output.
        """
        if in_jupyter_notebook():
            # Imported lazily so IPython is only required inside a notebook.
            from IPython.display import HTML, display

            info = f"""
            <div>
                <ins><a href="{self.link}">{self.title}</a></ins>
                <p>
                    {self.info_text}
                </p>
            </div>
            """
            display(HTML(info))
        else:
            # "\033[1;34m" ... "\033[0m" wraps the title in bold blue.
            info = (
                f"\033[1;34m{self.title}\033[0m"
                + "\n"
                + f"{self.info_text}"
                + "\n"
                + f"Register at {self.link}"
            )
            print(info)  # noqa: T201
60+
61+
2462
def display_banner() -> None:
2563
global _displayed_banner
26-
if in_jupyter_notebook() and not _displayed_banner:
27-
from IPython.display import HTML, display
28-
29-
banner_html = f"""
30-
<div>
31-
<ins><a href="{link}">{title}</a></ins>
32-
<p>
33-
{info_text}
34-
</p>
35-
</div>
36-
"""
37-
display(HTML(banner_html))
38-
else:
39-
print(f"\033[1;34m{title}\033[0m") # noqa: T201
40-
print(info_text) # noqa: T201
41-
print(f"Register at {link}") # noqa: T201
64+
banner_info = DisplayInfo(title=title, info_text=info_text)
65+
66+
if not _displayed_banner:
67+
banner_info.display_message()
4268
_displayed_banner = True

0 commit comments

Comments
 (0)