Skip to content

Commit 1e8cb89

Browse files
authored
fix: typeset invalid dates errors (#1678)
* fix: ignore invalid dates during conversion * fix: apply type conversion to user defined types * test: add unit test to invalid date type conversion * fix: add invalid dates to variable info * fix(linting): code formatting * test: update unit tests * fix: rename to_datetime method
1 parent 816f1b7 commit 1e8cb89

File tree

4 files changed

+48
-1
lines changed

4 files changed

+48
-1
lines changed

src/ydata_profiling/model/pandas/describe_date_pandas.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
series_handle_nulls,
1212
series_hashable,
1313
)
14+
from ydata_profiling.model.typeset_relations import is_pandas_1
15+
16+
17+
def to_datetime(series: pd.Series) -> pd.Series:
    """Convert ``series`` to datetimes without raising on bad values.

    ``errors="coerce"`` is always passed to ``pd.to_datetime``, so values
    that cannot be converted come back as ``NaT`` instead of raising.
    ``format="mixed"`` is added only when ``is_pandas_1()`` is falsy.

    Args:
        series: The values to convert.

    Returns:
        The result of ``pd.to_datetime`` for ``series``.
    """
    parse_options = {"errors": "coerce"}
    if not is_pandas_1():
        # The "mixed" format is only requested outside the pandas 1 branch.
        parse_options["format"] = "mixed"
    return pd.to_datetime(series, **parse_options)
1421

1522

1623
@describe_date_1d.register
@@ -29,6 +36,12 @@ def pandas_describe_date_1d(
2936
Returns:
3037
A dict containing calculated series description values.
3138
"""
39+
og_series = series.dropna()
40+
series = to_datetime(og_series)
41+
invalid_values = og_series[series.isna()]
42+
43+
series = series.dropna()
44+
3245
if summary["value_counts_without_nan"].empty:
3346
values = series.values
3447
summary.update(
@@ -53,5 +66,12 @@ def pandas_describe_date_1d(
5366
if config.vars.num.chi_squared_threshold > 0.0:
5467
summary["chi_squared"] = chi_square(values)
5568

56-
summary.update(histogram_compute(config, values, summary["n_distinct"]))
69+
summary.update(histogram_compute(config, values, series.nunique()))
70+
summary.update(
71+
{
72+
"invalid_dates": invalid_values.nunique(),
73+
"n_invalid_dates": len(invalid_values),
74+
"p_invalid_dates": len(invalid_values) / summary["n"],
75+
}
76+
)
5777
return config, values, summary

src/ydata_profiling/model/pandas/summary_pandas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def pandas_describe_1d(
4444
and series.name in typeset.type_schema
4545
):
4646
vtype = typeset.type_schema[series.name]
47+
4748
elif config.infer_dtypes:
4849
# Infer variable types
4950
vtype = typeset.infer_type(series)

src/ydata_profiling/report/structure/variables/render_date.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,16 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
6262
[
6363
{"name": "Minimum", "value": fmt(summary["min"]), "alert": False},
6464
{"name": "Maximum", "value": fmt(summary["max"]), "alert": False},
65+
{
66+
"name": "Invalid dates",
67+
"value": fmt(summary["n_invalid_dates"]),
68+
"alert": False,
69+
},
70+
{
71+
"name": "Invalid dates (%)",
72+
"value": fmt_percent(summary["p_invalid_dates"]),
73+
"alert": False,
74+
},
6575
],
6676
style=config.html.style,
6777
)

tests/unit/test_describe.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,3 +582,19 @@ def test_describe_list(summarizer, typeset):
582582

583583
with pytest.raises(NotImplementedError):
584584
describe(config, "", [1, 2, 3], summarizer, typeset)
585+
586+
587+
def test_decribe_series_type_schema(config, summarizer):
    """Test describe with invalid date types.

    The ``date`` column is forced to ``datetime`` through the typeset's
    ``type_schema``; two of its four string values are expected to be
    reported as invalid dates rather than as missing values.
    """
    typeset = ProfilingTypeSet(config, type_schema={"date": "datetime"})
    # NOTE(review): presumably the years 0001 and 9999 are the two values
    # that fail conversion (outside the representable range) — confirm.
    df = pd.DataFrame(
        {
            "value": [1, 2, 3, 4],
            "date": ["0001-01-01", "9999-12-31", "2022-10-03", "2022-10-04"],
        }
    )

    description = describe(config, df, summarizer, typeset)
    date_summary = description.variables["date"]

    assert date_summary["type"] == "DateTime"
    assert date_summary["n_missing"] == 0
    assert date_summary["n_invalid_dates"] == 2
    assert date_summary["p_invalid_dates"] == 0.5

0 commit comments

Comments
 (0)