feat: add type schema (#1274)

alexbarros · web-flow · commit 79856bca5a0d · 2023-03-03T10:07:49.000-03:00
* feat: allows user to define variable types
diff --git a/src/ydata_profiling/compare_reports.py b/src/ydata_profiling/compare_reports.py
@@ -134,6 +134,10 @@ def _compare_profile_report_preprocess(
                     config.html.style.primary_colors
                 )
 
+    # enforce same types
+    for report in reports[1:]:
+        report._typeset = reports[0].typeset
+
     # Obtain description sets
     descriptions = [report.get_description() for report in reports]
     for label, description in zip(labels, descriptions):
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -12,6 +12,7 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summarizer import BaseSummarizer
 from ydata_profiling.model.summary import describe_1d, get_series_descriptions
+from ydata_profiling.model.typeset import ProfilingTypeSet
 from ydata_profiling.utils.dataframe import sort_column_names
 
 
@@ -37,8 +38,13 @@ def pandas_describe_1d(
     # Make sure pd.NA is not in the series
     series = series.fillna(np.nan)
 
-    # get `infer_dtypes` (bool) from config
-    if config.infer_dtypes:
+    if (
+        isinstance(typeset, ProfilingTypeSet)
+        and typeset.type_schema
+        and series.name in typeset.type_schema
+    ):
+        vtype = typeset.type_schema[series.name]
+    elif config.infer_dtypes:
         # Infer variable types
         vtype = typeset.infer_type(series)
         series = typeset.cast_to_inferred(series)
@@ -47,6 +53,11 @@ def pandas_describe_1d(
         # [new dtypes, changed using `astype` function are now considered]
         vtype = typeset.detect_type(series)
 
+    # Remember the resolved type so a later lookup by column name reuses it.
+    # Only ProfilingTypeSet carries a `type_schema`; a user-supplied typeset
+    # of another class would raise AttributeError on this assignment.
+    if isinstance(typeset, ProfilingTypeSet):
+        typeset.type_schema[series.name] = vtype
     return summarizer.summarize(config, series, dtype=vtype)
 
 
diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py
@@ -241,11 +241,57 @@ def is_timedependent(series: pd.Series) -> bool:
 
 
 class ProfilingTypeSet(visions.VisionsTypeset):
-    def __init__(self, config: Settings):
+    """Visions typeset whose types are derived from the profiling config.
+
+    Besides those types it keeps `type_schema`, a dict mapping column names
+    to types of this typeset.
+    """
+
+    def __init__(self, config: Settings, type_schema: dict = None):
+        """Build the typeset and resolve the optional type schema.
+
+        Args:
+            config: report Settings object
+            type_schema: optional dict containing pairs of
+                `column name`: `type name` (matched case-insensitively)
+
+        Raises:
+            TypeError: if a type name in `type_schema` is not a string
+            ValueError: if a type name matches no type of this typeset
+        """
         self.config = config
 
         types = typeset_types(config)
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)
             super().__init__(types)
+
+        # Done after super().__init__(): `_get_type` iterates `self.types`.
+        self.type_schema = self._init_type_schema(type_schema or {})
+
+    def _init_type_schema(self, type_schema: dict) -> dict:
+        """Replace every type name in `type_schema` by the matching type."""
+        return {
+            column: self._get_type(type_name)
+            for column, type_name in type_schema.items()
+        }
+
+    def _get_type(self, type_name: str) -> visions.VisionsBaseType:
+        """Return the type of this typeset named `type_name`, ignoring case.
+
+        Raises:
+            TypeError: if `type_name` is not a string
+            ValueError: if no type of this typeset has that name
+        """
+        if not isinstance(type_name, str):
+            raise TypeError(
+                f"Type name must be a string, got {type(type_name).__name__}."
+            )
+
+        # Lowercase the requested name once instead of on every comparison.
+        wanted = type_name.lower()
+        for t in self.types:
+            if t.__name__.lower() == wanted:
+                return t
+        raise ValueError(f"Type [{type_name}] not found.")
diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py
@@ -68,6 +68,7 @@ def __init__(
         typeset: Optional[VisionsTypeset] = None,
         summarizer: Optional[BaseSummarizer] = None,
         config: Optional[Settings] = None,
+        type_schema: Optional[dict] = None,
         **kwargs,
     ):
         """Generate a ProfileReport based on a pandas or spark.sql DataFrame
@@ -89,6 +90,7 @@ def __init__(
             sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame())
             typeset: optional user typeset to use for type inference
             summarizer: optional user summarizer to generate custom summary output
+            type_schema: optional dict containing pairs of `column name`: `type`
             **kwargs: other arguments, for valid arguments, check the default configuration file.
         """
         self.__validate_inputs(df, minimal, tsmode, config_file, lazy)
@@ -139,6 +141,7 @@ def __init__(
         self.config = report_config
         self._df_hash = None
         self._sample = sample
+        self._type_schema = type_schema
         self._typeset = typeset
         self._summarizer = summarizer
 
@@ -230,7 +233,12 @@ def invalidate_cache(self, subset: Optional[str] = None) -> None:
     @property
     def typeset(self) -> Optional[VisionsTypeset]:
+        """Typeset of this report, created on first access when none is set.
+
+        An already-set typeset is returned as is; otherwise a
+        ProfilingTypeSet is built from the config and the `type_schema`.
+        """
         if self._typeset is None:
-            self._typeset = ProfilingTypeSet(self.config)
+            self._typeset = ProfilingTypeSet(self.config, self._type_schema)
         return self._typeset
 
     @property
diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py
@@ -1,5 +1,7 @@
 import os
 
+import numpy as np
+import pandas as pd
 import pytest
 from visions.test.series import get_series
 from visions.test.utils import (
@@ -14,6 +16,7 @@
 from tests.unit.test_utils import patch_arg
 from ydata_profiling.config import Settings
 from ydata_profiling.model.typeset import ProfilingTypeSet
+from ydata_profiling.profile_report import ProfileReport
 
 base_path = os.path.abspath(os.path.dirname(__file__))
 
@@ -161,7 +164,7 @@
     )
 )
 def test_contains(name, series, contains_type, member):
-    """Test the generated combinations for "series in type"
+    """Test the generated combinations for "series in type".
 
     Args:
         series: the series to test
@@ -349,3 +352,46 @@ def test_conversion(name, source_type, relation_type, series, member):
     """
     result, message = convert(name, source_type, relation_type, series, member)
     assert result, message
+
+
+@pytest.fixture
+def dataframe(size: int = 1000) -> pd.DataFrame:
+    """Build a frame with boolean, numeric, categorical and timeseries columns.
+
+    A seeded generator keeps the random columns identical between runs.
+    """
+    rng = np.random.RandomState(0)
+    return pd.DataFrame(
+        {
+            "boolean": rng.choice([True, False], size=size),
+            "numeric": rng.rand(size),
+            "categorical": rng.choice(np.arange(5), size=size),
+            "timeseries": np.arange(size),
+        }
+    )
+
+
+def convertion_map() -> list:
+    """Return `(column, {column: type name})` pairs used to parametrize tests."""
+    types = {
+        "boolean": ["Categorical", "Unsupported"],
+        "numeric": ["Categorical", "Boolean", "Unsupported"],
+        "categorical": ["Numeric", "Boolean", "TimeSeries", "Unsupported"],
+        "timeseries": ["Numeric", "Boolean", "Categorical", "Unsupported"],
+    }
+    return [
+        (column, {column: type_name})
+        for column, type_names in types.items()
+        for type_name in type_names
+    ]
+
+
+@pytest.mark.parametrize("column,type_schema", convertion_map())
+def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict):
+    """The type requested through `type_schema` is the one kept for the column."""
+    prof = ProfileReport(dataframe[[column]], tsmode=True, type_schema=type_schema)
+    prof.get_description()
+    assert isinstance(prof.typeset, ProfilingTypeSet)
+    assert prof.typeset.type_schema[column] == prof.typeset._get_type(
+        type_schema[column]
+    )