Support creating ADSDataset through pandas accessor (#173)

lu-ohai · web-flow · commit 95eff005621b · 2023-05-11T12:07:59.000-07:00
diff --git a/ads/dataset/dataset_with_target.py b/ads/dataset/dataset_with_target.py
@@ -200,6 +200,11 @@ def from_dataframe(
                 DatasetDefaults.sampling_confidence_interval,
                 **init_kwargs,
             )
+
+        if target not in df:
+            raise ValueError(
+                f"{target} column doesn't exist in data frame. Specify a valid one instead."
+            )
             
         if target_type is None:
             target_type = get_target_type(target, sampled_df, **init_kwargs)
diff --git a/ads/dataset/mixin/__init__.py b/ads/dataset/mixin/__init__.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
diff --git a/ads/dataset/mixin/dataset_accessor.py b/ads/dataset/mixin/dataset_accessor.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from typing import Tuple
+import pandas as pd
+from ads.dataset import progress
+
+from ads.type_discovery.typed_feature import TypedFeature
+
+class ADSDatasetAccessMixin:
+    """Mixin exposing ADSDataset constructors on an accessor that keeps its pandas DataFrame in ``self._obj``."""
+
+    def dataset(
+        self,
+        sampled_df: pd.DataFrame = None,
+        shape: Tuple[int, int] = None,
+        name: str = "",
+        description: str = None,
+        type_discovery: bool = True,
+        types: dict = None,
+        metadata: dict = None,
+        progress=progress.DummyProgressBar(),
+        transformer_pipeline=None,
+        interactive: bool = False,
+        **kwargs,
+    ):
+        """Convert the wrapped pandas DataFrame into an ADS Dataset.
+
+        Parameters
+        ----------
+        sampled_df: pandas.DataFrame, optional
+            The sampled pandas DataFrame. Defaults to None.
+        shape: Tuple, optional
+            The shape of pandas DataFrame. Defaults to None.
+        name: str, optional
+            The name of ADS Dataset. Defaults to "".
+        description: str, optional
+            Text describing the dataset. Defaults to None.
+        type_discovery: bool, optional
+            If False, the data types of the dataframe are used as such. By default, the dataframe columns are associated
+            with the best suited discovered data types, which impacts visualizations and model prediction. Defaults to True.
+        types: dict, optional
+            Dictionary of <feature_name> : <data_type> to override the data type of features. Defaults to None (treated as {}).
+        metadata: dict, optional
+            The metadata of ADS Dataset. Defaults to None.
+        progress: dataset.progress.ProgressBar, optional
+            The progress bar for ADS Dataset. Defaults to a progress.DummyProgressBar() created once at definition time.
+        transformer_pipeline: datasets.pipeline.TransformerPipeline, optional
+            A pipeline of transformations done outside the sdk and need to be applied at the time of scoring
+        interactive: bool, optional
+            Forwarded unchanged to ``ADSDataset.from_dataframe``. Defaults to False.
+        kwargs:
+            Additional keyword arguments forwarded unchanged to ``ADSDataset.from_dataframe``.
+
+        Returns
+        -------
+        ADSDataset
+            An instance of ADSDataset
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> df = pd.read_csv(<path_to_csv>)
+        >>> ds = df.ads.dataset()
+        """
+        from ads.dataset.dataset import ADSDataset  # function-scope import (presumably avoids an import cycle; confirm)
+
+        return ADSDataset.from_dataframe(
+            df=self._obj,
+            sampled_df=sampled_df,
+            shape=shape,
+            name=name,
+            description=description,
+            type_discovery=type_discovery,
+            types={} if types is None else types,  # fresh dict per call, never a shared mutable default
+            metadata=metadata,
+            progress=progress,
+            transformer_pipeline=transformer_pipeline,
+            interactive=interactive,
+            **kwargs
+        )
+
+    def dataset_with_target(
+        self,
+        target: str,
+        sampled_df: pd.DataFrame = None,
+        shape: Tuple[int, int] = None,
+        target_type: TypedFeature = None,
+        positive_class=None,
+        **kwargs,
+    ):
+        """Convert the wrapped pandas DataFrame into an ADS Dataset with target.
+
+        Parameters
+        ----------
+        target: str
+            Name of the target column; ``ADSDatasetWithTarget.from_dataframe`` raises ValueError if it is not in the DataFrame.
+        sampled_df: pandas.DataFrame, optional
+            The sampled pandas DataFrame. Defaults to None.
+        shape: Tuple, optional
+            The shape of pandas DataFrame. Defaults to None.
+        target_type: TypedFeature, optional
+            The target type of ADS Dataset. Defaults to None.
+        positive_class: Any, optional
+            Label in target for binary classification problems which should be identified as positive for modeling.
+            By default, the first unique value is considered as the positive label.
+        kwargs:
+            Additional keyword arguments forwarded unchanged to ``ADSDatasetWithTarget.from_dataframe``.
+
+        Returns
+        -------
+        ADSDatasetWithTarget
+            An instance of ADSDatasetWithTarget
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> df = pd.read_csv(<path_to_csv>)
+        >>> ds = df.ads.dataset_with_target(target="target")
+        """
+        from ads.dataset.dataset_with_target import ADSDatasetWithTarget  # function-scope import, as for dataset()
+
+        return ADSDatasetWithTarget.from_dataframe(
+            df=self._obj,
+            target=target,
+            sampled_df=sampled_df,
+            shape=shape,
+            target_type=target_type,
+            positive_class=positive_class,
+            **kwargs
+        )
diff --git a/ads/feature_engineering/accessor/dataframe_accessor.py b/ads/feature_engineering/accessor/dataframe_accessor.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 """
@@ -41,6 +41,7 @@
 import pandas as pd
 from ads.common.utils import DATA_SCHEMA_MAX_COL_NUM
 from ads.data_labeling.mixin.data_labeling import DataLabelingAccessMixin
+from ads.dataset.mixin.dataset_accessor import ADSDatasetAccessMixin
 from ads.dbmixin.db_pandas_accessor import DBAccessMixin
 from ads.feature_engineering import schema
 from ads.feature_engineering.accessor.mixin.eda_mixin import EDAMixin
@@ -53,7 +54,11 @@
 
 @pd.api.extensions.register_dataframe_accessor("ads")
 class ADSDataFrameAccessor(
-    ADSFeatureTypesMixin, EDAMixin, DBAccessMixin, DataLabelingAccessMixin
+    ADSFeatureTypesMixin,
+    EDAMixin,
+    DBAccessMixin,
+    DataLabelingAccessMixin,
+    ADSDatasetAccessMixin
 ):
     """ADS accessor for the Pandas DataFrame.
 
diff --git a/tests/unitary/with_extras/dataset/test_dataset_dataset.py b/tests/unitary/with_extras/dataset/test_dataset_dataset.py
@@ -62,13 +62,8 @@ def test_initialize_dataset(self):
             description="test_description",
             storage_options={'config':{},'region':'us-ashburn-1'}
         )
-        assert isinstance(employees, ADSDataset)
-        assert isinstance(employees.df, pd.DataFrame)
-        assert isinstance(employees.shape, Tuple)
-        assert employees.name == "test_dataset"
-        assert employees.description == "test_description"
-        assert "type_discovery" in employees.init_kwargs
-        assert isinstance(employees.transformer_pipeline, TransformerPipeline)
+        
+        self.assert_dataset(employees)
 
     def test_from_dataframe(self):
         employees = ADSDataset.from_dataframe(
@@ -77,13 +72,26 @@ def test_from_dataframe(self):
             description="test_description",
             storage_options={'config':{},'region':'us-ashburn-1'}
         )
-        assert isinstance(employees, ADSDataset)
-        assert isinstance(employees.df, pd.DataFrame)
-        assert isinstance(employees.shape, Tuple)
-        assert employees.name == "test_dataset"
-        assert employees.description == "test_description"
-        assert "type_discovery" in employees.init_kwargs
-        assert isinstance(employees.transformer_pipeline, TransformerPipeline)
+        
+        self.assert_dataset(employees)
+
+    def test_accessor(self):
+        """A dataset built through the ``ads`` accessor passes the shared dataset checks."""
+        frame = pd.read_csv(self.get_data_path())
+        accessor_kwargs = {
+            "name": "test_dataset",
+            "description": "test_description",
+        }
+        self.assert_dataset(frame.ads.dataset(**accessor_kwargs))
+
+    def assert_dataset(self, dataset):  # shared checks reused by the dataset construction tests
+        assert isinstance(dataset, ADSDataset)
+        assert isinstance(dataset.df, pd.DataFrame)
+        assert isinstance(dataset.shape, Tuple)
+        assert dataset.name == "test_dataset"  # the value the calling tests pass as ``name``
+        assert dataset.description == "test_description"  # the value the calling tests pass as ``description``
+        assert "type_discovery" in dataset.init_kwargs
+        assert isinstance(dataset.transformer_pipeline, TransformerPipeline)
 
     def get_data_path(self):
         current_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/tests/unitary/with_extras/dataset/test_dataset_target.py b/tests/unitary/with_extras/dataset/test_dataset_target.py
@@ -6,6 +6,7 @@
 import os
 from typing import Tuple
 import pandas as pd
+import pytest
 from ads.dataset.classification_dataset import BinaryClassificationDataset
 from ads.dataset.dataset_with_target import ADSDatasetWithTarget
 from ads.dataset.pipeline import TransformerPipeline
@@ -23,14 +24,9 @@ def test_initialize_dataset_target(self):
         )
 
         assert isinstance(employees, ADSDatasetWithTarget)
-        assert isinstance(employees.df, pd.DataFrame)
-        assert isinstance(employees.shape, Tuple)
-        assert isinstance(employees.target, TargetVariable)
-        assert employees.target.type["type"] == "categorical"
         assert employees.name == "test_dataset"
         assert employees.description == "test_description"
-        assert "type_discovery" in employees.init_kwargs
-        assert isinstance(employees.transformer_pipeline, TransformerPipeline)
+        self.assert_dataset(employees)
 
     def test_dataset_target_from_dataframe(self):
         employees = ADSDatasetWithTarget.from_dataframe(
@@ -40,12 +36,34 @@ def test_dataset_target_from_dataframe(self):
         ).set_positive_class('Yes')
 
         assert isinstance(employees, BinaryClassificationDataset)
-        assert isinstance(employees.df, pd.DataFrame)
-        assert isinstance(employees.shape, Tuple)
-        assert isinstance(employees.target, TargetVariable)
-        assert employees.target.type["type"] == "categorical"
-        assert "type_discovery" in employees.init_kwargs
-        assert isinstance(employees.transformer_pipeline, TransformerPipeline)
+        self.assert_dataset(employees)
+
+    def test_accessor_with_target(self):
+        """An accessor-built dataset with the "Attrition" target is a BinaryClassificationDataset."""
+        frame = pd.read_csv(self.get_data_path())
+        target_column = "Attrition"
+
+        dataset = frame.ads.dataset_with_target(target=target_column)
+        assert isinstance(dataset, BinaryClassificationDataset)
+        self.assert_dataset(dataset)
+
+    def test_accessor_with_target_error(self):
+        """A target name missing from the DataFrame must raise ValueError through the accessor."""
+        df = pd.read_csv(self.get_data_path())
+        wrong_column = "wrong_column"
+        with pytest.raises(
+            ValueError,
+            match=f"{wrong_column} column doesn't exist in data frame. Specify a valid one instead.",
+        ):
+            df.ads.dataset_with_target(target=wrong_column)  # result deliberately unused: the call must raise
+
+    def assert_dataset(self, dataset):  # shared checks reused by the target-dataset tests
+        assert isinstance(dataset.df, pd.DataFrame)
+        assert isinstance(dataset.shape, Tuple)
+        assert isinstance(dataset.target, TargetVariable)
+        assert dataset.target.type["type"] == "categorical"  # every caller is expected to yield a categorical target
+        assert "type_discovery" in dataset.init_kwargs
+        assert isinstance(dataset.transformer_pipeline, TransformerPipeline)
 
     def get_data_path(self):
         current_dir = os.path.dirname(os.path.abspath(__file__))