Skip to content

Commit 95eff00

Browse files
authored
Support creating ADSDataset through pandas accessor (#173)
2 parents aa980c5 + eb52466 commit 95eff00

File tree

6 files changed

+203
-28
lines changed

6 files changed

+203
-28
lines changed

ads/dataset/dataset_with_target.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,11 @@ def from_dataframe(
200200
DatasetDefaults.sampling_confidence_interval,
201201
**init_kwargs,
202202
)
203+
204+
if target not in df:
205+
raise ValueError(
206+
f"{target} column doesn't exist in data frame. Specify a valid one instead."
207+
)
203208

204209
if target_type is None:
205210
target_type = get_target_type(target, sampled_df, **init_kwargs)

ads/dataset/mixin/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*--
3+
4+
# Copyright (c) 2023 Oracle and/or its affiliates.
5+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

ads/dataset/mixin/dataset_accessor.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*--
3+
4+
# Copyright (c) 2023 Oracle and/or its affiliates.
5+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6+
7+
from typing import Tuple
8+
import pandas as pd
9+
from ads.dataset import progress
10+
11+
from ads.type_discovery.typed_feature import TypedFeature
12+
13+
class ADSDatasetAccessMixin:
    """Mixin that lets the pandas ``ads`` accessor build ADS datasets.

    The methods read the wrapped DataFrame from ``self._obj``, so the class
    this mixin is combined with must provide that attribute.
    """

    def dataset(
        self,
        sampled_df=None,
        shape=None,
        name="",
        description=None,
        type_discovery=True,
        types=None,
        metadata=None,
        progress=None,
        transformer_pipeline=None,
        interactive=False,
        **kwargs,
    ):
        """Converts pandas DataFrame into ADS Dataset.

        Parameters
        ----------
        sampled_df: pandas.DataFrame, optional
            The sampled pandas DataFrame. Defaults to None.
        shape: Tuple, optional
            The shape of pandas DataFrame. Defaults to None.
        name: str, optional
            The name of ADS Dataset. Defaults to "".
        description: str, optional
            Text describing the dataset. Defaults to None.
        type_discovery: bool, optional
            If false, the data types of the dataframe are used as such.
            By default, the dataframe columns are associated with the best suited data types. Associating the features
            with the discovered datatypes would impact visualizations and model prediction. Defaults to True.
        types: dict, optional
            Dictionary of <feature_name> : <data_type> to override the data type of features.
            Defaults to None, in which case an empty dictionary is used.
        metadata: dict, optional
            The metadata of ADS Dataset. Defaults to None.
        progress: dataset.progress.ProgressBar, optional
            The progress bar for ADS Dataset. Defaults to None, in which case a new
            progress.DummyProgressBar() is created for this call.
        transformer_pipeline: datasets.pipeline.TransformerPipeline, optional
            A pipeline of transformations done outside the sdk and need to be applied at the time of scoring
        interactive: bool, optional
            Forwarded unchanged to ``ADSDataset.from_dataframe``. Defaults to False.
        kwargs: additional keyword arguments forwarded unchanged to
            ``ADSDataset.from_dataframe``

        Returns
        -------
        ADSDataset:
            An instance of ADSDataset

        Examples
        --------
        >>> import pandas as pd
        >>> df = pd.read_csv(<path_to_csv>)
        >>> ds = df.ads.dataset()
        """
        from ads.dataset.dataset import ADSDataset

        # Build per-call defaults instead of sharing one mutable object
        # (created at definition time) between every invocation.
        if types is None:
            types = {}
        if progress is None:
            # The ``progress`` parameter shadows the module of the same name,
            # so the module is imported here under an alias.
            from ads.dataset import progress as progress_module

            progress = progress_module.DummyProgressBar()

        return ADSDataset.from_dataframe(
            df=self._obj,
            sampled_df=sampled_df,
            shape=shape,
            name=name,
            description=description,
            type_discovery=type_discovery,
            types=types,
            metadata=metadata,
            progress=progress,
            transformer_pipeline=transformer_pipeline,
            interactive=interactive,
            **kwargs,
        )

    def dataset_with_target(
        self,
        target: str,
        sampled_df: pd.DataFrame = None,
        shape: Tuple[int, int] = None,
        target_type: TypedFeature = None,
        positive_class=None,
        **kwargs,
    ):
        """Converts pandas DataFrame into ADS Dataset with target.

        Parameters
        ----------
        target: str
            Name of the target column in the DataFrame. Required.
        sampled_df: pandas.DataFrame, optional
            The sampled pandas DataFrame. Defaults to None.
        shape: Tuple, optional
            The shape of pandas DataFrame. Defaults to None.
        target_type: TypedFeature, optional
            The target type of ADS Dataset. Defaults to None.
        positive_class: Any, optional
            Label in target for binary classification problems which should be identified as positive for modeling.
            By default, the first unique value is considered as the positive label.
        kwargs: additional keyword arguments forwarded unchanged to
            ``ADSDatasetWithTarget.from_dataframe``

        Returns
        -------
        ADSDatasetWithTarget:
            The object returned by ``ADSDatasetWithTarget.from_dataframe``

        Raises
        ------
        ValueError
            Raised by ``ADSDatasetWithTarget.from_dataframe`` when ``target``
            is not a column of the DataFrame.

        Examples
        --------
        >>> import pandas as pd
        >>> df = pd.read_csv(<path_to_csv>)
        >>> ds = df.ads.dataset_with_target(target="target")
        """
        from ads.dataset.dataset_with_target import ADSDatasetWithTarget

        return ADSDatasetWithTarget.from_dataframe(
            df=self._obj,
            target=target,
            sampled_df=sampled_df,
            shape=shape,
            target_type=target_type,
            positive_class=positive_class,
            **kwargs,
        )

ads/feature_engineering/accessor/dataframe_accessor.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*--
33

4-
# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
4+
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

77
"""
@@ -41,6 +41,7 @@
4141
import pandas as pd
4242
from ads.common.utils import DATA_SCHEMA_MAX_COL_NUM
4343
from ads.data_labeling.mixin.data_labeling import DataLabelingAccessMixin
44+
from ads.dataset.mixin.dataset_accessor import ADSDatasetAccessMixin
4445
from ads.dbmixin.db_pandas_accessor import DBAccessMixin
4546
from ads.feature_engineering import schema
4647
from ads.feature_engineering.accessor.mixin.eda_mixin import EDAMixin
@@ -53,7 +54,11 @@
5354

5455
@pd.api.extensions.register_dataframe_accessor("ads")
5556
class ADSDataFrameAccessor(
56-
ADSFeatureTypesMixin, EDAMixin, DBAccessMixin, DataLabelingAccessMixin
57+
ADSFeatureTypesMixin,
58+
EDAMixin,
59+
DBAccessMixin,
60+
DataLabelingAccessMixin,
61+
ADSDatasetAccessMixin
5762
):
5863
"""ADS accessor for the Pandas DataFrame.
5964

tests/unitary/with_extras/dataset/test_dataset_dataset.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,8 @@ def test_initialize_dataset(self):
6262
description="test_description",
6363
storage_options={'config':{},'region':'us-ashburn-1'}
6464
)
65-
assert isinstance(employees, ADSDataset)
66-
assert isinstance(employees.df, pd.DataFrame)
67-
assert isinstance(employees.shape, Tuple)
68-
assert employees.name == "test_dataset"
69-
assert employees.description == "test_description"
70-
assert "type_discovery" in employees.init_kwargs
71-
assert isinstance(employees.transformer_pipeline, TransformerPipeline)
65+
66+
self.assert_dataset(employees)
7267

7368
def test_from_dataframe(self):
7469
employees = ADSDataset.from_dataframe(
@@ -77,13 +72,26 @@ def test_from_dataframe(self):
7772
description="test_description",
7873
storage_options={'config':{},'region':'us-ashburn-1'}
7974
)
80-
assert isinstance(employees, ADSDataset)
81-
assert isinstance(employees.df, pd.DataFrame)
82-
assert isinstance(employees.shape, Tuple)
83-
assert employees.name == "test_dataset"
84-
assert employees.description == "test_description"
85-
assert "type_discovery" in employees.init_kwargs
86-
assert isinstance(employees.transformer_pipeline, TransformerPipeline)
75+
76+
self.assert_dataset(employees)
77+
78+
def test_accessor(self):
    """Build an ADSDataset through ``df.ads.dataset`` and run the shared checks."""
    frame = pd.read_csv(self.get_data_path())
    accessor_kwargs = {
        "name": "test_dataset",
        "description": "test_description",
    }
    employees = frame.ads.dataset(**accessor_kwargs)

    self.assert_dataset(employees)
86+
87+
def assert_dataset(self, dataset):
    """Shared assertions for datasets created by the tests in this class.

    Checks the object's type, its DataFrame and shape, the fixed name and
    description used by the tests, and the presence of the pipeline and of
    the ``type_discovery`` init kwarg.
    """
    # Type of the dataset itself and of the data it wraps.
    assert isinstance(dataset, ADSDataset)
    assert isinstance(dataset.df, pd.DataFrame)
    assert isinstance(dataset.shape, Tuple)

    # Values every caller passes when creating the dataset.
    assert dataset.name == "test_dataset"
    assert dataset.description == "test_description"

    # Construction bookkeeping.
    assert "type_discovery" in dataset.init_kwargs
    assert isinstance(dataset.transformer_pipeline, TransformerPipeline)
8795

8896
def get_data_path(self):
8997
current_dir = os.path.dirname(os.path.abspath(__file__))

tests/unitary/with_extras/dataset/test_dataset_target.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
from typing import Tuple
88
import pandas as pd
9+
import pytest
910
from ads.dataset.classification_dataset import BinaryClassificationDataset
1011
from ads.dataset.dataset_with_target import ADSDatasetWithTarget
1112
from ads.dataset.pipeline import TransformerPipeline
@@ -23,14 +24,9 @@ def test_initialize_dataset_target(self):
2324
)
2425

2526
assert isinstance(employees, ADSDatasetWithTarget)
26-
assert isinstance(employees.df, pd.DataFrame)
27-
assert isinstance(employees.shape, Tuple)
28-
assert isinstance(employees.target, TargetVariable)
29-
assert employees.target.type["type"] == "categorical"
3027
assert employees.name == "test_dataset"
3128
assert employees.description == "test_description"
32-
assert "type_discovery" in employees.init_kwargs
33-
assert isinstance(employees.transformer_pipeline, TransformerPipeline)
29+
self.assert_dataset(employees)
3430

3531
def test_dataset_target_from_dataframe(self):
3632
employees = ADSDatasetWithTarget.from_dataframe(
@@ -40,12 +36,34 @@ def test_dataset_target_from_dataframe(self):
4036
).set_positive_class('Yes')
4137

4238
assert isinstance(employees, BinaryClassificationDataset)
43-
assert isinstance(employees.df, pd.DataFrame)
44-
assert isinstance(employees.shape, Tuple)
45-
assert isinstance(employees.target, TargetVariable)
46-
assert employees.target.type["type"] == "categorical"
47-
assert "type_discovery" in employees.init_kwargs
48-
assert isinstance(employees.transformer_pipeline, TransformerPipeline)
39+
self.assert_dataset(employees)
40+
41+
def test_accessor_with_target(self):
    """Build a dataset with target "Attrition" through the pandas accessor."""
    frame = pd.read_csv(self.get_data_path())
    employees = frame.ads.dataset_with_target(target="Attrition")

    assert isinstance(employees, BinaryClassificationDataset)
    self.assert_dataset(employees)
49+
50+
def test_accessor_with_target_error(self):
    """The accessor must raise ValueError when the target column is missing."""
    import re

    df = pd.read_csv(self.get_data_path())
    wrong_column = "wrong_column"
    expected_message = (
        f"{wrong_column} column doesn't exist in data frame. "
        "Specify a valid one instead."
    )

    # ``match`` is applied as a regular expression, so the literal message is
    # escaped; otherwise "." would match any character.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        # The return value is irrelevant: the call itself is expected to raise.
        df.ads.dataset_with_target(target=wrong_column)
59+
60+
def assert_dataset(self, dataset):
    """Shared assertions for target-aware datasets created by these tests.

    Checks the wrapped DataFrame and shape, that the target is a categorical
    ``TargetVariable``, and the presence of the pipeline and of the
    ``type_discovery`` init kwarg.
    """
    # Underlying data.
    assert isinstance(dataset.df, pd.DataFrame)
    assert isinstance(dataset.shape, Tuple)

    # Target column metadata.
    assert isinstance(dataset.target, TargetVariable)
    assert dataset.target.type["type"] == "categorical"

    # Construction bookkeeping.
    assert "type_discovery" in dataset.init_kwargs
    assert isinstance(dataset.transformer_pipeline, TransformerPipeline)
4967

5068
def get_data_path(self):
5169
current_dir = os.path.dirname(os.path.abspath(__file__))

0 commit comments

Comments
 (0)