Skip to content

Commit 8fbf417

Browse files
authored
Deprecated DatasetFactory class and refactored code. (#254)
2 parents f0bcd44 + f72411e commit 8fbf417

File tree

4 files changed

+673
-35
lines changed

4 files changed

+673
-35
lines changed

ads/dataset/dataset.py

Lines changed: 54 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
DatasetDefaults,
3232
deprecate_default_value,
3333
deprecate_variable,
34+
get_dataset,
35+
infer_target_type,
3436
)
3537
from ads.dataset.label_encoder import DataFrameLabelEncoder
3638
from ads.dataset.pipeline import TransformerPipeline
@@ -223,7 +225,8 @@ def _head(self, n=5):
223225
224226
Examples
225227
--------
226-
>>> ds = DatasetFactory.open("classification_data.csv")
228+
>>> import pandas as pd
229+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
227230
>>> ds.head()
228231
* displays the first 5 rows of the dataset, just as the traditional head() function would *
229232
"""
@@ -298,7 +301,8 @@ def call(self, func, *args, sample_size=None, **kwargs):
298301
299302
Examples
300303
--------
301-
>>> ds = DatasetFactory.open("classification_data.csv")
304+
>>> import pandas as pd
305+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
302306
>>> def f1(df):
303307
... return(sum(df), axis=0)
304308
>>> sum_ds = ds.call(f1)
@@ -340,20 +344,19 @@ def set_target(self, target, type_discovery=True, target_type=None):
340344
341345
Examples
342346
--------
343-
>>> ds = DatasetFactory.open("classification_data.csv")
347+
>>> import pandas as pd
348+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
344349
>>> ds_with_target= ds.set_target("target_class")
345350
"""
346-
from ads.dataset.factory import DatasetFactory
347-
348351
if target_type:
349352
target_series = self.sampled_df[target].astype(target_type)
350353
else:
351354
target_series = self.sampled_df[target]
352-
return DatasetFactory._get_dataset(
355+
return get_dataset(
353356
self.df,
354357
self.sampled_df,
355358
target,
356-
DatasetFactory.infer_target_type(target, target_series, type_discovery),
359+
infer_target_type(target, target_series, type_discovery),
357360
self.shape,
358361
**self.init_kwargs,
359362
)
@@ -396,7 +399,8 @@ def to_pandas(self, filter=None, frac=None, include_transformer_pipeline=False):
396399
397400
Examples
398401
--------
399-
>>> ds = DatasetFactory.open("data.csv")
402+
>>> import pandas as pd
403+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
400404
>>> ds_as_df = ds.to_pandas()
401405
402406
Notes
@@ -462,7 +466,8 @@ def to_dask(
462466
463467
Examples
464468
--------
465-
>>> ds = DatasetFactory.open("data.csv")
469+
>>> import pandas as pd
470+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
466471
>>> ds_dask = ds.to_dask()
467472
468473
Notes
@@ -521,7 +526,8 @@ def to_h2o(self, filter=None, frac=None, include_transformer_pipeline=False):
521526
522527
Examples
523528
--------
524-
>>> ds = DatasetFactory.open("data.csv")
529+
>>> import pandas as pd
530+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
525531
>>> ds_as_h2o = ds.to_h2o()
526532
527533
Notes
@@ -578,7 +584,8 @@ def to_xgb(self, filter=None, frac=None, include_transformer_pipeline=False):
578584
579585
Examples
580586
--------
581-
>>> ds = DatasetFactory.open("data.csv")
587+
>>> import pandas as pd
588+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
582589
>>> xgb_dmat = ds.to_xgb()
583590
584591
Notes
@@ -617,7 +624,8 @@ def sample(self, frac=None, random_state=utils.random_state):
617624
618625
Examples
619626
--------
620-
>>> ds = DatasetFactory.open("data.csv")
627+
>>> import pandas as pd
628+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
621629
>>> ds_sample = ds.sample()
622630
"""
623631
df = self.df.sample(frac=frac, random_state=random_state)
@@ -644,7 +652,8 @@ def drop_columns(self, columns):
644652
645653
Examples
646654
--------
647-
>>> ds = DatasetFactory.open("data.csv")
655+
>>> import pandas as pd
656+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
648657
>>> ds_smaller = ds.drop_columns(['col1', 'col2'])
649658
"""
650659
self._validate_feature(columns)
@@ -671,7 +680,8 @@ def assign_column(self, column, arg):
671680
672681
Examples
673682
--------
674-
>>> ds = DatasetFactory.open("data.csv")
683+
>>> import pandas as pd
684+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
675685
>>> ds_same_size = ds.assign_column('target', lambda x: x > 15 if x is not None else None)
676686
>>> ds_bigger = ds.assign_column('new_col', np.arange(ds.shape[0]))
677687
"""
@@ -746,7 +756,8 @@ def rename_columns(self, columns):
746756
747757
Examples
748758
--------
749-
>>> ds = DatasetFactory.open("data.csv")
759+
>>> import pandas as pd
760+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
750761
>>> ds_renamed = ds.rename_columns({'col1': 'target'})
751762
"""
752763
if isinstance(columns, list):
@@ -770,7 +781,8 @@ def set_name(self, name):
770781
771782
Examples
772783
--------
773-
>>> ds = DatasetFactory.open("data1.csv")
784+
>>> import pandas as pd
785+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
774786
>>> ds_renamed = ds.set_name("dataset1")
775787
"""
776788
self.name = name
@@ -788,7 +800,8 @@ def set_description(self, description):
788800
789801
Examples
790802
--------
791-
>>> ds = DatasetFactory.open("data1.csv")
803+
>>> import pandas as pd
804+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
792805
>>> ds_renamed = ds.set_description("dataset1 is from 'data1.csv'")
793806
"""
794807
self.description = description
@@ -821,7 +834,8 @@ def snapshot(self, snapshot_dir=None, name="", storage_options=None):
821834
822835
Examples
823836
--------
824-
>>> ds = DatasetFactory.open("data.csv")
837+
>>> import pandas as pd
838+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
825839
>>> ds_uri = ds.snapshot()
826840
"""
827841
if snapshot_dir is None:
@@ -873,7 +887,8 @@ def to_csv(self, path, storage_options=None, **kwargs):
873887
874888
Examples
875889
--------
876-
>>> ds = DatasetFactory.open("data.csv")
890+
>>> import pandas as pd
891+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
877892
>>> [ds_link] = ds.to_csv("my/path.csv")
878893
"""
879894
if storage_options is None:
@@ -900,7 +915,8 @@ def to_parquet(self, path, storage_options=None, **kwargs):
900915
901916
Examples
902917
--------
903-
>>> ds = DatasetFactory.open("data.csv")
918+
>>> import pandas as pd
919+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
904920
>>> ds.to_parquet("my/path")
905921
"""
906922
if storage_options is None:
@@ -927,7 +943,8 @@ def to_json(self, path, storage_options=None, **kwargs):
927943
928944
Examples
929945
--------
930-
>>> ds = DatasetFactory.open("data.csv")
946+
>>> import pandas as pd
947+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
931948
>>> ds.to_json("my/path.json")
932949
"""
933950
if storage_options is None:
@@ -962,7 +979,8 @@ def to_hdf(
962979
963980
Examples
964981
--------
965-
>>> ds = DatasetFactory.open("data.csv")
982+
>>> import pandas as pd
983+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
966984
>>> ds.to_hdf(path="my/path.h5", key="df")
967985
"""
968986
if storage_options is None:
@@ -1035,7 +1053,13 @@ def to_avro(self, path, schema=None, storage_options=None, **kwargs):
10351053
10361054
Examples
10371055
--------
1038-
>>> ds = DatasetFactory.open("data.avro")
1056+
>>> import pandas
1057+
>>> import fastavro
1058+
>>> with open("data.avro", "rb") as fp:
1059+
>>> reader = fastavro.reader(fp)
1060+
>>> records = [r for r in reader]
1061+
>>> df = pandas.DataFrame.from_records(records)
1062+
>>> ds = ADSDataset.from_dataframe(df)
10391063
>>> ds.to_avro("my/path.avro")
10401064
"""
10411065
# Get the row by row formatting
@@ -1101,7 +1125,8 @@ def astype(self, types):
11011125
11021126
Examples
11031127
--------
1104-
>>> ds = DatasetFactory.open("data.csv")
1128+
>>> import pandas as pd
1129+
>>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
11051130
>>> ds_reformatted = ds.astype({"target": "categorical"})
11061131
"""
11071132
return self.__getattr__("astype")(helper.map_types(types))
@@ -1119,8 +1144,10 @@ def merge(self, data, **kwargs):
11191144
11201145
Examples
11211146
--------
1122-
>>> ds1 = DatasetFactory.open("data1.csv")
1123-
>>> ds2 = DatasetFactory.open("data2.csv")
1147+
>>> import pandas as pd
1148+
>>> df1 = pd.read_csv("data1.csv")
1149+
>>> df2 = pd.read_csv("data2.csv")
1150+
>>> ds = ADSDataset.from_dataframe(df1.merge(df2))
11241151
>>> ds_12 = ds1.merge(ds2)
11251152
"""
11261153
assert isinstance(data, pd.DataFrame) or isinstance(
@@ -1275,9 +1302,8 @@ def _build_new_dataset(
12751302
if progress:
12761303
progress.update("Building new dataset")
12771304
target_type = self.target.type if target_type is None else target_type
1278-
from ads.dataset.factory import DatasetFactory
12791305

1280-
new_ds = DatasetFactory._get_dataset(
1306+
new_ds = get_dataset(
12811307
df,
12821308
sampled_df,
12831309
target,

ads/dataset/dataset_browser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import pandas as pd
2020
import sklearn.datasets as sk_datasets
2121

22-
from ads.dataset.factory import DatasetFactory
22+
from ads.dataset import helper
2323
from ads.common.utils import inject_and_copy_kwargs
2424
from ads.common.decorator.runtime_dependency import (
2525
runtime_dependency,
@@ -170,7 +170,7 @@ def open(self, name: str, **kwargs):
170170

171171
for obj in self._generate_filelist():
172172
if obj["name"] == name:
173-
return DatasetFactory.open(
173+
return helper.open(
174174
**inject_and_copy_kwargs(
175175
kwargs,
176176
**{
@@ -202,7 +202,7 @@ def open(self, name: str, **kwargs):
202202

203203
for obj in self._generate_filelist():
204204
if obj["name"] == name:
205-
return DatasetFactory.open(
205+
return helper.open(
206206
**inject_and_copy_kwargs(
207207
kwargs,
208208
**{
@@ -286,7 +286,7 @@ def open(self, name: str, **kwargs):
286286
#
287287
for obj in self.listing:
288288
if obj["name"] == name:
289-
return DatasetFactory.open(
289+
return helper.open(
290290
obj["url"],
291291
format=obj["format"],
292292
name=obj["name"],
@@ -307,7 +307,7 @@ def list(self, filter_pattern: str = ".*") -> List[str]:
307307
@runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ)
308308
def open(self, name: str, **kwargs):
309309
if name in self.dataset_names:
310-
return DatasetFactory.open(
310+
return helper.open(
311311
seaborn.load_dataset(name), name=name, description="from seaborn"
312312
)
313313
else:
@@ -350,7 +350,7 @@ def open(self, name: str, **kwargs):
350350
df = pd.DataFrame(data.data, columns=data.feature_names)
351351
df["target"] = pd.Series(data.target)
352352

353-
return DatasetFactory.open(
353+
return helper.open(
354354
df, target="target", name=name, description=description
355355
)
356356

ads/dataset/factory.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,17 @@
5959
mindate = datetime.date(datetime.MINYEAR, 1, 1)
6060

6161

62+
warnings.warn(
63+
(
64+
"The `ads.dataset.factory` is deprecated in `oracle-ads 2.8.8` and will be removed in `oracle-ads 3.0`. "
65+
"Use Pandas to read from local files or object storage directly. "
66+
"Check https://accelerated-data-science.readthedocs.io/en/latest/user_guide/loading_data/connect.html."
67+
),
68+
DeprecationWarning,
69+
stacklevel=2,
70+
)
71+
72+
6273
class DatasetFactory:
6374
@staticmethod
6475
@deprecated(

0 commit comments

Comments
 (0)