31
31
DatasetDefaults ,
32
32
deprecate_default_value ,
33
33
deprecate_variable ,
34
+ get_dataset ,
35
+ infer_target_type ,
34
36
)
35
37
from ads .dataset .label_encoder import DataFrameLabelEncoder
36
38
from ads .dataset .pipeline import TransformerPipeline
@@ -223,7 +225,8 @@ def _head(self, n=5):
223
225
224
226
Examples
225
227
--------
226
- >>> ds = DatasetFactory.open("classfication_data.csv")
228
+ >>> import pandas as pd
229
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
227
230
>>> ds.head()
228
231
* displays the first 5 rows of the dataset, just as the traditional head() function would *
229
232
"""
@@ -298,7 +301,8 @@ def call(self, func, *args, sample_size=None, **kwargs):
298
301
299
302
Examples
300
303
--------
301
- >>> ds = DatasetFactory.open("classfication_data.csv")
304
+ >>> import pandas as pd
305
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
302
306
>>> def f1(df):
303
307
...     return df.sum(axis=0)
304
308
>>> sum_ds = ds.call(f1)
@@ -340,20 +344,19 @@ def set_target(self, target, type_discovery=True, target_type=None):
340
344
341
345
Examples
342
346
--------
343
- >>> ds = DatasetFactory.open("classfication_data.csv")
347
+ >>> import pandas as pd
348
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("classification_data.csv"))
344
349
>>> ds_with_target= ds.set_target("target_class")
345
350
"""
346
- from ads .dataset .factory import DatasetFactory
347
-
348
351
if target_type :
349
352
target_series = self .sampled_df [target ].astype (target_type )
350
353
else :
351
354
target_series = self .sampled_df [target ]
352
- return DatasetFactory . _get_dataset (
355
+ return get_dataset (
353
356
self .df ,
354
357
self .sampled_df ,
355
358
target ,
356
- DatasetFactory . infer_target_type (target , target_series , type_discovery ),
359
+ infer_target_type (target , target_series , type_discovery ),
357
360
self .shape ,
358
361
** self .init_kwargs ,
359
362
)
@@ -396,7 +399,8 @@ def to_pandas(self, filter=None, frac=None, include_transformer_pipeline=False):
396
399
397
400
Examples
398
401
--------
399
- >>> ds = DatasetFactory.open("data.csv")
402
+ >>> import pandas as pd
403
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
400
404
>>> ds_as_df = ds.to_pandas()
401
405
402
406
Notes
@@ -462,7 +466,8 @@ def to_dask(
462
466
463
467
Examples
464
468
--------
465
- >>> ds = DatasetFactory.open("data.csv")
469
+ >>> import pandas as pd
470
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
466
471
>>> ds_dask = ds.to_dask()
467
472
468
473
Notes
@@ -521,7 +526,8 @@ def to_h2o(self, filter=None, frac=None, include_transformer_pipeline=False):
521
526
522
527
Examples
523
528
--------
524
- >>> ds = DatasetFactory.open("data.csv")
529
+ >>> import pandas as pd
530
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
525
531
>>> ds_as_h2o = ds.to_h2o()
526
532
527
533
Notes
@@ -578,7 +584,8 @@ def to_xgb(self, filter=None, frac=None, include_transformer_pipeline=False):
578
584
579
585
Examples
580
586
--------
581
- >>> ds = DatasetFactory.open("data.csv")
587
+ >>> import pandas as pd
588
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
582
589
>>> xgb_dmat = ds.to_xgb()
583
590
584
591
Notes
@@ -617,7 +624,8 @@ def sample(self, frac=None, random_state=utils.random_state):
617
624
618
625
Examples
619
626
--------
620
- >>> ds = DatasetFactory.open("data.csv")
627
+ >>> import pandas as pd
628
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
621
629
>>> ds_sample = ds.sample()
622
630
"""
623
631
df = self .df .sample (frac = frac , random_state = random_state )
@@ -644,7 +652,8 @@ def drop_columns(self, columns):
644
652
645
653
Examples
646
654
--------
647
- >>> ds = DatasetFactory.open("data.csv")
655
+ >>> import pandas as pd
656
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
648
657
>>> ds_smaller = ds.drop_columns(['col1', 'col2'])
649
658
"""
650
659
self ._validate_feature (columns )
@@ -671,7 +680,8 @@ def assign_column(self, column, arg):
671
680
672
681
Examples
673
682
--------
674
- >>> ds = DatasetFactory.open("data.csv")
683
+ >>> import pandas as pd
684
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
675
685
>>> ds_same_size = ds.assign_column('target', lambda x: x > 15 if x is not None else None)
676
686
>>> ds_bigger = ds.assign_column('new_col', np.arange(ds.shape[0]))
677
687
"""
@@ -746,7 +756,8 @@ def rename_columns(self, columns):
746
756
747
757
Examples
748
758
--------
749
- >>> ds = DatasetFactory.open("data.csv")
759
+ >>> import pandas as pd
760
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
750
761
>>> ds_renamed = ds.rename_columns({'col1': 'target'})
751
762
"""
752
763
if isinstance (columns , list ):
@@ -770,7 +781,8 @@ def set_name(self, name):
770
781
771
782
Examples
772
783
--------
773
- >>> ds = DatasetFactory.open("data1.csv")
784
+ >>> import pandas as pd
785
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
774
786
>>> ds_renamed = ds.set_name("dataset1")
775
787
"""
776
788
self .name = name
@@ -788,7 +800,8 @@ def set_description(self, description):
788
800
789
801
Examples
790
802
--------
791
- >>> ds = DatasetFactory.open("data1.csv")
803
+ >>> import pandas as pd
804
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data1.csv"))
792
805
>>> ds_renamed = ds.set_description('dataset1 is from "data1.csv"')
793
806
"""
794
807
self .description = description
@@ -821,7 +834,8 @@ def snapshot(self, snapshot_dir=None, name="", storage_options=None):
821
834
822
835
Examples
823
836
--------
824
- >>> ds = DatasetFactory.open("data.csv")
837
+ >>> import pandas as pd
838
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
825
839
>>> ds_uri = ds.snapshot()
826
840
"""
827
841
if snapshot_dir is None :
@@ -873,7 +887,8 @@ def to_csv(self, path, storage_options=None, **kwargs):
873
887
874
888
Examples
875
889
--------
876
- >>> ds = DatasetFactory.open("data.csv")
890
+ >>> import pandas as pd
891
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
877
892
>>> [ds_link] = ds.to_csv("my/path.csv")
878
893
"""
879
894
if storage_options is None :
@@ -900,7 +915,8 @@ def to_parquet(self, path, storage_options=None, **kwargs):
900
915
901
916
Examples
902
917
--------
903
- >>> ds = DatasetFactory.open("data.csv")
918
+ >>> import pandas as pd
919
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
904
920
>>> ds.to_parquet("my/path")
905
921
"""
906
922
if storage_options is None :
@@ -927,7 +943,8 @@ def to_json(self, path, storage_options=None, **kwargs):
927
943
928
944
Examples
929
945
--------
930
- >>> ds = DatasetFactory.open("data.csv")
946
+ >>> import pandas as pd
947
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
931
948
>>> ds.to_json("my/path.json")
932
949
"""
933
950
if storage_options is None :
@@ -962,7 +979,8 @@ def to_hdf(
962
979
963
980
Examples
964
981
--------
965
- >>> ds = DatasetFactory.open("data.csv")
982
+ >>> import pandas as pd
983
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
966
984
>>> ds.to_hdf(path="my/path.h5", key="df")
967
985
"""
968
986
if storage_options is None :
@@ -1035,7 +1053,13 @@ def to_avro(self, path, schema=None, storage_options=None, **kwargs):
1035
1053
1036
1054
Examples
1037
1055
--------
1038
- >>> ds = DatasetFactory.open("data.avro")
1056
+ >>> import pandas
1057
+ >>> import fastavro
1058
+ >>> with open("data.avro", "rb") as fp:
1059
+ ...     reader = fastavro.reader(fp)
1060
+ ...     records = [r for r in reader]
1061
+ >>> df = pandas.DataFrame.from_records(records)
1062
+ >>> ds = ADSDataset.from_dataframe(df)
1039
1063
>>> ds.to_avro("my/path.avro")
1040
1064
"""
1041
1065
# Get the row by row formatting
@@ -1101,7 +1125,8 @@ def astype(self, types):
1101
1125
1102
1126
Examples
1103
1127
--------
1104
- >>> ds = DatasetFactory.open("data.csv")
1128
+ >>> import pandas as pd
1129
+ >>> ds = ADSDataset.from_dataframe(pd.read_csv("data.csv"))
1105
1130
>>> ds_reformatted = ds.astype({"target": "categorical"})
1106
1131
"""
1107
1132
return self .__getattr__ ("astype" )(helper .map_types (types ))
@@ -1119,8 +1144,10 @@ def merge(self, data, **kwargs):
1119
1144
1120
1145
Examples
1121
1146
--------
1122
- >>> ds1 = DatasetFactory.open("data1.csv")
1123
- >>> ds2 = DatasetFactory.open("data2.csv")
1147
+ >>> import pandas as pd
1148
+ >>> df1 = pd.read_csv("data1.csv")
1149
+ >>> df2 = pd.read_csv("data2.csv")
1150
+ >>> ds1, ds2 = ADSDataset.from_dataframe(df1), ADSDataset.from_dataframe(df2)
1124
1151
>>> ds_12 = ds1.merge(ds2)
1125
1152
"""
1126
1153
assert isinstance (data , pd .DataFrame ) or isinstance (
@@ -1275,9 +1302,8 @@ def _build_new_dataset(
1275
1302
if progress :
1276
1303
progress .update ("Building new dataset" )
1277
1304
target_type = self .target .type if target_type is None else target_type
1278
- from ads .dataset .factory import DatasetFactory
1279
1305
1280
- new_ds = DatasetFactory . _get_dataset (
1306
+ new_ds = get_dataset (
1281
1307
df ,
1282
1308
sampled_df ,
1283
1309
target ,
0 commit comments