@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from __future__ import absolute_import, print_function
@@ -10,7 +10,7 @@
 import importlib
 from collections import defaultdict
 from numbers import Number
-from typing import Union
+from typing import Tuple, Union
 
 import pandas as pd
 from ads.common import utils, logger
@@ -23,14 +23,30 @@
 from ads.dataset.dataset import ADSDataset
 from ads.dataset.feature_engineering_transformer import FeatureEngineeringTransformer
 from ads.dataset.feature_selection import FeatureImportance
-from ads.dataset.helper import deprecate_default_value, deprecate_variable
+from ads.dataset.helper import (
+    DatasetDefaults,
+    deprecate_default_value,
+    deprecate_variable,
+    generate_sample,
+    get_target_type,
+    is_text_data,
+)
 from ads.dataset.label_encoder import DataFrameLabelEncoder
 from ads.dataset.pipeline import TransformerPipeline
 from ads.dataset.progress import DummyProgressBar
 from ads.dataset.recommendation import Recommendation
 from ads.dataset.recommendation_transformer import RecommendationTransformer
 from ads.dataset.target import TargetVariable
-from ads.type_discovery.typed_feature import DateTimeTypedFeature
+from ads.type_discovery.typed_feature import (
+    CategoricalTypedFeature,
+    ContinuousTypedFeature,
+    DocumentTypedFeature,
+    GISTypedFeature,
+    OrdinalTypedFeature,
+    TypedFeature,
+    DateTimeTypedFeature,
+    TypedFeature
+)
 from sklearn.model_selection import train_test_split
 from pandas.io.formats.printing import pprint_thing
 from sklearn.preprocessing import FunctionTransformer
@@ -45,10 +61,10 @@ class ADSDatasetWithTarget(ADSDataset, metaclass=ABCMeta):
     def __init__(
         self,
         df,
-        sampled_df,
         target,
-        target_type,
-        shape,
+        sampled_df=None,
+        shape=None,
+        target_type=None,
         sample_max_rows=-1,
         type_discovery=True,
         types={},
@@ -61,6 +77,16 @@ def __init__(
         **kwargs,
     ):
         self.recommendation_transformer = None
+        if shape is None:
+            shape = df.shape
+        if sampled_df is None:
+            sampled_df = generate_sample(
+                df,
+                shape[0],
+                DatasetDefaults.sampling_confidence_level,
+                DatasetDefaults.sampling_confidence_interval,
+                **kwargs,
+            )
 
         if parent is None:
             cols = sampled_df.columns.tolist()
@@ -135,6 +161,8 @@ def __init__(
             cols.insert(0, cols.pop(cols.index(target)))
             self.sampled_df = self.sampled_df[[*cols]]
 
+        if target_type is None:
+            target_type = get_target_type(target, sampled_df, **kwargs)
         self.target = TargetVariable(self, target, target_type)
 
         # remove target from type discovery conversion
@@ -145,6 +173,141 @@ def __init__(
             ):
                 step[1].kw_args["dtypes"].pop(self.target.name)
 
+    @staticmethod
+    def from_dataframe(
+        df: pd.DataFrame,
+        target: str,
+        sampled_df: pd.DataFrame = None,
+        shape: Tuple[int, int] = None,
+        target_type: TypedFeature = None,
+        positive_class=None,
+        **init_kwargs,
+    ):
+        from ads.dataset.classification_dataset import (
+            BinaryClassificationDataset,
+            BinaryTextClassificationDataset,
+            MultiClassClassificationDataset,
+            MultiClassTextClassificationDataset
+        )
+        from ads.dataset.forecasting_dataset import ForecastingDataset
+        from ads.dataset.regression_dataset import RegressionDataset
+
+        if sampled_df is None:
+            sampled_df = generate_sample(
+                df,
+                (shape or df.shape)[0],
+                DatasetDefaults.sampling_confidence_level,
+                DatasetDefaults.sampling_confidence_interval,
+                **init_kwargs,
+            )
+
+        if target_type is None:
+            target_type = get_target_type(target, sampled_df, **init_kwargs)
+
+        if len(df[target].dropna()) == 0:
+            logger.warning(
+                "It is not recommended to use an empty column as the target variable."
+            )
+            raise ValueError(
+                f"We do not support using empty columns as the chosen target"
+            )
+        if utils.is_same_class(target_type, ContinuousTypedFeature):
+            return RegressionDataset(
+                df=df,
+                sampled_df=sampled_df,
+                target=target,
+                target_type=target_type,
+                shape=shape,
+                **init_kwargs,
+            )
+        elif utils.is_same_class(
+            target_type, DateTimeTypedFeature
+        ) or df.index.dtype.name.startswith("datetime"):
+            return ForecastingDataset(
+                df=df,
+                sampled_df=sampled_df,
+                target=target,
+                target_type=target_type,
+                shape=shape,
+                **init_kwargs,
+            )
+
+        # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
+        elif utils.is_same_class(target_type, CategoricalTypedFeature) or utils.is_same_class(
+            target_type, OrdinalTypedFeature
+        ):
+            if target_type.meta_data["internal"]["unique"] == 2:
+                if is_text_data(sampled_df, target):
+                    return BinaryTextClassificationDataset(
+                        df=df,
+                        sampled_df=sampled_df,
+                        target=target,
+                        shape=shape,
+                        target_type=target_type,
+                        positive_class=positive_class,
+                        **init_kwargs,
+                    )
+
+                return BinaryClassificationDataset(
+                    df=df,
+                    sampled_df=sampled_df,
+                    target=target,
+                    shape=shape,
+                    target_type=target_type,
+                    positive_class=positive_class,
+                    **init_kwargs,
+                )
+            else:
+                if is_text_data(sampled_df, target):
+                    return MultiClassTextClassificationDataset(
+                        df=df,
+                        sampled_df=sampled_df,
+                        target=target,
+                        target_type=target_type,
+                        shape=shape,
+                        **init_kwargs,
+                    )
+                return MultiClassClassificationDataset(
+                    df=df,
+                    sampled_df=sampled_df,
+                    target=target,
+                    target_type=target_type,
+                    shape=shape,
+                    **init_kwargs,
+                )
+        elif (
+            utils.is_same_class(target, DocumentTypedFeature)
+            or "text" in target_type["type"]
+            or "text" in target
+        ):
+            raise ValueError(
+                f"The column {target} cannot be used as the target column."
+            )
+        elif (
+            utils.is_same_class(target_type, GISTypedFeature)
+            or "coord" in target_type["type"]
+            or "coord" in target
+        ):
+            raise ValueError(
+                f"The column {target} cannot be used as the target column."
+            )
+        # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
+        # binary target, but only data on one instance
+        elif target_type and target_type["low_level_type"] == "bool":
+            return BinaryClassificationDataset(
+                df=df,
+                sampled_df=sampled_df,
+                target=target,
+                shape=shape,
+                target_type=target_type,
+                positive_class=positive_class,
+                **init_kwargs,
+            )
+        raise ValueError(
+            f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
+            f"For example, types = {{{target}: 'category'}}"
+        )
+
     def rename_columns(self, columns):
         """
         Returns a dataset with columns renamed.
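
Below is a brief, hypothetical usage sketch of the from_dataframe factory added in this diff; it is not part of the change itself. It assumes the class lives at ads.dataset.dataset_with_target and uses an illustrative DataFrame with a numeric "price" target. With sampled_df, shape, and target_type left as None, the new code derives them via generate_sample() and get_target_type(), and a continuous target is expected to dispatch to RegressionDataset.

    # Hypothetical usage sketch; module path and column names are illustrative.
    import pandas as pd
    from ads.dataset.dataset_with_target import ADSDatasetWithTarget

    # Toy frame with a continuous target column.
    df = pd.DataFrame(
        {
            "sqft": [800, 950, 1200, 1500, 2000],
            "price": [100.0, 120.5, 180.0, 210.0, 260.0],
        }
    )

    # sampled_df, shape, and target_type default to None and are derived internally;
    # a continuous target should yield a RegressionDataset instance.
    ds = ADSDatasetWithTarget.from_dataframe(df=df, target="price")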