
Commit ca5766d

Update pandas version requirement to pandas>1.2.1,<2.1 (#308)
2 parents 6fbe4e4 + fab05dc commit ca5766d

22 files changed (+90, -93 lines)

ads/dataset/helper.py

Lines changed: 25 additions & 27 deletions
@@ -314,7 +314,6 @@ def _get_dtype_from_error(e):
     error_string = str(e)
 
     if "mismatched dtypes" in error_string.lower():
-
         # For the mismatched dtypes error, dask either returns a error message containing the dtype argument
         # to specify, or the found and expected dtypes in a table format, depending on what stage
         # the type inferencing fails. The below logic supports building the dtype dictionary for both cases

@@ -732,8 +731,8 @@ def down_sample(df, target):
     """
     dfs = []
     target_value_counts = df[target].value_counts()
-    min_key = min(target_value_counts.iteritems(), key=lambda k: k[1])
-    for key, value in target_value_counts.iteritems():
+    min_key = min(target_value_counts.items(), key=lambda k: k[1])
+    for key, value in target_value_counts.items():
         if key != min_key[0]:
             dfs.append(
                 df[df[target] == key].sample(frac=1 - ((value - min_key[1]) / value))
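
The `iteritems()` calls swapped out above are the heart of this commit: `Series.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0, while `Series.items()` yields the same `(label, value)` pairs on every supported version. A minimal sketch of the `down_sample` pattern, using a made-up toy target column rather than the dataset's real one:

    import pandas as pd

    # Hypothetical toy target column; the real code iterates df[target].value_counts().
    target = pd.Series(["a", "a", "a", "b", "b", "c"])
    value_counts = target.value_counts()

    # items() yields (class label, count) pairs on pandas 1.x and 2.x alike;
    # iteritems() raises AttributeError once pandas 2.0 removed it.
    min_key = min(value_counts.items(), key=lambda k: k[1])
    print(min_key)  # ('c', 1) -- the minority class and its count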
@@ -835,6 +834,7 @@ def _log_yscale_not_set():
         "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`."
     )
 
+
 def infer_target_type(target, target_series, discover_target_type=True):
     # if type discovery is turned off, infer type from pandas dtype
     if discover_target_type:

@@ -845,13 +845,15 @@ def infer_target_type(target, target_series, discover_target_type=True):
         target_type = get_feature_type(target, target_series)
     return target_type
 
+
 def get_target_type(target, sampled_df, **init_kwargs):
     discover_target_type = init_kwargs.get("type_discovery", True)
     if target in init_kwargs.get("types", {}):
         sampled_df[target] = sampled_df[target].astype(init_kwargs.get("types")[target])
         discover_target_type = False
     return infer_target_type(target, sampled_df[target], discover_target_type)
 
+
 def get_dataset(
     df: pd.DataFrame,
     sampled_df: pd.DataFrame,

@@ -860,12 +862,12 @@ def get_dataset(
     shape: Tuple[int, int],
     positive_class=None,
     **init_kwargs,
-):
+):
     from ads.dataset.classification_dataset import (
-        BinaryClassificationDataset,
-        BinaryTextClassificationDataset,
-        MultiClassClassificationDataset,
-        MultiClassTextClassificationDataset
+        BinaryClassificationDataset,
+        BinaryTextClassificationDataset,
+        MultiClassClassificationDataset,
+        MultiClassTextClassificationDataset,
     )
     from ads.dataset.forecasting_dataset import ForecastingDataset
     from ads.dataset.regression_dataset import RegressionDataset

@@ -874,9 +876,7 @@ def get_dataset(
         logger.warning(
             "It is not recommended to use an empty column as the target variable."
         )
-        raise ValueError(
-            f"We do not support using empty columns as the chosen target"
-        )
+        raise ValueError(f"We do not support using empty columns as the chosen target")
     if utils.is_same_class(target_type, ContinuousTypedFeature):
         return RegressionDataset(
             df=df,

@@ -899,9 +899,9 @@ def get_dataset(
         )
 
     # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
-    elif utils.is_same_class(target_type, CategoricalTypedFeature) or utils.is_same_class(
-        target_type, OrdinalTypedFeature
-    ):
+    elif utils.is_same_class(
+        target_type, CategoricalTypedFeature
+    ) or utils.is_same_class(target_type, OrdinalTypedFeature):
         if target_type.meta_data["internal"]["unique"] == 2:
             if is_text_data(sampled_df, target):
                 return BinaryTextClassificationDataset(

@@ -946,17 +946,13 @@ def get_dataset(
         or "text" in target_type["type"]
         or "text" in target
     ):
-        raise ValueError(
-            f"The column {target} cannot be used as the target column."
-        )
+        raise ValueError(f"The column {target} cannot be used as the target column.")
     elif (
         utils.is_same_class(target_type, GISTypedFeature)
         or "coord" in target_type["type"]
         or "coord" in target
     ):
-        raise ValueError(
-            f"The column {target} cannot be used as the target column."
-        )
+        raise ValueError(f"The column {target} cannot be used as the target column.")
     # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
     # binary target, but only data on one instance
     elif target_type["low_level_type"] == "bool":

@@ -974,6 +970,7 @@ def get_dataset(
             f"For example, types = {{{target}: 'category'}}"
         )
 
+
 def open(
     source,
     target=None,

@@ -1074,9 +1071,7 @@ def open(
         progress.update("Opening data")
         path = ElaboratedPath(source, format=format, **kwargs)
         reader_fn = (
-            get_format_reader(path=path, **kwargs)
-            if reader_fn is None
-            else reader_fn
+            get_format_reader(path=path, **kwargs) if reader_fn is None else reader_fn
         )
         df = load_dataset(path=path, reader_fn=reader_fn, **kwargs)
         name = path.name

@@ -1108,6 +1103,7 @@ def open(
         ),
     )
 
+
 def build_dataset(
     df: pd.DataFrame,
     shape: Tuple[int, int],

@@ -1149,9 +1145,7 @@ def build_dataset(
         discover_target_type = False
 
     # if type discovery is turned off, infer type from pandas dtype
-    target_type = infer_target_type(
-        target, sampled_df[target], discover_target_type
-    )
+    target_type = infer_target_type(target, sampled_df[target], discover_target_type)
 
     result = get_dataset(
         df=df,

@@ -1168,6 +1162,7 @@ def build_dataset(
     )
     return result
 
+
 class CustomFormatReaders:
     @staticmethod
     def read_tsv(path: str, **kwargs) -> pd.DataFrame:

@@ -1352,7 +1347,6 @@ def read_xml(path: str, **kwargs) -> pd.DataFrame:
         import xml.etree.cElementTree as et
 
         def get_children(df, node, parent, i):
-
             for name in node.attrib.keys():
                 df.at[i, parent + name] = node.attrib[name]
             for child in list(node):

@@ -1374,6 +1368,7 @@ def get_children(df, node, parent, i):
             last_i = i
         return ret_df
 
+
 reader_fns = {
     "csv": pd.read_csv,
     "tsv": CustomFormatReaders.read_tsv,

@@ -1399,13 +1394,15 @@ def get_children(df, node, parent, i):
     "xml": CustomFormatReaders.read_xml,
 }
 
+
 def validate_kwargs(func: Callable, kwargs):
     valid_params = inspect.signature(func).parameters
     if "kwargs" in valid_params:
         return kwargs
     else:
         return {k: v for k, v in kwargs.items() if k in valid_params}
 
+
 def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable:
     format_key = path.format
     try:

@@ -1420,6 +1417,7 @@ def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable:
 
     return reader_fn
 
+
 def load_dataset(path: ElaboratedPath, reader_fn: Callable, **kwargs) -> pd.DataFrame:
     dfs = []
     for filename in path.paths:

ads/dataset/recommendation_transformer.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
 
-# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from __future__ import print_function, absolute_import

@@ -131,7 +131,6 @@ def _get_recommendations(self, df):
         self.feature_metadata_[self.target_] = self.target_type_
 
         for column in df.columns.values[df.isnull().any()]:
-
             # filter out columns that were discovered as constant or primary key columns in the previous step,
             # as they would get dropped before imputation
             if (

@@ -246,10 +245,10 @@ def _get_recommendations(self, df):
         if not self.is_balanced and self.fix_imbalance:
             target_value_counts = df[self.target_].value_counts()
             minority_class_len = min(
-                target_value_counts.iteritems(), key=lambda k: k[1]
+                target_value_counts.items(), key=lambda k: k[1]
             )[1]
             majority_class_len = max(
-                target_value_counts.iteritems(), key=lambda k: k[1]
+                target_value_counts.items(), key=lambda k: k[1]
             )[1]
             minor_majority_ratio = minority_class_len / majority_class_len

ads/evaluations/evaluation_plot.py

Lines changed: 4 additions & 5 deletions
@@ -447,7 +447,7 @@ def _lift_and_gain_chart(cls, ax, evaluation):
 
     @classmethod
     def _lift_chart(cls, ax, evaluation):
-        for mod_name, col in evaluation.iteritems():
+        for mod_name, col in evaluation.items():
             if col["y_score"] is not None:
                 ax.plot(
                     col["percentages"][1:],

@@ -476,7 +476,7 @@ def _lift_chart(cls, ax, evaluation):
 
     @classmethod
     def _gain_chart(cls, ax, evaluation):
-        for mod_name, col in evaluation.iteritems():
+        for mod_name, col in evaluation.items():
             if col["y_score"] is not None:
                 ax.plot(
                     col["percentages"],

@@ -517,7 +517,7 @@ def _pr_curve(cls, axs, evaluation):
             ax.axis("off")
             return
         if cls.prob_type == "_bin":
-            for mod_name, col in evaluation.iteritems():
+            for mod_name, col in evaluation.items():
                 if col["y_score"] is not None:
                     ax.plot(
                         col["recall_values"],

@@ -589,7 +589,7 @@ def _roc_curve(cls, axs, evaluation):
             ax.axis("off")
             return
         if cls.prob_type == "_bin":
-            for mod_name, col in evaluation.iteritems():
+            for mod_name, col in evaluation.items():
                 if col["y_score"] is not None:
                     ax.plot(
                         col["false_positive_rate"],

@@ -803,7 +803,6 @@ def _pretty_scatter(
         label=None,
         plot_kwargs=None,
     ):
-
         if plot_kwargs is None:
             plot_kwargs = {}
         ax.scatter(x, y, s=s, label=label, marker="o", alpha=alpha, **plot_kwargs)
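
In these plotting helpers `evaluation` is iterated column by column, so this is the DataFrame flavour of the same migration: `DataFrame.iteritems()` is gone in pandas 2.0, and `DataFrame.items()` yields `(column_label, Series)` pairs on both 1.x and 2.x. A small illustration with a hypothetical stand-in for the evaluation frame (one column per model):

    import pandas as pd

    # Assumed shape only: one column per model, rows keyed by the artefacts
    # the plotting code looks up (y_score, percentages, ...).
    evaluation = pd.DataFrame(
        {"model_a": {"y_score": [0.9, 0.1]}, "model_b": {"y_score": None}}
    )

    # DataFrame.items() replaces the removed iteritems(): each iteration
    # yields the column label and that column as a Series.
    for mod_name, col in evaluation.items():
        print(mod_name, col["y_score"])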

ads/feature_engineering/accessor/dataframe_accessor.py

Lines changed: 1 addition & 1 deletion
@@ -218,7 +218,7 @@ def feature_type_description(self) -> pd.DataFrame:
         for col in self._obj:
             series_feature_type_df = self._obj[col].ads.feature_type_description
             series_feature_type_df.insert(0, "Column", col)
-            result_df = result_df.append(series_feature_type_df)
+            result_df = pd.concat([result_df, series_feature_type_df])
         result_df.reset_index(drop=True, inplace=True)
         return result_df
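
The other recurring change in this commit: `DataFrame.append()` was deprecated in pandas 1.4 and removed in 2.0, so the accessors now grow their result frames with `pd.concat()`. A minimal sketch of the pattern with made-up per-column description frames (the commit concatenates inside the loop; collecting the pieces and concatenating once, as below, is an equivalent variant that also avoids repeated copying):

    import pandas as pd

    # Hypothetical per-column frames; the accessor builds one per column
    # from each Series' feature_type_description.
    chunks = [
        pd.DataFrame({"Column": ["age"], "Feature Type": ["integer"]}),
        pd.DataFrame({"Column": ["name"], "Feature Type": ["string"]}),
    ]

    # pandas < 2.0:  result_df = result_df.append(chunk)  # removed in 2.0
    result_df = pd.concat(chunks)
    result_df.reset_index(drop=True, inplace=True)
    print(result_df)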

ads/feature_engineering/accessor/mixin/correlation.py

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 from __future__ import print_function, absolute_import

@@ -68,7 +68,7 @@ def _list_to_dataframe(
     correlation_matrix = correlation_matrix.loc[:, correlation_matrix.index]
     if normal_form:
         data = []
-        for (col1, col2), corr in correlation_matrix.stack().iteritems():
+        for (col1, col2), corr in correlation_matrix.stack().items():
             data.append([col1, col2, round(corr, 4)])
         return pd.DataFrame(data, columns=["Column 1", "Column 2", "Value"])
     else:

@@ -161,6 +161,6 @@ def cont_vs_cont(df: pd.DataFrame, normal_form: bool = True) -> pd.DataFrame:
     if not normal_form:
         return df.corr(method="pearson")
     data = []
-    for (col1, col2), corr in df.corr(method="pearson").stack().iteritems():
+    for (col1, col2), corr in df.corr(method="pearson").stack().items():
         data.append([col1, col2, round(corr, 4)])
     return pd.DataFrame(data, columns=["Column 1", "Column 2", "Value"])
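
Same migration applied to a stacked correlation matrix: `corr().stack()` produces a Series with a two-level index of column pairs, and iterating it with `items()` instead of the removed `iteritems()` yields `((col1, col2), value)` tuples. A self-contained sketch with arbitrary numeric data:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8], "z": [4, 3, 2, 1]})

    data = []
    # stack() flattens the correlation matrix into a MultiIndex Series;
    # items() (formerly iteritems()) yields ((col1, col2), corr) pairs.
    for (col1, col2), corr in df.corr(method="pearson").stack().items():
        data.append([col1, col2, round(corr, 4)])

    print(pd.DataFrame(data, columns=["Column 1", "Column 2", "Value"]))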

ads/feature_engineering/accessor/mixin/eda_mixin.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 """

@@ -262,5 +262,5 @@ def warning(self) -> pd.DataFrame:
             warning_df = self._obj[col].ads.warning()
             if warning_df is not None:
                 warning_df.insert(0, "Column", col)
-                result_df = result_df.append(warning_df)
+                result_df = pd.concat([result_df, warning_df])
         return result_df.reset_index(drop=True)

ads/feature_engineering/accessor/mixin/eda_mixin_series.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 """

@@ -80,6 +80,6 @@ def warning(self) -> pd.DataFrame:
             warning_df = feature_type.warning(self._obj)
             if warning_df is not None:
                 warning_df.insert(0, "Feature Type", feature_type.name)
-                result_df = result_df.append(warning_df)
+                result_df = pd.concat([result_df, warning_df])
         result_df.reset_index(drop=True, inplace=True)
         return result_df

ads/feature_engineering/accessor/mixin/feature_types_mixin.py

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 """

@@ -115,14 +115,14 @@ def warning_registered(self) -> pd.DataFrame:
             for col in self._obj.columns:
                 feature_type_df = self._obj[col].ads.warning_registered()
                 feature_type_df.insert(0, "Column", col)
-                result_df = result_df.append(feature_type_df)
+                result_df = pd.concat([result_df, feature_type_df])
         else:
             result_df = pd.DataFrame((), columns=common_columns)
             for feature_type in self._feature_type:
                 feature_type_df = feature_type.warning.registered()
                 feature_type_df.insert(0, "Feature Type", feature_type.name)
                 feature_type_df = feature_type_df.rename(columns={"Name": "Warning"})
-                result_df = result_df.append(feature_type_df)
+                result_df = pd.concat([result_df, feature_type_df])
         result_df.reset_index(drop=True, inplace=True)
         return result_df

@@ -155,14 +155,14 @@ def validator_registered(self) -> pd.DataFrame:
             for col in self._obj.columns:
                 feature_type_df = self._obj[col].ads.validator_registered()
                 feature_type_df.insert(0, "Column", col)
-                result_df = result_df.append(feature_type_df)
+                result_df = pd.concat([result_df, feature_type_df])
         else:
             result_df = pd.DataFrame((), columns=common_columns)
             for feature_type in self._feature_type:
                 feature_type_df = feature_type.validator.registered()
                 feature_type_df.insert(0, "Feature Type", feature_type.name)
                 feature_type_df = feature_type_df.rename(columns={"Name": "Validator"})
-                result_df = result_df.append(feature_type_df)
+                result_df = pd.concat([result_df, feature_type_df])
         result_df.reset_index(drop=True, inplace=True)
         return result_df

ads/feature_engineering/feature_type/creditcard.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*--
 
-# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 """

@@ -198,6 +198,7 @@ def feature_stat(x: pd.Series):
         df_stat = _count_unique_missing(x)
         card_types = x.apply(assign_issuer)
         value_counts = card_types.value_counts()
+        value_counts.rename("creditcard", inplace=True)
         value_counts.index = [
             "count_" + cardtype for cardtype in list(value_counts.index)
         ]
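
The added `rename` guards against a behaviour change in pandas 2.0: `Series.value_counts()` now returns a Series named "count" (the original name moves to the index) rather than one carrying the column's own name, so pinning the name explicitly keeps the downstream stats frame labelled the same way on 1.x and 2.x. A small sketch of the effect, under that assumption:

    import pandas as pd

    s = pd.Series(["visa", "visa", "amex"], name="creditcard")

    counts = s.value_counts()
    # pandas < 2.0 names this result after the original series ("creditcard");
    # pandas >= 2.0 names it "count" and moves the original name to the index.
    counts.rename("creditcard", inplace=True)  # same label on either version

    counts.index = ["count_" + card_type for card_type in counts.index]
    print(counts)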
