Skip to content

Commit ecd417d

Browse files
authored
Feature/anomaly detect (#546)
2 parents 7265aac + 8c1f1a9 commit ecd417d

File tree

8 files changed

+54
-66
lines changed

8 files changed

+54
-66
lines changed

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,18 @@ def _load_data(self, spec):
5959
spec.target_column = target_col[0]
6060
self.full_data_dict = {spec.target_column: self.data}
6161
else:
62-
# Group the data by target column
63-
self.full_data_dict = dict(
64-
tuple(
65-
(group, df.reset_index(drop=True))
66-
for group, df in self.data.groupby(spec.target_category_columns[0])
62+
# Merge target category columns
63+
64+
self.data["__Series__"] = utils._merge_category_columns(self.data, spec.target_category_columns)
65+
unique_categories = self.data["__Series__"].unique()
66+
self.full_data_dict = dict()
67+
68+
for cat in unique_categories:
69+
data_by_cat = (
70+
self.data[self.data["__Series__"] == cat].drop(spec.target_category_columns + ["__Series__"],
71+
axis=1)
6772
)
68-
)
73+
self.full_data_dict[cat] = data_by_cat
6974

7075

7176
class AnomalyOutput:
@@ -108,25 +113,32 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
108113
how='inner')
109114
return outliers
110115

111-
def get_inliers(self, full_data_dict):
116+
def get_inliers(self, data):
112117
inliers = pd.DataFrame()
113118

114119
for category in self.category_map.keys():
115120
inliers = pd.concat(
116-
[inliers, self.get_inliers_by_cat(category, full_data_dict[category])],
121+
[
122+
inliers,
123+
self.get_inliers_by_cat(
124+
category, data[data['__Series__'] == category].reset_index(drop=True).drop('__Series__', axis=1)
125+
)
126+
],
117127
axis=0,
118128
ignore_index=True,
119129
)
120130
return inliers
121131

122-
def get_outliers(self, full_data_dict):
132+
def get_outliers(self, data):
123133
outliers = pd.DataFrame()
124134

125135
for category in self.category_map.keys():
126136
outliers = pd.concat(
127137
[
128138
outliers,
129-
self.get_outliers_by_cat(category, full_data_dict[category]),
139+
self.get_outliers_by_cat(
140+
category, data[data['__Series__'] == category].reset_index(drop=True).drop('__Series__', axis=1)
141+
)
130142
],
131143
axis=0,
132144
ignore_index=True,

ads/opctl/operator/lowcode/anomaly/model/automlx.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@ def _build_model(self) -> pd.DataFrame:
4242
anomaly = pd.DataFrame({
4343
date_column: df[date_column],
4444
OutputColumns.ANOMALY_COL: y_pred
45-
})
45+
}).reset_index(drop=True)
4646
score = pd.DataFrame({
4747
date_column: df[date_column],
4848
OutputColumns.SCORE_COL: [item[1] for item in scores]
49-
})
49+
}).reset_index(drop=True)
5050
anomaly_output.add_output(target, anomaly, score)
5151

5252
return anomaly_output

ads/opctl/operator/lowcode/anomaly/model/autots.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,14 @@ def _build_model(self) -> AnomalyOutput:
5454
for target, df in full_data_dict.items():
5555
data = df.set_index(date_column)
5656

57-
if self.spec.target_category_columns is not None:
58-
data = data.drop(self.spec.target_category_columns[0], axis=1)
59-
6057
(anomaly, score) = model.detect(data)
6158

6259
if len(anomaly.columns) == 1:
6360
score.rename(
6461
columns={score.columns.values[0]: OutputColumns.SCORE_COL},
6562
inplace=True,
6663
)
67-
score = 1-score
64+
score = 1 - score
6865
score = score.reset_index(drop=False)
6966

7067
col = anomaly.columns.values[0]

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,6 @@ def generate_report(self):
6969

7070
blocks = []
7171
for target, df in self.datasets.full_data_dict.items():
72-
if self.spec.target_category_columns is not None:
73-
df = df.drop(columns=[self.spec.target_category_columns[0]])
7472
figure_blocks = []
7573
time_col = df[date_column].reset_index(drop=True)
7674
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
@@ -179,12 +177,12 @@ def _validation_data_evaluate_metrics(self, anomaly_output, filename, elapsed_ti
179177

180178
if data.empty:
181179
return total_metrics, summary_metrics, None
182-
180+
data["__Series__"] = utils._merge_category_columns(data, self.spec.target_category_columns)
183181
for cat in anomaly_output.category_map:
184182
output = anomaly_output.category_map[cat][0]
185183
date_col = self.spec.datetime_column.name
186184

187-
val_data = data[data[self.spec.target_category_columns[0]] == cat]
185+
val_data = data[data["__Series__"] == cat]
188186
val_data[date_col] = pd.to_datetime(val_data[date_col])
189187

190188
dates = output[output[date_col].isin(val_data[date_col])][date_col]
@@ -291,15 +289,15 @@ def _save_report(
291289
f2.write(f1.read())
292290

293291
if self.spec.generate_inliers:
294-
inliers = anomaly_output.get_inliers(self.datasets.full_data_dict)
292+
inliers = anomaly_output.get_inliers(self.datasets.data)
295293
utils._write_data(
296294
data=inliers,
297295
filename=os.path.join(output_dir, self.spec.inliers_filename),
298296
format="csv",
299297
storage_options=storage_options,
300298
)
301299

302-
outliers=anomaly_output.get_outliers(self.datasets.full_data_dict)
300+
outliers=anomaly_output.get_outliers(self.datasets.data)
303301
utils._write_data(
304302
data=outliers,
305303
filename=os.path.join(output_dir, self.spec.outliers_filename),

ads/opctl/operator/lowcode/anomaly/model/tods.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -80,17 +80,6 @@ def _build_model(self) -> pd.DataFrame:
8080

8181
# Store the model and predictions in dictionaries
8282
models[target] = model
83-
if self.spec.target_category_columns is None:
84-
dataset.data[OutputColumns.ANOMALY_COL] = predictions_train[target]
85-
else:
86-
dataset.data.loc[
87-
dataset.data[self.spec.target_category_columns[0]] == target,
88-
OutputColumns.ANOMALY_COL,
89-
] = predictions_train[target]
90-
91-
self.datasets.full_data_dict[target][
92-
OutputColumns.ANOMALY_COL
93-
] = predictions_train[target]
9483

9584
anomaly = pd.DataFrame({
9685
date_column: df[date_column],

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,13 @@ spec:
282282
meta:
283283
description: "The model to be used for anomaly detection"
284284

285+
contamination:
286+
required: false
287+
default: 0.1
288+
type: float
289+
meta:
290+
description: "Fraction of training dataset corresponding to anomalies (between 0.0 and 0.5)"
291+
285292
model_kwargs:
286293
type: dict
287294
required: false
@@ -293,33 +300,4 @@ spec:
293300
meta:
294301
description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
295302

296-
time_budget:
297-
type: float
298-
required: false
299-
default: 0
300-
meta:
301-
description: "Time budget for optimization in seconds, defaults to 0 which means no limit & optimization continues till convergence."
302-
303-
training_ratio:
304-
type: float
305-
required: false
306-
default: 0.7
307-
meta:
308-
description: "Ratio of data to be used for training, rest of the data is used for validation. defaults to 0.7"
309-
310-
false_alarm_ratio:
311-
type: float
312-
required: false
313-
default: 0.01
314-
meta:
315-
description: "False alarm ratio is the ratio of the expected anomaly in the train dataset, defaults to 0.01"
316-
317-
metric:
318-
type: string
319-
required: false
320-
default: F1_MACRO
321-
allowed:
322-
- F1_MACRO
323-
- unsupervised_unify95
324-
- unsupervised_unify95_log_loss
325303
type: dict

ads/opctl/operator/lowcode/anomaly/utils.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
#!/usr/bin/env python
23
# -*- coding: utf-8 -*--
34

@@ -9,7 +10,7 @@
910
import fsspec
1011
from .operator_config import AnomalyOperatorSpec
1112
from .const import SupportedMetrics
12-
13+
from ads.opctl import logger
1314

1415
def _build_metrics_df(y_true, y_pred, column_name):
1516
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix, \
@@ -19,12 +20,18 @@ def _build_metrics_df(y_true, y_pred, column_name):
1920
metrics[SupportedMetrics.PRECISION] = precision_score(y_true, y_pred)
2021
metrics[SupportedMetrics.ACCURACY] = accuracy_score(y_true, y_pred)
2122
metrics[SupportedMetrics.F1_SCORE] = f1_score(y_true, y_pred)
22-
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
23+
tn, *fn_fp_tp = confusion_matrix(y_true, y_pred).ravel()
24+
fp, fn, tp = fn_fp_tp if fn_fp_tp else (0, 0, 0)
2325
metrics[SupportedMetrics.FP] = fp
2426
metrics[SupportedMetrics.FN] = fn
2527
metrics[SupportedMetrics.TP] = tp
2628
metrics[SupportedMetrics.TN] = tn
27-
metrics[SupportedMetrics.ROC_AUC] = roc_auc_score(y_true, y_pred)
29+
try:
30+
# Throws exception if y_true has only one class
31+
metrics[SupportedMetrics.ROC_AUC] = roc_auc_score(y_true, y_pred)
32+
except Exception as e:
33+
logger.warn(f"An exception occurred: {e}")
34+
metrics[SupportedMetrics.ROC_AUC] = None
2835
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
2936
metrics[SupportedMetrics.PRC_AUC] = auc(recall, precision)
3037
metrics[SupportedMetrics.MCC] = matthews_corrcoef(y_true, y_pred)
@@ -64,6 +71,12 @@ def _write_data(data, filename, format, storage_options, index=False, **kwargs):
6471
)
6572
raise ValueError(f"Unrecognized format: {format}")
6673

74+
def _merge_category_columns(data, target_category_columns):
75+
result = data.apply(
76+
lambda x: "__".join([str(x[col]) for col in target_category_columns]), axis=1
77+
)
78+
return result if not result.empty else pd.Series([], dtype=str)
79+
6780

6881
def get_frequency_of_datetime(data: pd.DataFrame, dataset_info: AnomalyOperatorSpec):
6982
"""

tests/operators/anomaly/test_anomaly_simple.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
MODELS = ["automlx", "autots"] # , "auto", "tods",
1818

19-
19+
# Mandatory YAML parameters
2020
TEMPLATE_YAML = {
2121
"kind": "operator",
2222
"type": "anomaly",
@@ -126,6 +126,7 @@ def test_artificial_small(model):
126126
yaml_i["spec"]["model"] = model
127127
yaml_i["spec"]["input_data"]["url"] = input_data
128128
yaml_i["spec"]["output_directory"]["url"] = output_dirname
129+
yaml_i["spec"]["contamination"] = 0.3
129130

130131
with open(anomaly_yaml_filename, "w") as f:
131132
f.write(yaml.dump(yaml_i))

0 commit comments

Comments
 (0)