Skip to content

Commit 3b9cc48

Browse files
authored
Ad/add support for valid data (#563)
2 parents f195ea6 + 043e89d commit 3b9cc48

File tree

20 files changed

+553
-467
lines changed

20 files changed

+553
-467
lines changed

ads/opctl/operator/common/operator_config.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,32 @@
88
import json
99
from abc import abstractmethod
1010
from dataclasses import dataclass
11-
from typing import Any, Dict
11+
from typing import Any, Dict, List
1212

1313
from ads.common.serializer import DataClassSerializable
1414

1515
from ads.opctl.operator.common.utils import OperatorValidator
1616
from ads.opctl.operator.common.errors import InvalidParameterError
1717

18+
@dataclass(repr=True)
class InputData(DataClassSerializable):
    """Input data section of an operator specification.

    Every field is optional and defaults to ``None``. The declaration order
    below is also the positional order of the generated ``__init__``, so it
    must be kept stable.
    """

    # NOTE(review): the meaning of each field is not visible in this module.
    # Presumably ``url``/``format``/``columns`` describe a file-like source and
    # ``sql``/``table_name``/``connect_args`` a database source -- confirm
    # against the data loader before relying on this.
    connect_args: Dict = None
    format: str = None
    columns: List[str] = None
    url: str = None
    filters: List[str] = None
    options: Dict = None
    limit: int = None
    sql: str = None
    table_name: str = None
31+
32+
33+
@dataclass(repr=True)
class OutputDirectory(InputData):
    """Output directory section of an operator specification.

    Inherits every field of ``InputData`` unchanged and declares none of its
    own.
    """
36+
1837

1938
@dataclass(repr=True)
2039
class OperatorConfig(DataClassSerializable):

ads/opctl/operator/lowcode/anomaly/__main__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ def operate(operator_config: AnomalyOperatorConfig) -> None:
2424
from .model.factory import AnomalyOperatorModelFactory
2525

2626
datasets = AnomalyDatasets(operator_config.spec)
27-
datasets2 = AnomalyData(operator_config.spec)
28-
print(f"d1: {datasets.data}\n\n d2: {datasets2.data}")
2927
AnomalyOperatorModelFactory.get_model(operator_config, datasets).generate_report()
3028

3129

ads/opctl/operator/lowcode/anomaly/environment.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ dependencies:
88
- datapane
99
- cerberus
1010
- oracle-automlx==23.2.3
11-
- "git+https://github.com/datamllab/tods.git"
12-
- "git+https://github.com/oracle/accelerated-data-science.git@feature/anomaly#egg=oracle-ads"
11+
- "oracle-ads[anomaly]"

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

Lines changed: 25 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,18 @@ def __init__(self, spec: AnomalyOperatorSpec):
2828
super().__init__(spec=spec, name="test_data")
2929

3030

31+
class ValidationData(AbstractData):
    """Validation dataset split, per series, into features and anomaly labels."""

    def __init__(self, spec: AnomalyOperatorSpec):
        """Delegate to the base initializer with ``name="validation_data"``."""
        super().__init__(spec=spec, name="validation_data")

    def _ingest_data(self, spec):
        """Populate ``X_valid_dict`` and ``y_valid_dict``, keyed by series id.

        For every ``(series id, frame)`` pair returned by
        ``get_dict_by_series()``, the ``OutputColumns.ANOMALY_COL`` column is
        stored as the label and the frame without that column as the features.

        ``spec`` is accepted but not used by this implementation.
        NOTE(review): presumably invoked by ``AbstractData`` during
        initialization -- confirm against the base class.
        """
        # Create the attributes before iterating, so they exist even if the
        # per-series split below raises part-way through.
        self.X_valid_dict = features = {}
        self.y_valid_dict = labels = {}

        per_series = self.get_dict_by_series()
        for series_id, frame in per_series.items():
            features[series_id] = frame.drop([OutputColumns.ANOMALY_COL], axis=1)
            labels[series_id] = frame[OutputColumns.ANOMALY_COL]
41+
42+
3143
class AnomalyDatasets:
3244
def __init__(self, spec: AnomalyOperatorSpec):
3345
"""Instantiates the DataIO instance.
@@ -39,63 +51,23 @@ def __init__(self, spec: AnomalyOperatorSpec):
3951
"""
4052
self._data = AnomalyData(spec)
4153
self.data = self._data.get_data_long()
42-
# self.test_data = None
43-
# self.target_columns = None
4454
self.full_data_dict = self._data.get_dict_by_series()
45-
# self._load_data(spec)
46-
47-
# def _load_data(self, spec):
48-
# """Loads anomaly input data."""
49-
# try:
50-
# self.data = load_data(
51-
# filename=spec.input_data.url,
52-
# format=spec.input_data.format,
53-
# columns=spec.input_data.columns,
54-
# )
55-
# except InvalidParameterError as e:
56-
# e.args = e.args + ("Invalid Parameter: input_data",)
57-
# raise e
58-
# date_col = spec.datetime_column.name
59-
# self.data[date_col] = pd.to_datetime(self.data[date_col])
60-
# try:
61-
# spec.freq = get_frequency_of_datetime(self.data, spec)
62-
# except TypeError as e:
63-
# logger.warn(
64-
# f"Error determining frequency: {e.args}. Setting Frequency to None"
65-
# )
66-
# logger.debug(f"Full traceback: {e}")
67-
# spec.freq = None
68-
69-
# if spec.target_category_columns is None:
70-
# if spec.target_column is None:
71-
# target_col = [
72-
# col
73-
# for col in self.data.columns
74-
# if col not in [spec.datetime_column.name]
75-
# ]
76-
# spec.target_column = target_col[0]
77-
# self.full_data_dict = {spec.target_column: self.data}
78-
# else:
79-
# # Merge target category columns
80-
81-
# self.data[OutputColumns.Series] = merge_category_columns(
82-
# self.data, spec.target_category_columns
83-
# )
84-
# unique_categories = self.data[OutputColumns.Series].unique()
85-
# self.full_data_dict = dict()
86-
87-
# for cat in unique_categories:
88-
# data_by_cat = self.data[self.data[OutputColumns.Series] == cat].drop(
89-
# spec.target_category_columns + [OutputColumns.Series], axis=1
90-
# )
91-
# self.full_data_dict[cat] = data_by_cat
55+
if spec.validation_data is not None:
56+
self.valid_data = ValidationData(spec)
57+
self.X_valid_dict = self.valid_data.X_valid_dict
58+
self.y_valid_dict = self.valid_data.y_valid_dict
9259

9360

9461
class AnomalyOutput:
9562
def __init__(self, date_column):
9663
self.category_map = dict()
9764
self.date_column = date_column
9865

66+
def list_categories(self):
    """Return the keys of ``category_map`` in ascending order.

    Sorting gives callers an iteration order that does not depend on the order
    in which categories were inserted into ``category_map``.

    Returns:
        list: A new sorted list of category keys; ``category_map`` itself is
        left untouched.
    """
    # Iterating a dict yields its keys, so ``sorted`` replaces the manual
    # ``list(keys())`` + in-place ``sort()`` pair.
    return sorted(self.category_map)
70+
9971
def add_output(self, category: str, anomalies: pd.DataFrame, scores: pd.DataFrame):
10072
self.category_map[category] = (anomalies, scores)
10173

@@ -126,7 +98,7 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
12698
def get_inliers(self, data):
12799
inliers = pd.DataFrame()
128100

129-
for category in self.category_map.keys():
101+
for category in self.list_categories():
130102
inliers = pd.concat(
131103
[
132104
inliers,
@@ -145,7 +117,7 @@ def get_inliers(self, data):
145117
def get_outliers(self, data):
146118
outliers = pd.DataFrame()
147119

148-
for category in self.category_map.keys():
120+
for category in self.list_categories():
149121
outliers = pd.concat(
150122
[
151123
outliers,
@@ -163,10 +135,10 @@ def get_outliers(self, data):
163135

164136
def get_scores(self, target_category_columns):
165137
if target_category_columns is None:
166-
return self.get_scores_by_cat(list(self.category_map.keys())[0])
138+
return self.get_scores_by_cat(self.list_categories()[0])
167139

168140
scores = pd.DataFrame()
169-
for category in self.category_map.keys():
141+
for category in self.list_categories():
170142
score = self.get_scores_by_cat(category)
171143
score[target_category_columns[0]] = category
172144
scores = pd.concat([scores, score], axis=0, ignore_index=True)

ads/opctl/operator/lowcode/anomaly/model/automlx.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,23 @@ def _build_model(self) -> pd.DataFrame:
2727
date_column = self.spec.datetime_column.name
2828
anomaly_output = AnomalyOutput(date_column=date_column)
2929

30+
time_budget = self.spec.model_kwargs.pop("time_budget", -1)
3031
# Iterate over the full_data_dict items
3132
for target, df in self.datasets.full_data_dict.items():
32-
est = automl.Pipeline(task="anomaly_detection")
33-
est.fit(df, y=None)
33+
est = automl.Pipeline(task="anomaly_detection", **self.spec.model_kwargs)
34+
est.fit(
35+
X=df,
36+
X_valid=self.X_valid_dict[target]
37+
if self.X_valid_dict is not None
38+
else None,
39+
y_valid=self.y_valid_dict[target]
40+
if self.y_valid_dict is not None
41+
else None,
42+
time_budget=time_budget,
43+
contamination=self.spec.contamination
44+
if self.y_valid_dict is not None
45+
else None,
46+
)
3447
y_pred = est.predict(df)
3548
scores = est.predict_proba(df)
3649

ads/opctl/operator/lowcode/anomaly/model/autots.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,24 @@
1414

1515

1616
class AutoTSOperatorModel(AnomalyOperatorBaseModel):
17-
"""Class representing TODS Anomaly Detection operator model."""
17+
"""Class representing AutoTS Anomaly Detection operator model."""
1818

1919
@runtime_dependency(
2020
module="autots",
2121
err_msg=(
2222
"Please run `pip3 install autots` to "
23-
"install the required dependencies for TODS."
23+
"install the required dependencies for AutoTS."
2424
),
2525
)
2626
def _build_model(self) -> AnomalyOutput:
2727
from autots.evaluator.anomaly_detector import AnomalyDetector
2828

2929
method = self.spec.model_kwargs.get("method")
30+
transform_dict = self.spec.model_kwargs.get("transform_dict", {})
3031

3132
if method == "random" or method == "deep" or method == "fast":
3233
new_params = AnomalyDetector.get_new_params(method=method)
33-
new_params.pop("transform_dict")
34+
transform_dict = new_params.pop("transform_dict")
3435

3536
for key, value in new_params.items():
3637
self.spec.model_kwargs[key] = value
@@ -39,7 +40,12 @@ def _build_model(self) -> AnomalyOutput:
3940
self.spec.model_kwargs["output"] = "univariate"
4041

4142
if "transform_dict" not in self.spec.model_kwargs:
42-
self.spec.model_kwargs["transform_dict"] = {}
43+
self.spec.model_kwargs["transform_dict"] = transform_dict
44+
45+
# Forward a non-default contamination to the detector's ``method_params``.
# ``setdefault`` is required here: ``get("method_params", {})`` returns a
# throwaway dict when the key is missing, so the assignment would be lost and
# the configured contamination would never reach ``AnomalyDetector``.
if self.spec.contamination != 0.1:  # TODO: remove hard-coding
    self.spec.model_kwargs.setdefault("method_params", {})[
        "contamination"
    ] = self.spec.contamination
4349

4450
model = AnomalyDetector(**self.spec.model_kwargs)
4551

0 commit comments

Comments
 (0)