
Commit c389686

adding series_id in inliers, outliers
1 parent 9b9b33c commit c389686

File tree

4 files changed: 54 additions, 27 deletions

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

Lines changed: 8 additions & 15 deletions

@@ -49,14 +49,17 @@ def __init__(self, spec: AnomalyOperatorSpec):
         The anomaly operator spec.
         """
         self._data = AnomalyData(spec)
-        self.data_with_all_cols = self._data.get_data_with_all_cols()
         self.data = self._data.get_data_long()
         self.full_data_dict = self._data.get_dict_by_series()
         if spec.validation_data is not None:
             self.valid_data = ValidationData(spec)
             self.X_valid_dict = self.valid_data.X_valid_dict
             self.y_valid_dict = self.valid_data.y_valid_dict
 
+    # Returns raw data based on the series_id, i.e. the merged target_category_column value
+    def get_raw_data_by_cat(self, category):
+        return self._data.get_raw_data_by_cat(category)
+
 
 
 class AnomalyOutput:
     def __init__(self, date_column):

@@ -95,38 +98,28 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
         outliers = pd.merge(outliers, scores, on=self.date_column, how="inner")
         return outliers
 
-    def get_inliers(self, data):
+    def get_inliers(self, datasets):
         inliers = pd.DataFrame()
 
         for category in self.list_categories():
             inliers = pd.concat(
                 [
                     inliers,
-                    self.get_inliers_by_cat(
-                        category,
-                        data[data[OutputColumns.Series] == category]
-                        .reset_index(drop=True)
-                        .drop(OutputColumns.Series, axis=1),
-                    ),
+                    self.get_inliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
                 ],
                 axis=0,
                 ignore_index=True,
             )
         return inliers
 
-    def get_outliers(self, data):
+    def get_outliers(self, datasets):
         outliers = pd.DataFrame()
 
         for category in self.list_categories():
            outliers = pd.concat(
                [
                    outliers,
-                   self.get_outliers_by_cat(
-                       category,
-                       data[data[OutputColumns.Series] == category]
-                       .reset_index(drop=True)
-                       .drop(OutputColumns.Series, axis=1),
-                   ),
+                   self.get_outliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
                ],
                axis=0,
                ignore_index=True,
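
Taken together, get_inliers and get_outliers no longer slice a combined long-format frame on the OutputColumns.Series column; they ask the datasets object for the raw rows of each series. The following is a minimal standalone sketch of that shift, using toy column names and a toy dataset wrapper rather than the operator's actual classes.

# Minimal sketch (not the operator's code): the old path filtered a merged
# long-format frame on a "Series" column, while the new path asks a dataset
# object for the raw rows of one series. All names here are illustrative.
import pandas as pd

data = pd.DataFrame(
    {
        "Date": pd.to_datetime(["2024-01-01", "2024-01-01", "2024-01-02"]),
        "Series": ["A__1", "B__2", "A__1"],
        "target": [10.0, 20.0, 11.0],
    }
)

# Old-style per-category slice of the merged frame:
old_slice = (
    data[data["Series"] == "A__1"]
    .reset_index(drop=True)
    .drop("Series", axis=1)
)

# New-style: a dataset wrapper resolves the category back to its raw rows.
class ToyDatasets:
    def __init__(self, raw):
        self.raw = raw

    def get_raw_data_by_cat(self, category):
        return self.raw[self.raw["Series"] == category].reset_index(drop=True)

new_slice = ToyDatasets(data).get_raw_data_by_cat("A__1")
print(old_slice)
print(new_slice)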

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 2 additions & 2 deletions

@@ -272,15 +272,15 @@ def _save_report(
                 f2.write(f1.read())
 
         if self.spec.generate_inliers:
-            inliers = anomaly_output.get_inliers(self.datasets.data_with_all_cols)
+            inliers = anomaly_output.get_inliers(self.datasets)
             write_data(
                 data=inliers,
                 filename=os.path.join(unique_output_dir, self.spec.inliers_filename),
                 format="csv",
                 storage_options=storage_options,
             )
 
-        outliers = anomaly_output.get_outliers(self.datasets.data_with_all_cols)
+        outliers = anomaly_output.get_outliers(self.datasets)
         write_data(
             data=outliers,
             filename=os.path.join(unique_output_dir, self.spec.outliers_filename),
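
For context, after this change the report step hands the whole datasets wrapper to get_inliers/get_outliers and then writes each frame out. Below is a rough sketch of that CSV write using plain pandas with a hypothetical write_csv helper standing in for ADS's write_data; the helper name and arguments are assumptions, not the library's API.

# Hypothetical stand-in for the write_data call shown in the diff: write a
# DataFrame to CSV, forwarding storage_options so the same call works for
# local paths and fsspec-backed object storage URIs.
import os
import pandas as pd

def write_csv(df: pd.DataFrame, output_dir: str, filename: str, storage_options=None):
    df.to_csv(
        os.path.join(output_dir, filename),
        index=False,
        storage_options=storage_options,
    )

inliers = pd.DataFrame({"Date": ["2024-01-01"], "target": [10.0], "anomaly": [0]})
write_csv(inliers, ".", "inliers.csv")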

ads/opctl/operator/lowcode/common/data.py

Lines changed: 15 additions & 9 deletions

@@ -27,9 +27,18 @@ def __init__(self, spec: dict, name="input_data"):
         self.data_with_all_cols = None
         self.load_transform_ingest_data(spec)
 
-
-    def get_data_with_all_cols(self):
-        return self.data_with_all_cols.reset_index(drop=False)
+    def get_raw_data_by_cat(self, category):
+        import pandas as pd
+        mapping = self._data_transformer.get_target_category_columns_map()
+        # For a given category, the mapping gives the target_category_columns and their values.
+        # The condition filters raw_data on those target_category_columns values for the given category.
+        condition = pd.Series(True, index=self.raw_data.index)
+        if category in mapping:
+            for col, val in mapping[category].items():
+                condition &= (self.raw_data[col] == val)
+        data_by_cat = self.raw_data[condition].reset_index(drop=True)
+        data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
+        return data_by_cat
 
 
     def get_dict_by_series(self):

@@ -71,19 +80,16 @@ def _load_data(self, data_spec, **kwargs):
     def _transform_data(self, spec, raw_data, **kwargs):
         transformation_start_time = time.time()
         self._data_transformer = self.Transformations(spec, name=self.name)
-        self.data_with_all_cols = self._data_transformer.run(raw_data)
-        data = self.data_with_all_cols
-        if spec.target_category_columns:
-            data = data.drop(spec.target_category_columns, axis=1)
+        data = self._data_transformer.run(raw_data)
         transformation_end_time = time.time()
         logger.info(
             f"{self.name} transformations completed in {transformation_end_time - transformation_start_time} seconds"
         )
         return data
 
     def load_transform_ingest_data(self, spec):
-        raw_data = self._load_data(getattr(spec, self.name))
-        self.data = self._transform_data(spec, raw_data)
+        self.raw_data = self._load_data(getattr(spec, self.name))
+        self.data = self._transform_data(spec, self.raw_data)
         self._ingest_data(spec)
 
     def _ingest_data(self, spec):
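
The new get_raw_data_by_cat builds a boolean mask by ANDing one equality check per target category column, then filters the raw frame with it. Here is a self-contained sketch of that masking pattern with toy data; the column names and mapping are illustrative, not the operator's.

# Standalone sketch of the masking pattern used by get_raw_data_by_cat.
# The real method obtains the mapping from the Transformations object and
# also re-formats the datetime column afterwards.
import pandas as pd

raw_data = pd.DataFrame(
    {
        "PPG_Code": ["P1", "P1", "P2"],
        "Class": ["A", "B", "A"],
        "target": [1.0, 2.0, 3.0],
    }
)
mapping = {"P1__A": {"PPG_Code": "P1", "Class": "A"}}

category = "P1__A"
condition = pd.Series(True, index=raw_data.index)
if category in mapping:
    for col, val in mapping[category].items():
        condition &= raw_data[col] == val

# Keeps only the row(s) where PPG_Code == "P1" and Class == "A".
print(raw_data[condition].reset_index(drop=True))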

ads/opctl/operator/lowcode/common/transformations.py

Lines changed: 29 additions & 1 deletion

@@ -78,14 +78,20 @@ def _remove_trailing_whitespace(self, df):
         return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
 
     def _set_series_id_column(self, df):
+        self._target_category_columns_map = dict()
         if not self.target_category_columns:
             df[DataColumns.Series] = "Series 1"
             self.has_artificial_series = True
         else:
             df[DataColumns.Series] = merge_category_columns(
                 df, self.target_category_columns
             )
-            #df = df.drop(self.target_category_columns, axis=1)
+            merged_values = df[DataColumns.Series].unique().tolist()
+            if self.target_category_columns:
+                for value in merged_values:
+                    self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
+
+            df = df.drop(self.target_category_columns, axis=1)
         return df
 
     def _format_datetime_col(self, df):

@@ -189,3 +195,25 @@ def _check_historical_dataset(self, df):
             raise DataMismatchError(
                 f"Expected {self.name} to have columns: {expected_names}, but instead found column names: {df.columns}. Is the {self.name} path correct?"
             )
+
+    """
+    Map between merged target category column values and the target category columns with their values.
+    If the target category columns are PPG_Code, Class, Num
+    and the merged target category column values are Product Category 1__A__1, Product Category 2__A__2,
+    then target_category_columns_map would be
+    {
+        "Product Category 1__A__1": {
+            "PPG_Code": "Product Category 1",
+            "Class": "A",
+            "Num": 1
+        },
+        "Product Category 2__A__2": {
+            "PPG_Code": "Product Category 2",
+            "Class": "A",
+            "Num": 2
+        },
+    }
+    """
+    def get_target_category_columns_map(self):
+        return self._target_category_columns_map
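
_set_series_id_column now also records, for every merged series id, the original target category column values that produced it. Below is a toy sketch of that map-building step; merge_category_columns is approximated here by a simple "__" join, which matches the separator shown in the docstring example but is an assumption of this sketch, not the library's implementation.

# Toy sketch of building the merged-series-id -> original-column-values map.
import pandas as pd

df = pd.DataFrame(
    {
        "PPG_Code": ["Product Category 1", "Product Category 2"],
        "Class": ["A", "A"],
        "Num": [1, 2],
        "target": [10.0, 20.0],
    }
)
target_category_columns = ["PPG_Code", "Class", "Num"]

# Approximation of merge_category_columns: join the category values per row.
df["Series"] = df[target_category_columns].astype(str).agg("__".join, axis=1)

# For each merged value, keep the first matching row's original column values.
target_category_columns_map = {}
for value in df["Series"].unique().tolist():
    target_category_columns_map[value] = (
        df[df["Series"] == value][target_category_columns]
        .drop_duplicates()
        .iloc[0]
        .to_dict()
    )

print(target_category_columns_map)
# {'Product Category 1__A__1': {'PPG_Code': 'Product Category 1', 'Class': 'A', 'Num': 1}, ...}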
