Skip to content

Commit 8b5c699

Browse files
committed
update the todos, complete implementation
1 parent 8dbccae commit 8b5c699

File tree

2 files changed

+106
-91
lines changed

2 files changed

+106
-91
lines changed

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 51 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
5555
def generate_report(self):
5656
"""Generates the report."""
5757
import matplotlib.pyplot as plt
58+
plt.rcParams.update({'figure.max_open_warning': 0})
5859
import report_creator as rc
5960

6061
start_time = time.time()
@@ -87,43 +88,57 @@ def generate_report(self):
8788
self.spec.datetime_column.name if self.spec.datetime_column else "index"
8889
)
8990

91+
(
92+
model_description,
93+
other_sections,
94+
) = self._generate_report()
95+
9096
blocks = []
9197
for target, df in self.datasets.full_data_dict.items():
92-
figure_blocks = []
93-
time_col = df[date_column].reset_index(drop=True)
94-
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
95-
OutputColumns.ANOMALY_COL
96-
]
97-
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
98-
downsampled_time_col = time_col
99-
selected_indices = list(range(len(time_col)))
100-
if self.spec.subsample_report_data:
101-
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
102-
# Downsample non-anomalous data if it exceeds the threshold (1000)
103-
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
104-
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
105-
selected_indices = anomaly_indices + downsampled_non_anomaly_indices
106-
selected_indices.sort()
107-
downsampled_time_col = time_col[selected_indices]
108-
109-
columns = set(df.columns).difference({date_column})
110-
for col in columns:
111-
y = df[col].reset_index(drop=True)
112-
113-
downsampled_y = y[selected_indices]
114-
115-
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
116-
ax.grid()
117-
ax.plot(downsampled_time_col, downsampled_y, color="black")
118-
# Plot anomalies
119-
for i in anomaly_indices:
120-
ax.scatter(time_col[i], y[i], color="red", marker="o")
121-
plt.xlabel(date_column)
122-
plt.ylabel(col)
123-
plt.title(f"`{col}` with reference to anomalies")
124-
figure_blocks.append(rc.Widget(ax))
125-
126-
blocks.append(rc.Group(*figure_blocks, label=target))
98+
if target in anomaly_output.list_categories():
99+
figure_blocks = []
100+
time_col = df[date_column].reset_index(drop=True)
101+
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
102+
OutputColumns.ANOMALY_COL
103+
]
104+
anomaly_indices = [
105+
i for i, index in enumerate(anomaly_col) if index == 1
106+
]
107+
downsampled_time_col = time_col
108+
selected_indices = list(range(len(time_col)))
109+
if self.spec.subsample_report_data:
110+
non_anomaly_indices = [
111+
i for i in range(len(time_col)) if i not in anomaly_indices
112+
]
113+
# Downsample non-anomalous data if it exceeds the threshold (1000)
114+
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
115+
downsampled_non_anomaly_indices = non_anomaly_indices[
116+
:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
117+
]
118+
selected_indices = (
119+
anomaly_indices + downsampled_non_anomaly_indices
120+
)
121+
selected_indices.sort()
122+
downsampled_time_col = time_col[selected_indices]
123+
124+
columns = set(df.columns).difference({date_column})
125+
for col in columns:
126+
y = df[col].reset_index(drop=True)
127+
128+
downsampled_y = y[selected_indices]
129+
130+
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
131+
ax.grid()
132+
ax.plot(downsampled_time_col, downsampled_y, color="black")
133+
# Plot anomalies
134+
for i in anomaly_indices:
135+
ax.scatter(time_col[i], y[i], color="red", marker="o")
136+
plt.xlabel(date_column)
137+
plt.ylabel(col)
138+
plt.title(f"`{col}` with reference to anomalies")
139+
figure_blocks.append(rc.Widget(ax))
140+
141+
blocks.append(rc.Group(*figure_blocks, label=target))
127142
plots = rc.Select(blocks)
128143

129144
report_sections = []
@@ -133,7 +148,7 @@ def generate_report(self):
133148
yaml_appendix = rc.Yaml(self.config.to_dict())
134149
summary = rc.Block(
135150
rc.Group(
136-
rc.Text(f"You selected the **`{self.spec.model}`** model."),
151+
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
137152
rc.Text(
138153
"Based on your dataset, you could have also selected "
139154
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."

ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py

Lines changed: 55 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pandas as pd
88

99
from ads.common.decorator.runtime_dependency import runtime_dependency
10+
from ads.opctl import logger
1011
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
1112

1213
from .anomaly_dataset import AnomalyOutput
@@ -29,68 +30,67 @@ def _build_model(self) -> AnomalyOutput:
2930
from rrcf import RCTree
3031

3132
model_kwargs = self.spec.model_kwargs
32-
# map the output as per anomaly dataset class, 1: outlier, 0: inlier
33-
# self.outlier_map = {1: 0, -1: 1}
3433

3534
anomaly_output = AnomalyOutput(date_column="index")
3635

3736
# Set tree parameters
3837
num_trees = model_kwargs.get("num_trees", 200)
39-
shingle_size = model_kwargs.get("shingle_size", 1)
40-
tree_size = model_kwargs.get("tree_size", 1000)
38+
shingle_size = model_kwargs.get("shingle_size", None)
39+
anamoly_threshold = model_kwargs.get("anamoly_threshold", 95)
4140

4241
for target, df in self.datasets.full_data_dict.items():
43-
df_values = df[self.spec.target_column].astype(float).values
44-
45-
# TODO: Update size to log logic
46-
points = np.vstack(list(rrcf.shingle(df_values, size=4)))
47-
48-
# TODO: remove hardcode
49-
sample_size_range = (1, 6)
50-
n = points.shape[0]
51-
avg_codisp = pd.Series(0.0, index=np.arange(n))
52-
index = np.zeros(n)
53-
54-
forest = []
55-
while len(forest) < num_trees:
56-
ixs = np.random.choice(n, size=sample_size_range, replace=False)
57-
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
58-
forest.extend(trees)
59-
print(len(forest))
60-
61-
for tree in forest:
62-
codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
63-
avg_codisp[codisp.index] += codisp
64-
np.add.at(index, codisp.index.values, 1)
65-
66-
avg_codisp /= index
67-
# TODO: remove hardcode
68-
avg_codisp.index = df.iloc[(4 - 1) :].index
69-
avg_codisp = (avg_codisp - avg_codisp.min()) / (
70-
avg_codisp.max() - avg_codisp.min()
71-
)
72-
73-
# TODO: use model kwargs for percentile threshold
74-
y_pred = (avg_codisp > np.percentile(avg_codisp, 95)).astype(int)
75-
76-
# TODO: rem pdb
77-
# import pdb
78-
79-
# pdb.set_trace()
80-
print("Done")
81-
82-
# scores = model.score_samples(df)
83-
84-
# index_col = df.columns[0]
85-
86-
# anomaly = pd.DataFrame(
87-
# {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
88-
# ).reset_index(drop=True)
89-
# score = pd.DataFrame(
90-
# {"index": df[index_col], OutputColumns.SCORE_COL: scores}
91-
# ).reset_index(drop=True)
92-
93-
# anomaly_output.add_output(target, anomaly, score)
42+
try:
43+
if df.shape[0] == 1:
44+
raise ValueError("Dataset size must be greater than 1")
45+
df_values = df[self.spec.target_column].astype(float).values
46+
47+
cal_shingle_size = (
48+
shingle_size
49+
if shingle_size
50+
else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
51+
)
52+
points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
53+
54+
sample_size_range = (1, points.shape[0])
55+
n = points.shape[0]
56+
avg_codisp = pd.Series(0.0, index=np.arange(n))
57+
index = np.zeros(n)
58+
59+
forest = []
60+
while len(forest) < num_trees:
61+
ixs = np.random.choice(n, size=sample_size_range, replace=False)
62+
trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
63+
forest.extend(trees)
64+
65+
for tree in forest:
66+
codisp = pd.Series(
67+
{leaf: tree.codisp(leaf) for leaf in tree.leaves}
68+
)
69+
avg_codisp[codisp.index] += codisp
70+
np.add.at(index, codisp.index.values, 1)
71+
72+
avg_codisp /= index
73+
avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
74+
avg_codisp = (avg_codisp - avg_codisp.min()) / (
75+
avg_codisp.max() - avg_codisp.min()
76+
)
77+
78+
y_pred = (
79+
avg_codisp > np.percentile(avg_codisp, anamoly_threshold)
80+
).astype(int)
81+
82+
index_col = df.columns[0]
83+
84+
anomaly = pd.DataFrame(
85+
{index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
86+
).reset_index(drop=True)
87+
score = pd.DataFrame(
88+
{"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
89+
).reset_index(drop=True)
90+
91+
anomaly_output.add_output(target, anomaly, score)
92+
except Exception as e:
93+
logger.warn(f"Encountered Error: {e}. Skipping series {target}.")
9494

9595
return anomaly_output
9696

0 commit comments

Comments
 (0)