Skip to content

Commit 2267dd3

Browse files
authored
ODSC-61571 : Add RCF Implementation (#934)
2 parents 6b31d0f + c54e341 commit 2267dd3

File tree

9 files changed

+188
-38
lines changed

9 files changed

+188
-38
lines changed

THIRD_PARTY_LICENSES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,12 @@ mlforecast
453453
* Source code: https://github.com/Nixtla/mlforecast
454454
* Project home: https://github.com/Nixtla/mlforecast
455455

456+
rrcf
457+
* Copyright 2018 kLabUM
458+
* License: MIT License
459+
* Source code: https://github.com/kLabUM/rrcf
460+
* Project home: https://github.com/kLabUM/rrcf
461+
456462
=======
457463
=============================== Licenses ===============================
458464
------------------------------------------------------------------------

ads/opctl/operator/lowcode/anomaly/const.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
2121

2222
OneClassSVM = "oneclasssvm"
2323
IsolationForest = "isolationforest"
24+
RandomCutForest = "randomcutforest"
2425
# TODO : Add DBScan
2526
# DBScan = "dbscan"
2627

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 58 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616

1717
from ads.common.object_storage_details import ObjectStorageDetails
1818
from ads.opctl import logger
19-
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
19+
from ads.opctl.operator.lowcode.anomaly.const import (
20+
SUBSAMPLE_THRESHOLD,
21+
OutputColumns,
22+
SupportedMetrics,
23+
)
2024
from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
2125
from ads.opctl.operator.lowcode.common.utils import (
2226
disable_print,
@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
5559
def generate_report(self):
5660
"""Generates the report."""
5761
import matplotlib.pyplot as plt
62+
plt.rcParams.update({'figure.max_open_warning': 0})
5863
import report_creator as rc
5964

6065
start_time = time.time()
@@ -87,43 +92,59 @@ def generate_report(self):
8792
self.spec.datetime_column.name if self.spec.datetime_column else "index"
8893
)
8994

95+
(
96+
model_description,
97+
other_sections,
98+
) = self._generate_report()
99+
90100
blocks = []
91101
for target, df in self.datasets.full_data_dict.items():
92-
figure_blocks = []
93-
time_col = df[date_column].reset_index(drop=True)
94-
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
95-
OutputColumns.ANOMALY_COL
96-
]
97-
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
98-
downsampled_time_col = time_col
99-
selected_indices = list(range(len(time_col)))
100-
if self.spec.subsample_report_data:
101-
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
102-
# Downsample non-anomalous data if it exceeds the threshold (1000)
103-
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
104-
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
105-
selected_indices = anomaly_indices + downsampled_non_anomaly_indices
106-
selected_indices.sort()
107-
downsampled_time_col = time_col[selected_indices]
108-
109-
columns = set(df.columns).difference({date_column})
110-
for col in columns:
111-
y = df[col].reset_index(drop=True)
112-
113-
downsampled_y = y[selected_indices]
114-
115-
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
116-
ax.grid()
117-
ax.plot(downsampled_time_col, downsampled_y, color="black")
118-
# Plot anomalies
119-
for i in anomaly_indices:
120-
ax.scatter(time_col[i], y[i], color="red", marker="o")
121-
plt.xlabel(date_column)
122-
plt.ylabel(col)
123-
plt.title(f"`{col}` with reference to anomalies")
124-
figure_blocks.append(rc.Widget(ax))
125-
126-
blocks.append(rc.Group(*figure_blocks, label=target))
102+
if target in anomaly_output.list_categories():
103+
figure_blocks = []
104+
time_col = df[date_column].reset_index(drop=True)
105+
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
106+
OutputColumns.ANOMALY_COL
107+
]
108+
anomaly_indices = [
109+
i for i, index in enumerate(anomaly_col) if index == 1
110+
]
111+
downsampled_time_col = time_col
112+
selected_indices = list(range(len(time_col)))
113+
if self.spec.subsample_report_data:
114+
non_anomaly_indices = [
115+
i for i in range(len(time_col)) if i not in anomaly_indices
116+
]
117+
# Downsample non-anomalous data if it exceeds the threshold (1000)
118+
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
119+
downsampled_non_anomaly_indices = non_anomaly_indices[
120+
:: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
121+
]
122+
selected_indices = (
123+
anomaly_indices + downsampled_non_anomaly_indices
124+
)
125+
selected_indices.sort()
126+
downsampled_time_col = time_col[selected_indices]
127+
128+
columns = set(df.columns).difference({date_column})
129+
for col in columns:
130+
y = df[col].reset_index(drop=True)
131+
132+
downsampled_y = y[selected_indices]
133+
134+
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
135+
ax.grid()
136+
ax.plot(downsampled_time_col, downsampled_y, color="black")
137+
# Plot anomalies
138+
for i in anomaly_indices:
139+
ax.scatter(time_col[i], y[i], color="red", marker="o")
140+
plt.xlabel(date_column)
141+
plt.ylabel(col)
142+
plt.title(f"`{col}` with reference to anomalies")
143+
figure_blocks.append(rc.Widget(ax))
144+
else:
145+
figure_blocks = None
146+
147+
blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
127148
plots = rc.Select(blocks)
128149

129150
report_sections = []
@@ -133,7 +154,7 @@ def generate_report(self):
133154
yaml_appendix = rc.Yaml(self.config.to_dict())
134155
summary = rc.Block(
135156
rc.Group(
136-
rc.Text(f"You selected the **`{self.spec.model}`** model."),
157+
rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
137158
rc.Text(
138159
"Based on your dataset, you could have also selected "
139160
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."

ads/opctl/operator/lowcode/anomaly/model/factory.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .base_model import AnomalyOperatorBaseModel
1616
from .isolationforest import IsolationForestOperatorModel
1717
from .oneclasssvm import OneClassSVMOperatorModel
18+
from .randomcutforest import RandomCutForestOperatorModel
1819

1920

2021
class UnSupportedModelError(Exception):
@@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
5253
_NonTime_MAP = {
5354
NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
5455
NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
56+
NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
5557
# TODO: Add DBScan model for non time based anomaly
5658
# NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
5759
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
4+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
5+
6+
import numpy as np
7+
import pandas as pd
8+
9+
from ads.common.decorator.runtime_dependency import runtime_dependency
10+
from ads.opctl import logger
11+
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
12+
13+
from .anomaly_dataset import AnomalyOutput
14+
from .base_model import AnomalyOperatorBaseModel
15+
16+
17+
class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
    """
    Class representing Random Cut Forest Anomaly Detection operator model.

    Supported ``model_kwargs`` (all optional):

    * ``num_trees`` -- number of random cut trees to build (default 200).
    * ``shingle_size`` -- window length passed to ``rrcf.shingle``; when not
      set it is derived from the number of rows in the series.
    * ``anomaly_threshold`` -- percentile (0-100) of the normalised score
      above which a point is flagged as an anomaly (default 95). The
      historical misspelling ``anamoly_threshold`` is still accepted.
    """

    @runtime_dependency(
        module="rrcf",
        err_msg=(
            "Please run `pip install rrcf` to "
            "install the required dependencies for RandomCutForest."
        ),
    )
    def _build_model(self) -> AnomalyOutput:
        """Score every series with a random cut forest and flag anomalies.

        Returns
        -------
        AnomalyOutput
            One anomaly frame and one score frame per series that could be
            processed. A series that raises during processing is logged and
            skipped rather than aborting the whole run.
        """
        # Import the package itself: the code below refers to `rrcf.shingle`
        # and `rrcf.RCTree`, so binding only `RCTree` would leave `rrcf`
        # undefined and every series would fail with a NameError.
        import rrcf

        model_kwargs = self.spec.model_kwargs

        anomaly_output = AnomalyOutput(date_column="index")

        # Set tree parameters
        num_trees = model_kwargs.get("num_trees", 200)
        shingle_size = model_kwargs.get("shingle_size", None)
        # Prefer the correctly spelled key; fall back to the legacy typo so
        # existing configurations keep working.
        anomaly_threshold = model_kwargs.get(
            "anomaly_threshold", model_kwargs.get("anamoly_threshold", 95)
        )

        for target, df in self.datasets.full_data_dict.items():
            try:
                if df.shape[0] < 2:
                    raise ValueError("Dataset size must be greater than 1")
                df_values = df[self.spec.target_column].astype(float).values

                # Default shingle size: half of the largest power of two that
                # does not exceed the number of rows.
                cal_shingle_size = (
                    shingle_size
                    if shingle_size
                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
                )
                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))

                n = points.shape[0]
                # Shape (1, n) drawn without replacement: each draw yields one
                # row holding a permutation of all point indices, i.e. one
                # tree over every shingled point per loop iteration.
                sample_size_range = (1, n)
                avg_codisp = pd.Series(0.0, index=np.arange(n))
                leaf_counts = np.zeros(n)

                forest = []
                while len(forest) < num_trees:
                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
                    forest.extend(
                        rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs
                    )

                # Accumulate the collusive displacement of every leaf and the
                # number of trees each point appeared in, then average.
                for tree in forest:
                    codisp = pd.Series(
                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
                    )
                    avg_codisp[codisp.index] += codisp
                    np.add.at(leaf_counts, codisp.index.values, 1)

                avg_codisp /= leaf_counts
                # Scores are re-labelled with the rows from position
                # `cal_shingle_size - 1` onwards; earlier rows get no score.
                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index

                # Min-max normalise. When every score is identical the range
                # is zero; subtracting the minimum alone then yields all-zero
                # scores instead of NaN from a 0/0 division.
                score_min = avg_codisp.min()
                score_range = avg_codisp.max() - score_min
                if score_range > 0:
                    avg_codisp = (avg_codisp - score_min) / score_range
                else:
                    avg_codisp = avg_codisp - score_min

                y_pred = (
                    avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
                ).astype(int)

                index_col = df.columns[0]

                anomaly = pd.DataFrame(
                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
                ).reset_index(drop=True)
                score = pd.DataFrame(
                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
                ).reset_index(drop=True)

                anomaly_output.add_output(target, anomaly, score)
            except Exception as e:
                # Best effort per series: report the failure and move on.
                logger.warning(f"Encountered Error: {e}. Skipping series {target}.")

        return anomaly_output

    def _generate_report(self):
        """Build the model-specific pieces of the report.

        Returns
        -------
        tuple
            ``(model_description, other_sections)`` where ``model_description``
            is an ``rc.Text`` describing Random Cut Forest and
            ``other_sections`` is a list of report blocks (a heading and an
            introductory text).
        """
        import report_creator as rc

        other_sections = [
            rc.Heading("Selected Models Overview", level=2),
            rc.Text(
                "The following tables provide information regarding the chosen model."
            ),
        ]

        model_description = rc.Text(
            "The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
            " It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
        )

        return (
            model_description,
            other_sections,
        )

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ spec:
363363
- auto
364364
- oneclasssvm
365365
- isolationforest
366+
- randomcutforest
366367
meta:
367368
description: "The model to be used for anomaly detection"
368369

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ anomaly = [
176176
"autots",
177177
"oracledb",
178178
"report-creator==1.0.9",
179+
"rrcf==0.4.4",
180+
"scikit-learn"
179181
]
180182
recommender = [
181183
"oracle_ads[opctl]",

test-requirements-operators.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-r test-requirements.txt
22
-e ".[forecast]"
3+
-e ".[anomaly]"
34
-e ".[recommender]"
45
-e ".[feature-store-marketplace]"
56
plotly

tests/operators/anomaly/test_anomaly_simple.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
for d in DATASETS:
5353
parameters_short.append((m, d))
5454

55-
MODELS = ["autots", "oneclasssvm", "isolationforest"]
55+
MODELS = ["autots", "oneclasssvm", "isolationforest", "randomcutforest"]
5656

5757
@pytest.mark.parametrize("model", ["autots"])
5858
def test_artificial_big(model):

0 commit comments

Comments
 (0)