ODSC-61571 : Add RCF Implementation (#934)

codeloop · web-flow · commit 2267dd3e2b69 · 2024-09-16T14:43:38.000+05:30
diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt
@@ -453,6 +453,12 @@ mlforecast
 * Source code: https://github.com/Nixtla/mlforecast
 * Project home: https://github.com/Nixtla/mlforecast
 
+rrcf
+* Copyright 2018 kLabUM
+* License: MIT License
+* Source code: https://github.com/kLabUM/rrcf
+* Project home: https://github.com/kLabUM/rrcf
+
 =======
 =============================== Licenses ===============================
 ------------------------------------------------------------------------
diff --git a/ads/opctl/operator/lowcode/anomaly/const.py b/ads/opctl/operator/lowcode/anomaly/const.py
@@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
 
     OneClassSVM = "oneclasssvm"
     IsolationForest = "isolationforest"
+    RandomCutForest = "randomcutforest"
     # TODO : Add DBScan
     # DBScan = "dbscan"
     
diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -16,7 +16,11 @@
 
 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
+from ads.opctl.operator.lowcode.anomaly.const import (
+    SUBSAMPLE_THRESHOLD,
+    OutputColumns,
+    SupportedMetrics,
+)
 from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
 from ads.opctl.operator.lowcode.common.utils import (
     disable_print,
@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
     def generate_report(self):
         """Generates the report."""
         import matplotlib.pyplot as plt
+        plt.rcParams.update({'figure.max_open_warning': 0})
         import report_creator as rc
 
         start_time = time.time()
@@ -87,43 +92,59 @@ def generate_report(self):
             self.spec.datetime_column.name if self.spec.datetime_column else "index"
         )
 
+        (
+            model_description,
+            other_sections,
+        ) = self._generate_report()
+
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
-            figure_blocks = []
-            time_col = df[date_column].reset_index(drop=True)
-            anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-                OutputColumns.ANOMALY_COL
-            ]
-            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-            downsampled_time_col = time_col
-            selected_indices = list(range(len(time_col)))
-            if self.spec.subsample_report_data:
-                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-                # Downsample non-anomalous data if it exceeds the threshold (1000)
-                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-                    selected_indices.sort()
-                downsampled_time_col = time_col[selected_indices]
-
-            columns = set(df.columns).difference({date_column})
-            for col in columns:
-                y = df[col].reset_index(drop=True)
-
-                downsampled_y = y[selected_indices]
-
-                fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-                ax.grid()
-                ax.plot(downsampled_time_col, downsampled_y, color="black")
-                # Plot anomalies
-                for i in anomaly_indices:
-                    ax.scatter(time_col[i], y[i], color="red", marker="o")
-                plt.xlabel(date_column)
-                plt.ylabel(col)
-                plt.title(f"`{col}` with reference to anomalies")
-                figure_blocks.append(rc.Widget(ax))
-
-        blocks.append(rc.Group(*figure_blocks, label=target))
+            if target in anomaly_output.list_categories():
+                figure_blocks = []
+                time_col = df[date_column].reset_index(drop=True)
+                anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+                    OutputColumns.ANOMALY_COL
+                ]
+                anomaly_indices = [
+                    i for i, index in enumerate(anomaly_col) if index == 1
+                ]
+                downsampled_time_col = time_col
+                selected_indices = list(range(len(time_col)))
+                if self.spec.subsample_report_data:
+                    non_anomaly_indices = [
+                        i for i in range(len(time_col)) if i not in anomaly_indices
+                    ]
+                    # Downsample non-anomalous data if it exceeds the threshold (1000)
+                    if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                        downsampled_non_anomaly_indices = non_anomaly_indices[
+                            :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+                        ]
+                        selected_indices = (
+                            anomaly_indices + downsampled_non_anomaly_indices
+                        )
+                        selected_indices.sort()
+                    downsampled_time_col = time_col[selected_indices]
+
+                columns = set(df.columns).difference({date_column})
+                for col in columns:
+                    y = df[col].reset_index(drop=True)
+
+                    downsampled_y = y[selected_indices]
+
+                    fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+                    ax.grid()
+                    ax.plot(downsampled_time_col, downsampled_y, color="black")
+                    # Plot anomalies
+                    for i in anomaly_indices:
+                        ax.scatter(time_col[i], y[i], color="red", marker="o")
+                    plt.xlabel(date_column)
+                    plt.ylabel(col)
+                    plt.title(f"`{col}` with reference to anomalies")
+                    figure_blocks.append(rc.Widget(ax))
+            else:
+                figure_blocks = None
+
+            blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
         plots = rc.Select(blocks)
 
         report_sections = []
@@ -133,7 +154,7 @@ def generate_report(self):
         yaml_appendix = rc.Yaml(self.config.to_dict())
         summary = rc.Block(
             rc.Group(
-                rc.Text(f"You selected the **`{self.spec.model}`** model."),
+                rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
                 rc.Text(
                     "Based on your dataset, you could have also selected "
                     f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
diff --git a/ads/opctl/operator/lowcode/anomaly/model/factory.py b/ads/opctl/operator/lowcode/anomaly/model/factory.py
@@ -15,6 +15,7 @@
 from .base_model import AnomalyOperatorBaseModel
 from .isolationforest import IsolationForestOperatorModel
 from .oneclasssvm import OneClassSVMOperatorModel
+from .randomcutforest import RandomCutForestOperatorModel
 
 
 class UnSupportedModelError(Exception):
@@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
     _NonTime_MAP = {
         NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
         NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
+        NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
         # TODO: Add DBScan model for non time based anomaly
         # NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
     }
diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+from ads.opctl import logger
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+from .anomaly_dataset import AnomalyOutput
+from .base_model import AnomalyOperatorBaseModel
+
+
+class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
+    """
+    Class representing Random Cut Forest Anomaly Detection operator model.
+    """
+
+    @runtime_dependency(
+        module="rrcf",
+        err_msg=(
+            "Please run `pip install rrcf` to "
+            "install the required dependencies for RandomCutForest."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        from rrcf import RCTree
+
+        model_kwargs = self.spec.model_kwargs
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        # Set tree parameters
+        num_trees = model_kwargs.get("num_trees", 200)
+        shingle_size = model_kwargs.get("shingle_size", None)
+        anomaly_threshold = model_kwargs.get("anamoly_threshold", 95)
+
+        for target, df in self.datasets.full_data_dict.items():
+            try:
+                if df.shape[0] == 1:
+                    raise ValueError("Dataset size must be greater than 1")
+                df_values = df[self.spec.target_column].astype(float).values
+
+                cal_shingle_size = (
+                    shingle_size
+                    if shingle_size
+                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
+                )
+                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
+
+                sample_size_range = (1, points.shape[0])
+                n = points.shape[0]
+                avg_codisp = pd.Series(0.0, index=np.arange(n))
+                index = np.zeros(n)
+
+                forest = []
+                while len(forest) < num_trees:
+                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
+                    trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
+                    forest.extend(trees)
+
+                for tree in forest:
+                    codisp = pd.Series(
+                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
+                    )
+                    avg_codisp[codisp.index] += codisp
+                    np.add.at(index, codisp.index.values, 1)
+
+                avg_codisp /= index
+                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
+                avg_codisp = (avg_codisp - avg_codisp.min()) / (
+                    avg_codisp.max() - avg_codisp.min()
+                )
+
+                y_pred = (
+                    avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
+                ).astype(int)
+
+                index_col = df.columns[0]
+
+                anomaly = pd.DataFrame(
+                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
+                ).reset_index(drop=True)
+                score = pd.DataFrame(
+                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
+                ).reset_index(drop=True)
+
+                anomaly_output.add_output(target, anomaly, score)
+            except Exception as e:
+                logger.warn(f"Encountered Error: {e}. Skipping series {target}.")
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Generates the report."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
+            " It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
+        )
+
+        return (
+            model_description,
+            other_sections,
+        )
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -363,6 +363,7 @@ spec:
         - auto
         - oneclasssvm
         - isolationforest
+        - randomcutforest
       meta:
         description: "The model to be used for anomaly detection"
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -176,6 +176,8 @@ anomaly  = [
   "autots",
   "oracledb",
   "report-creator==1.0.9",
+  "rrcf==0.4.4",
+  "scikit-learn"
 ]
 recommender = [
   "oracle_ads[opctl]",
diff --git a/test-requirements-operators.txt b/test-requirements-operators.txt
@@ -1,5 +1,6 @@
 -r test-requirements.txt
 -e ".[forecast]"
+-e ".[anomaly]"
 -e ".[recommender]"
 -e ".[feature-store-marketplace]"
 plotly
diff --git a/tests/operators/anomaly/test_anomaly_simple.py b/tests/operators/anomaly/test_anomaly_simple.py
@@ -52,7 +52,7 @@
     for d in DATASETS:
         parameters_short.append((m, d))
 
-MODELS = ["autots", "oneclasssvm", "isolationforest"]
+MODELS = ["autots", "oneclasssvm", "isolationforest", "randomcutforest"]
 
 @pytest.mark.parametrize("model", ["autots"])
 def test_artificial_big(model):

Original file line number	Diff line number	Diff line change
`@@ -176,6 +176,8 @@ anomaly = [`
`176`	`176`	`"autots",`
`177`	`177`	`"oracledb",`
`178`	`178`	`"report-creator==1.0.9",`
	`179`	`+ "rrcf==0.4.4",`
	`180`	`+ "scikit-learn"`
`179`	`181`	`]`
`180`	`182`	`recommender = [`
`181`	`183`	`"oracle_ads[opctl]",`