update the todos, complete implementation

codeloop · codeloop · commit 8b5c6995c932 · 2024-08-23T07:26:24.000Z
diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -55,6 +55,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
     def generate_report(self):
         """Generates the report."""
         import matplotlib.pyplot as plt
+        plt.rcParams.update({'figure.max_open_warning': 0})
         import report_creator as rc
 
         start_time = time.time()
@@ -87,43 +88,57 @@ def generate_report(self):
             self.spec.datetime_column.name if self.spec.datetime_column else "index"
         )
 
+        (
+            model_description,
+            other_sections,
+        ) = self._generate_report()
+
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
-            figure_blocks = []
-            time_col = df[date_column].reset_index(drop=True)
-            anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-                OutputColumns.ANOMALY_COL
-            ]
-            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-            downsampled_time_col = time_col
-            selected_indices = list(range(len(time_col)))
-            if self.spec.subsample_report_data:
-                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-                # Downsample non-anomalous data if it exceeds the threshold (1000)
-                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-                    selected_indices.sort()
-                downsampled_time_col = time_col[selected_indices]
-
-            columns = set(df.columns).difference({date_column})
-            for col in columns:
-                y = df[col].reset_index(drop=True)
-
-                downsampled_y = y[selected_indices]
-
-                fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-                ax.grid()
-                ax.plot(downsampled_time_col, downsampled_y, color="black")
-                # Plot anomalies
-                for i in anomaly_indices:
-                    ax.scatter(time_col[i], y[i], color="red", marker="o")
-                plt.xlabel(date_column)
-                plt.ylabel(col)
-                plt.title(f"`{col}` with reference to anomalies")
-                figure_blocks.append(rc.Widget(ax))
-
-        blocks.append(rc.Group(*figure_blocks, label=target))
+            if target in anomaly_output.list_categories():
+                figure_blocks = []
+                time_col = df[date_column].reset_index(drop=True)
+                anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+                    OutputColumns.ANOMALY_COL
+                ]
+                anomaly_indices = [
+                    i for i, index in enumerate(anomaly_col) if index == 1
+                ]
+                downsampled_time_col = time_col
+                selected_indices = list(range(len(time_col)))
+                if self.spec.subsample_report_data:
+                    non_anomaly_indices = [
+                        i for i in range(len(time_col)) if i not in anomaly_indices
+                    ]
+                    # Downsample non-anomalous data if it exceeds the threshold (1000)
+                    if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                        downsampled_non_anomaly_indices = non_anomaly_indices[
+                            :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+                        ]
+                        selected_indices = (
+                            anomaly_indices + downsampled_non_anomaly_indices
+                        )
+                        selected_indices.sort()
+                    downsampled_time_col = time_col[selected_indices]
+
+                columns = set(df.columns).difference({date_column})
+                for col in columns:
+                    y = df[col].reset_index(drop=True)
+
+                    downsampled_y = y[selected_indices]
+
+                    fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+                    ax.grid()
+                    ax.plot(downsampled_time_col, downsampled_y, color="black")
+                    # Plot anomalies
+                    for i in anomaly_indices:
+                        ax.scatter(time_col[i], y[i], color="red", marker="o")
+                    plt.xlabel(date_column)
+                    plt.ylabel(col)
+                    plt.title(f"`{col}` with reference to anomalies")
+                    figure_blocks.append(rc.Widget(ax))
+
+            blocks.append(rc.Group(*figure_blocks, label=target))
         plots = rc.Select(blocks)
 
         report_sections = []
@@ -133,7 +148,7 @@ def generate_report(self):
         yaml_appendix = rc.Yaml(self.config.to_dict())
         summary = rc.Block(
             rc.Group(
-                rc.Text(f"You selected the **`{self.spec.model}`** model."),
+                rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
                 rc.Text(
                     "Based on your dataset, you could have also selected "
                     f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
diff --git a/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py b/ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py
@@ -7,6 +7,7 @@
 import pandas as pd
 
 from ads.common.decorator.runtime_dependency import runtime_dependency
+from ads.opctl import logger
 from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
 
 from .anomaly_dataset import AnomalyOutput
@@ -29,68 +30,67 @@ def _build_model(self) -> AnomalyOutput:
         from rrcf import RCTree
 
         model_kwargs = self.spec.model_kwargs
-        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
-        # self.outlier_map = {1: 0, -1: 1}
 
         anomaly_output = AnomalyOutput(date_column="index")
 
         # Set tree parameters
         num_trees = model_kwargs.get("num_trees", 200)
-        shingle_size = model_kwargs.get("shingle_size", 1)
-        tree_size = model_kwargs.get("tree_size", 1000)
+        shingle_size = model_kwargs.get("shingle_size", None)
+        anamoly_threshold = model_kwargs.get("anamoly_threshold", 95)
 
         for target, df in self.datasets.full_data_dict.items():
-            df_values = df[self.spec.target_column].astype(float).values
-
-            # TODO: Update size to log logic
-            points = np.vstack(list(rrcf.shingle(df_values, size=4)))
-
-            # TODO: remove hardcode
-            sample_size_range = (1, 6)
-            n = points.shape[0]
-            avg_codisp = pd.Series(0.0, index=np.arange(n))
-            index = np.zeros(n)
-
-            forest = []
-            while len(forest) < num_trees:
-                ixs = np.random.choice(n, size=sample_size_range, replace=False)
-                trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
-                forest.extend(trees)
-                print(len(forest))
-
-            for tree in forest:
-                codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
-                avg_codisp[codisp.index] += codisp
-                np.add.at(index, codisp.index.values, 1)
-
-            avg_codisp /= index
-            # TODO: remove hardcode
-            avg_codisp.index = df.iloc[(4 - 1) :].index
-            avg_codisp = (avg_codisp - avg_codisp.min()) / (
-                avg_codisp.max() - avg_codisp.min()
-            )
-
-            # TODO: use model kwargs for percentile threshold
-            y_pred = (avg_codisp > np.percentile(avg_codisp, 95)).astype(int)
-
-            # TODO: rem pdb
-            # import pdb
-
-            # pdb.set_trace()
-            print("Done")
-
-            # scores = model.score_samples(df)
-
-            # index_col = df.columns[0]
-
-            # anomaly = pd.DataFrame(
-            #     {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
-            # ).reset_index(drop=True)
-            # score = pd.DataFrame(
-            #     {"index": df[index_col], OutputColumns.SCORE_COL: scores}
-            # ).reset_index(drop=True)
-
-            # anomaly_output.add_output(target, anomaly, score)
+            try:
+                if df.shape[0] == 1:
+                    raise ValueError("Dataset size must be greater than 1")
+                df_values = df[self.spec.target_column].astype(float).values
+
+                cal_shingle_size = (
+                    shingle_size
+                    if shingle_size
+                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
+                )
+                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
+
+                sample_size_range = (1, points.shape[0])
+                n = points.shape[0]
+                avg_codisp = pd.Series(0.0, index=np.arange(n))
+                index = np.zeros(n)
+
+                forest = []
+                while len(forest) < num_trees:
+                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
+                    trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
+                    forest.extend(trees)
+
+                for tree in forest:
+                    codisp = pd.Series(
+                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
+                    )
+                    avg_codisp[codisp.index] += codisp
+                    np.add.at(index, codisp.index.values, 1)
+
+                avg_codisp /= index
+                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
+                avg_codisp = (avg_codisp - avg_codisp.min()) / (
+                    avg_codisp.max() - avg_codisp.min()
+                )
+
+                y_pred = (
+                    avg_codisp > np.percentile(avg_codisp, anamoly_threshold)
+                ).astype(int)
+
+                index_col = df.columns[0]
+
+                anomaly = pd.DataFrame(
+                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
+                ).reset_index(drop=True)
+                score = pd.DataFrame(
+                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
+                ).reset_index(drop=True)
+
+                anomaly_output.add_output(target, anomaly, score)
+            except Exception as e:
+                logger.warn(f"Encountered Error: {e}. Skipping series {target}.")
 
         return anomaly_output