Merge branch 'main' into feature/aquav1.0.3

VipulMascarenhas · VipulMascarenhas · commit 575fb31275f7 · 2024-07-22T09:54:35.000-07:00
diff --git a/ads/opctl/operator/lowcode/anomaly/const.py b/ads/opctl/operator/lowcode/anomaly/const.py
@@ -94,3 +94,4 @@ class OutputColumns(str, metaclass=ExtendedEnumMeta):
 
 
 TODS_DEFAULT_MODEL = "ocsvm"
+SUBSAMPLE_THRESHOLD = 1000
diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -16,7 +16,7 @@
 
 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
 from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
 from ads.opctl.operator.lowcode.common.utils import (
     disable_print,
@@ -79,7 +79,7 @@ def generate_report(self):
                 anomaly_output, test_data, elapsed_time
             )
         table_blocks = [
-            rc.DataTable(df, label=col, index=True)
+            rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True)
             for col, df in self.datasets.full_data_dict.items()
         ]
         data_table = rc.Select(blocks=table_blocks)
@@ -94,20 +94,36 @@ def generate_report(self):
             anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
                 OutputColumns.ANOMALY_COL
             ]
+            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
+            downsampled_time_col = time_col
+            selected_indices = list(range(len(time_col)))
+            if self.spec.subsample_report_data:
+                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
+                # Thin out non-anomalous points once their count exceeds SUBSAMPLE_THRESHOLD; anomalies are always kept
+                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
+                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
+                    selected_indices.sort()
+                downsampled_time_col = time_col[selected_indices]
+
             columns = set(df.columns).difference({date_column})
             for col in columns:
                 y = df[col].reset_index(drop=True)
+
+                downsampled_y = y[selected_indices]
+
                 fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
                 ax.grid()
-                ax.plot(time_col, y, color="black")
-                for i, index in enumerate(anomaly_col):
-                    if index == 1:
-                        ax.scatter(time_col[i], y[i], color="red", marker="o")
+                ax.plot(downsampled_time_col, downsampled_y, color="black")
+                # Plot anomalies
+                for i in anomaly_indices:
+                    ax.scatter(time_col[i], y[i], color="red", marker="o")
                 plt.xlabel(date_column)
                 plt.ylabel(col)
                 plt.title(f"`{col}` with reference to anomalies")
                 figure_blocks.append(rc.Widget(ax))
-            blocks.append(rc.Group(*figure_blocks, label=target))
+
+        blocks.append(rc.Group(*figure_blocks, label=target))
         plots = rc.Select(blocks)
 
         report_sections = []
diff --git a/ads/opctl/operator/lowcode/anomaly/operator_config.py b/ads/opctl/operator/lowcode/anomaly/operator_config.py
@@ -77,6 +77,7 @@ class AnomalyOperatorSpec(DataClassSerializable):
     model: str = None
     model_kwargs: Dict = field(default_factory=dict)
     contamination: float = None
+    subsample_report_data: bool = None
 
     def __post_init__(self):
         """Adjusts the specification details."""
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -377,4 +377,8 @@ spec:
       type: dict
       required: false
 
+    subsample_report_data:
+      type: boolean
+      required: false
+
   type: dict
diff --git a/ads/opctl/operator/lowcode/common/errors.py b/ads/opctl/operator/lowcode/common/errors.py
@@ -39,3 +39,9 @@ def __init__(self, error: str):
             "complies with the required schema for the operator. \n"
             f"{error}"
         )
+
+
+class InsufficientDataError(Exception):
+    def __init__(self, message: str):
+        self.message = message
+        super().__init__(message)
diff --git a/ads/opctl/operator/lowcode/forecast/model/base_model.py b/ads/opctl/operator/lowcode/forecast/model/base_model.py
@@ -249,20 +249,28 @@ def generate_report(self):
                     train_metrics_sections = [sec9_text, sec9]
 
                 backtest_sections = []
+                output_dir = self.spec.output_directory.url
+                backtest_report_name = "backtest_stats.csv"
+                file_path = f"{output_dir}/{backtest_report_name}"
                 if self.spec.model == AUTO_SELECT:
-                    output_dir = self.spec.output_directory.url
-                    backtest_report_name = "backtest_stats.csv"
-                    backtest_stats = pd.read_csv(f"{output_dir}/{backtest_report_name}")
-                    average_dict = backtest_stats.mean().to_dict()
-                    del average_dict['backtest']
-                    best_model = min(average_dict, key=average_dict.get)
-                    backtest_text = rc.Heading("Back Testing Metrics", level=2)
-                    summary_text = rc.Text(
-                        f"Overall, the average scores for the models are {average_dict}, with {best_model}"
-                        f" being identified as the top-performing model during backtesting.")
-                    backtest_table = rc.DataTable(backtest_stats, index=True)
-                    liner_plot = get_auto_select_plot(backtest_stats)
-                    backtest_sections = [backtest_text, backtest_table, summary_text, liner_plot]
+                    backtest_sections.append(rc.Heading("Auto-select statistics", level=2))
+                    if not os.path.exists(file_path):
+                        failure_msg = rc.Text("auto-select could not be executed. Please check the "
+                                              "logs for more details.")
+                        backtest_sections.append(failure_msg)
+                    else:
+                        backtest_stats = pd.read_csv(file_path)
+                        average_dict = backtest_stats.mean().to_dict()
+                        del average_dict['backtest']
+                        best_model = min(average_dict, key=average_dict.get)
+                        backtest_text = rc.Heading("Back Testing Metrics", level=3)
+                        summary_text = rc.Text(
+                            f"Overall, the average scores for the models are {average_dict}, with {best_model}"
+                            f" being identified as the top-performing model during backtesting.")
+                        backtest_table = rc.DataTable(backtest_stats, index=True)
+                        liner_plot = get_auto_select_plot(backtest_stats)
+                        backtest_sections.extend([backtest_text, backtest_table, summary_text,
+                                                                      liner_plot])
 
 
                 forecast_plots = []
diff --git a/ads/opctl/operator/lowcode/forecast/model_evaluator.py b/ads/opctl/operator/lowcode/forecast/model_evaluator.py
@@ -12,7 +12,8 @@
 from ads.opctl.operator.lowcode.common.const import DataColumns
 from .model.forecast_datasets import ForecastDatasets
 from .operator_config import ForecastOperatorConfig
-
+from ads.opctl.operator.lowcode.forecast.model.factory import SupportedModels
+from ads.opctl.operator.lowcode.common.errors import InsufficientDataError
 
 class ModelEvaluator:
     """
@@ -61,6 +62,9 @@ def generate_k_fold_data(self, datasets: ForecastDatasets, operator_config: Fore
         unique_dates = min_series_data[date_col].unique()
 
         cut_offs = self.generate_cutoffs(unique_dates, horizon)
+        if not len(cut_offs):
+            raise InsufficientDataError("Insufficient data to evaluate multiple models. Please specify a model "
+                                        "instead of using auto-select.")
         training_datasets = [sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date] for cut_off_date
                              in cut_offs]
         test_datasets = [sampled_historical_data[sampled_historical_data[date_col] > cut_offs[0]]]
@@ -137,7 +141,12 @@ def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOp
         return metrics
 
     def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
-        metrics = self.run_all_models(datasets, operator_config)
+        try:
+            metrics = self.run_all_models(datasets, operator_config)
+        except InsufficientDataError as e:
+            model = SupportedModels.Prophet
+            logger.error(f"Running {model} model as auto-select failed with the following error: {e.message}")
+            return model
         avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
         best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
         logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -45,6 +45,7 @@ Oracle Accelerated Data Science (ADS)
    user_guide/operators/forecasting_operator/index
    user_guide/operators/anomaly_detection_operator/index
    user_guide/operators/pii_operator/index
+   user_guide/operators/recommender_operator/index
 
 .. toctree::
    :hidden:
diff --git a/docs/source/user_guide/operators/recommender_operator/index.rst b/docs/source/user_guide/operators/recommender_operator/index.rst
@@ -1,6 +1,6 @@
-===
+===========
 Recommender
-===
+===========
 
 The Recommender Operator utilizes advanced algorithms to provide personalized recommendations based on user behavior and preferences. This operator streamlines the data science workflow by automating the process of selecting the best recommendation algorithms, tuning hyperparameters, and extracting relevant features, ensuring that users receive the most relevant and effective suggestions for their needs.
 
diff --git a/docs/source/user_guide/operators/recommender_operator/quickstart.rst b/docs/source/user_guide/operators/recommender_operator/quickstart.rst

Original file line number	Diff line number	Diff line change
`@@ -94,3 +94,4 @@ class OutputColumns(str, metaclass=ExtendedEnumMeta):`
`94`	`94`
`95`	`95`
`96`	`96`	`TODS_DEFAULT_MODEL = "ocsvm"`
	`97`	`+SUBSAMPLE_THRESHOLD = 1000`