Skip to content

Commit 575fb31

Browse files
Merge branch 'main' into feature/aquav1.0.3
2 parents eb31921 + 32d031e commit 575fb31

File tree

10 files changed, with 70 additions and 24 deletions.

10 files changed, with 70 additions and 24 deletions.

ads/opctl/operator/lowcode/anomaly/const.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,3 +94,4 @@ class OutputColumns(str, metaclass=ExtendedEnumMeta):
9494

9595

9696
TODS_DEFAULT_MODEL = "ocsvm"
97+
SUBSAMPLE_THRESHOLD = 1000

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717
from ads.common.object_storage_details import ObjectStorageDetails
1818
from ads.opctl import logger
19-
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics
19+
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
2020
from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
2121
from ads.opctl.operator.lowcode.common.utils import (
2222
disable_print,
@@ -79,7 +79,7 @@ def generate_report(self):
7979
anomaly_output, test_data, elapsed_time
8080
)
8181
table_blocks = [
82-
rc.DataTable(df, label=col, index=True)
82+
rc.DataTable(df.head(SUBSAMPLE_THRESHOLD) if self.spec.subsample_report_data and len(df) > SUBSAMPLE_THRESHOLD else df, label=col, index=True)
8383
for col, df in self.datasets.full_data_dict.items()
8484
]
8585
data_table = rc.Select(blocks=table_blocks)
@@ -94,20 +94,36 @@ def generate_report(self):
9494
anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
9595
OutputColumns.ANOMALY_COL
9696
]
97+
anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
98+
downsampled_time_col = time_col
99+
selected_indices = list(range(len(time_col)))
100+
if self.spec.subsample_report_data:
101+
non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
102+
# Downsample non-anomalous data if it exceeds the threshold (1000)
103+
if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
104+
downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
105+
selected_indices = anomaly_indices + downsampled_non_anomaly_indices
106+
selected_indices.sort()
107+
downsampled_time_col = time_col[selected_indices]
108+
97109
columns = set(df.columns).difference({date_column})
98110
for col in columns:
99111
y = df[col].reset_index(drop=True)
112+
113+
downsampled_y = y[selected_indices]
114+
100115
fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
101116
ax.grid()
102-
ax.plot(time_col, y, color="black")
103-
for i, index in enumerate(anomaly_col):
104-
if index == 1:
105-
ax.scatter(time_col[i], y[i], color="red", marker="o")
117+
ax.plot(downsampled_time_col, downsampled_y, color="black")
118+
# Plot anomalies
119+
for i in anomaly_indices:
120+
ax.scatter(time_col[i], y[i], color="red", marker="o")
106121
plt.xlabel(date_column)
107122
plt.ylabel(col)
108123
plt.title(f"`{col}` with reference to anomalies")
109124
figure_blocks.append(rc.Widget(ax))
110-
blocks.append(rc.Group(*figure_blocks, label=target))
125+
126+
blocks.append(rc.Group(*figure_blocks, label=target))
111127
plots = rc.Select(blocks)
112128

113129
report_sections = []

ads/opctl/operator/lowcode/anomaly/operator_config.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ class AnomalyOperatorSpec(DataClassSerializable):
7777
model: str = None
7878
model_kwargs: Dict = field(default_factory=dict)
7979
contamination: float = None
80+
subsample_report_data: bool = None
8081

8182
def __post_init__(self):
8283
"""Adjusts the specification details."""

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -377,4 +377,8 @@ spec:
377377
type: dict
378378
required: false
379379

380+
subsample_report_data:
381+
type: boolean
382+
required: false
383+
380384
type: dict

ads/opctl/operator/lowcode/common/errors.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,3 +39,9 @@ def __init__(self, error: str):
3939
"complies with the required schema for the operator. \n"
4040
f"{error}"
4141
)
42+
43+
44+
class InsufficientDataError(Exception):
45+
def __init__(self, message: str):
46+
self.message = message
47+
super().__init__(message)

ads/opctl/operator/lowcode/forecast/model/base_model.py

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -249,20 +249,28 @@ def generate_report(self):
249249
train_metrics_sections = [sec9_text, sec9]
250250

251251
backtest_sections = []
252+
output_dir = self.spec.output_directory.url
253+
backtest_report_name = "backtest_stats.csv"
254+
file_path = f"{output_dir}/{backtest_report_name}"
252255
if self.spec.model == AUTO_SELECT:
253-
output_dir = self.spec.output_directory.url
254-
backtest_report_name = "backtest_stats.csv"
255-
backtest_stats = pd.read_csv(f"{output_dir}/{backtest_report_name}")
256-
average_dict = backtest_stats.mean().to_dict()
257-
del average_dict['backtest']
258-
best_model = min(average_dict, key=average_dict.get)
259-
backtest_text = rc.Heading("Back Testing Metrics", level=2)
260-
summary_text = rc.Text(
261-
f"Overall, the average scores for the models are {average_dict}, with {best_model}"
262-
f" being identified as the top-performing model during backtesting.")
263-
backtest_table = rc.DataTable(backtest_stats, index=True)
264-
liner_plot = get_auto_select_plot(backtest_stats)
265-
backtest_sections = [backtest_text, backtest_table, summary_text, liner_plot]
256+
backtest_sections.append(rc.Heading("Auto-select statistics", level=2))
257+
if not os.path.exists(file_path):
258+
failure_msg = rc.Text("auto-select could not be executed. Please check the "
259+
"logs for more details.")
260+
backtest_sections.append(failure_msg)
261+
else:
262+
backtest_stats = pd.read_csv(file_path)
263+
average_dict = backtest_stats.mean().to_dict()
264+
del average_dict['backtest']
265+
best_model = min(average_dict, key=average_dict.get)
266+
backtest_text = rc.Heading("Back Testing Metrics", level=3)
267+
summary_text = rc.Text(
268+
f"Overall, the average scores for the models are {average_dict}, with {best_model}"
269+
f" being identified as the top-performing model during backtesting.")
270+
backtest_table = rc.DataTable(backtest_stats, index=True)
271+
liner_plot = get_auto_select_plot(backtest_stats)
272+
backtest_sections.extend([backtest_text, backtest_table, summary_text,
273+
liner_plot])
266274

267275

268276
forecast_plots = []

ads/opctl/operator/lowcode/forecast/model_evaluator.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,8 @@
1212
from ads.opctl.operator.lowcode.common.const import DataColumns
1313
from .model.forecast_datasets import ForecastDatasets
1414
from .operator_config import ForecastOperatorConfig
15-
15+
from ads.opctl.operator.lowcode.forecast.model.factory import SupportedModels
16+
from ads.opctl.operator.lowcode.common.errors import InsufficientDataError
1617

1718
class ModelEvaluator:
1819
"""
@@ -61,6 +62,9 @@ def generate_k_fold_data(self, datasets: ForecastDatasets, operator_config: Fore
6162
unique_dates = min_series_data[date_col].unique()
6263

6364
cut_offs = self.generate_cutoffs(unique_dates, horizon)
65+
if not len(cut_offs):
66+
raise InsufficientDataError("Insufficient data to evaluate multiple models. Please specify a model "
67+
"instead of using auto-select.")
6468
training_datasets = [sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date] for cut_off_date
6569
in cut_offs]
6670
test_datasets = [sampled_historical_data[sampled_historical_data[date_col] > cut_offs[0]]]
@@ -137,7 +141,12 @@ def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOp
137141
return metrics
138142

139143
def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
140-
metrics = self.run_all_models(datasets, operator_config)
144+
try:
145+
metrics = self.run_all_models(datasets, operator_config)
146+
except InsufficientDataError as e:
147+
model = SupportedModels.Prophet
148+
logger.error(f"Running {model} model as auto-select failed with the following error: {e.message}")
149+
return model
141150
avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
142151
best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
143152
logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ Oracle Accelerated Data Science (ADS)
4545
user_guide/operators/forecasting_operator/index
4646
user_guide/operators/anomaly_detection_operator/index
4747
user_guide/operators/pii_operator/index
48+
user_guide/operators/recommender_operator/index
4849

4950
.. toctree::
5051
:hidden:

docs/source/user_guide/operators/recommender/index.rst renamed to docs/source/user_guide/operators/recommender_operator/index.rst

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
1-
===
1+
===========
22
Recommender
3-
===
3+
===========
44

55
The Recommender Operator utilizes advanced algorithms to provide personalized recommendations based on user behavior and preferences. This operator streamlines the data science workflow by automating the process of selecting the best recommendation algorithms, tuning hyperparameters, and extracting relevant features, ensuring that users receive the most relevant and effective suggestions for their needs.
66

0 commit comments

Comments
 (0)