
Commit 71c3590

auto model evaluator added for forecasting operator
1 parent 53137d1 commit 71c3590

File tree

9 files changed: +106 additions, -80 deletions


ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 10 additions & 14 deletions
@@ -4,33 +4,29 @@
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

+import fsspec
+import numpy as np
 import os
+import pandas as pd
 import tempfile
 import time
 from abc import ABC, abstractmethod
-from typing import Tuple
-
-import fsspec
-import pandas as pd
-import numpy as np
 from sklearn import linear_model
+from typing import Tuple

+from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-
-from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec
-from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData
 from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics
-from ..const import SupportedModels
+from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
 from ads.opctl.operator.lowcode.common.utils import (
     human_time_friendly,
     enable_print,
     disable_print,
     write_data,
-    merge_category_columns,
-    find_output_dirname,
 )
-from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
-from ads.common.object_storage_details import ObjectStorageDetails
+from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData
+from ..const import SupportedModels
+from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec


 class AnomalyOperatorBaseModel(ABC):

@@ -246,7 +242,7 @@ def _save_report(
         """Saves resulting reports to the given folder."""
         import report_creator as rc

-        unique_output_dir = find_output_dirname(self.spec.output_directory)
+        unique_output_dir = self.spec.output_directory.url

         if ObjectStorageDetails.is_oci_path(unique_output_dir):
             storage_options = default_signer()

ads/opctl/operator/lowcode/common/utils.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def human_time_friendly(seconds):


 def find_output_dirname(output_dir: OutputDirectory):
-    if output_dir:
+    if output_dir and output_dir.url:
         return output_dir.url
     output_dir = "results"

ads/opctl/operator/lowcode/forecast/__main__.py

Lines changed: 3 additions & 18 deletions
@@ -24,24 +24,9 @@ def operate(operator_config: ForecastOperatorConfig) -> None:
     from .model.factory import ForecastOperatorModelFactory

     datasets = ForecastDatasets(operator_config)
-    try:
-        ForecastOperatorModelFactory.get_model(
-            operator_config, datasets
-        ).generate_report()
-    except Exception as e:
-        if operator_config.spec.model == "auto":
-            logger.debug(
-                f"Failed to forecast with error {e.args}. Trying again with model `prophet`."
-            )
-            operator_config.spec.model = "prophet"
-            operator_config.spec.model_kwargs = dict()
-            datasets = ForecastDatasets(operator_config)
-            ForecastOperatorModelFactory.get_model(
-                operator_config, datasets
-            ).generate_report()
-        else:
-            raise
-
+    ForecastOperatorModelFactory.get_model(
+        operator_config, datasets
+    ).generate_report()

 def verify(spec: Dict, **kwargs: Dict) -> bool:
     """Verifies the forecasting operator config."""

ads/opctl/operator/lowcode/forecast/model/base_model.py

Lines changed: 31 additions & 22 deletions
@@ -4,31 +4,19 @@
 # Copyright (c) 2023, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

-import json
+import fsspec
+import numpy as np
 import os
+import pandas as pd
 import tempfile
 import time
+import traceback
 from abc import ABC, abstractmethod
 from typing import Tuple
-import traceback
-
-import fsspec
-import numpy as np
-import pandas as pd

-from ads.opctl.operator.lowcode.forecast.utils import (
-    default_signer,
-    evaluate_train_metrics,
-    get_forecast_plots,
-    _build_metrics_df,
-    _build_metrics_per_horizon,
-    load_pkl,
-    write_pkl,
-    _label_encode_dataframe,
-)
+from ads.common.decorator.runtime_dependency import runtime_dependency
 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-
 from ads.opctl.operator.lowcode.common.utils import (
     human_time_friendly,
     enable_print,

@@ -37,18 +25,27 @@
     merged_category_column_name,
     datetime_to_seconds,
     seconds_to_datetime,
-    find_output_dirname,
 )
+from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData
+from ads.opctl.operator.lowcode.forecast.utils import (
+    default_signer,
+    evaluate_train_metrics,
+    get_forecast_plots,
+    get_auto_select_plot,
+    _build_metrics_df,
+    _build_metrics_per_horizon,
+    load_pkl,
+    write_pkl,
+    _label_encode_dataframe,
+)
+from .forecast_datasets import ForecastDatasets
 from ..const import (
     SUMMARY_METRICS_HORIZON_LIMIT,
     SupportedMetrics,
     SupportedModels,
     SpeedAccuracyMode,
 )
 from ..operator_config import ForecastOperatorConfig, ForecastOperatorSpec
-from ads.common.decorator.runtime_dependency import runtime_dependency
-from .forecast_datasets import ForecastDatasets, ForecastOutput
-from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData


 class ForecastOperatorBaseModel(ABC):

@@ -250,6 +247,17 @@ def generate_report(self):
             sec9 = rc.DataTable(self.eval_metrics, index=True)
             train_metrics_sections = [sec9_text, sec9]

+        backtest_sections = []
+        if self.spec.model == "auto-select":
+            output_dir = self.spec.output_directory.url
+            backtest_report_name = "backtest_stats.csv"
+            backtest_stats = pd.read_csv(f"{output_dir}/{backtest_report_name}")
+            backtest_text = rc.Heading("Back Testing Metrics", level=2)
+            backtest_table = rc.DataTable(backtest_stats, index=True)
+            liner_plot = get_auto_select_plot(backtest_stats)
+            backtest_sections = [backtest_text, backtest_table, liner_plot]
+
+
         forecast_plots = []
         if len(self.forecast_output.list_series_ids()) > 0:
             forecast_text = rc.Heading(

@@ -276,6 +284,7 @@ def generate_report(self):
         yaml_appendix = rc.Yaml(self.config.to_dict())
         report_sections = (
             [summary]
+            + backtest_sections
             + forecast_plots
             + other_sections
             + test_metrics_sections

@@ -409,7 +418,7 @@ def _save_report(
         """Saves resulting reports to the given folder."""
         import report_creator as rc

-        unique_output_dir = find_output_dirname(self.spec.output_directory)
+        unique_output_dir = self.spec.output_directory.url

         if ObjectStorageDetails.is_oci_path(unique_output_dir):
             storage_options = default_signer()

ads/opctl/operator/lowcode/forecast/model/factory.py

Lines changed: 2 additions & 3 deletions
@@ -60,7 +60,7 @@ def get_model(
             In case of not supported model.
         """
         model_type = operator_config.spec.model
-        if model_type == "auto":
+        if model_type == "auto-select":
            model_type = cls.auto_select_model(datasets, operator_config)
        if model_type not in cls._MAP:
            raise UnSupportedModelError(model_type)

@@ -89,5 +89,4 @@ def auto_select_model(
         from ..model_evaluator import ModelEvaluator
         all_models = cls._MAP.keys()
         model_evaluator = ModelEvaluator(all_models)
-        best_model = model_evaluator.find_best_model(datasets, operator_config)
-        return cls._MAP[best_model]
+        return model_evaluator.find_best_model(datasets, operator_config)
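
Taken together with the __main__.py simplification above, the dispatch path is now linear: for "auto-select" the factory asks the ModelEvaluator for the best backtested model before building it. A minimal usage sketch (not part of this commit; it assumes an already-parsed operator_config and uses only names that appear in the diff):

# Sketch only: exercising the new auto-select path, assuming `operator_config`
# was loaded from the operator YAML elsewhere.
from ads.opctl.operator.lowcode.forecast.model.factory import ForecastOperatorModelFactory
from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ForecastDatasets

operator_config.spec.model = "auto-select"  # new default, see schema.yaml below
datasets = ForecastDatasets(operator_config)

# get_model() now calls ModelEvaluator.find_best_model() when model == "auto-select",
# backtests every registered model, and returns the winner ready for reporting.
ForecastOperatorModelFactory.get_model(operator_config, datasets).generate_report()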

ads/opctl/operator/lowcode/forecast/model_evaluator.py

Lines changed: 37 additions & 19 deletions
@@ -5,15 +5,14 @@


 import numpy as np
+import pandas as pd
+from pathlib import Path
+
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.common.utils import (
-    find_output_dirname,
-)
 from ads.opctl.operator.lowcode.common.const import DataColumns
 from .model.forecast_datasets import ForecastDatasets
 from .operator_config import ForecastOperatorConfig
-from pathlib import Path
-import pandas as pd
+

 class ModelEvaluator:
     def __init__(self, models, k=5, subsample_ratio=0.20):

@@ -31,7 +30,9 @@ def generate_cutoffs(self, unique_dates, horizon):
         cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
         return cut_offs

-    def generate_k_fold_data(self, datasets: ForecastDatasets, date_col: str, horizon: int):
+    def generate_k_fold_data(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
+        date_col = operator_config.spec.datetime_column.name
+        horizon = operator_config.spec.horizon
         historical_data = datasets.historical_data.data.reset_index()
         series_col = DataColumns.Series
         group_counts = historical_data[series_col].value_counts()

@@ -51,63 +52,80 @@ def generate_k_fold_data(self, datasets: ForecastDatasets, date_col: str, horizon: int):
         for i, current in enumerate(cut_offs[1:]):
             test_datasets.append(sampled_historical_data[(current < sampled_historical_data[date_col]) & (
                     sampled_historical_data[date_col] <= cut_offs[i])])
-        return cut_offs, training_datasets, test_datasets
+        all_additional = datasets.additional_data.data.reset_index()
+        sampled_additional_data = all_additional[all_additional[series_col].isin(sampled_groups.index)]
+        max_historical_date = sampled_historical_data[date_col].max()
+        additional_data = [sampled_additional_data[sampled_additional_data[date_col] <= max_historical_date]]
+        for cut_off in cut_offs[:-1]:
+            trimmed_additional_data = sampled_additional_data[sampled_additional_data[date_col] <= cut_off]
+            additional_data.append(trimmed_additional_data)
+        return cut_offs, training_datasets, additional_data, test_datasets

     def remove_none_values(self, obj):
         if isinstance(obj, dict):
             return {k: self.remove_none_values(v) for k, v in obj.items() if k is not None and v is not None}
         else:
             return obj

-    def create_operator_config(self, operator_config, backtest, model, historical_data, test_data):
-        output_dir = find_output_dirname(operator_config.spec.output_directory)
-        output_file_path = f'{output_dir}back_testing/{model}/{backtest}'
+    def create_operator_config(self, operator_config, backtest, model, historical_data, additional_data, test_data):
+        output_dir = operator_config.spec.output_directory.url
+        output_file_path = f'{output_dir}/back_testing/{model}/{backtest}'
         Path(output_file_path).mkdir(parents=True, exist_ok=True)
         historical_data_url = f'{output_file_path}/historical.csv'
+        additional_data_url = f'{output_file_path}/additional.csv'
         test_data_url = f'{output_file_path}/test.csv'
         historical_data.to_csv(historical_data_url, index=False)
+        additional_data.to_csv(additional_data_url, index=False)
         test_data.to_csv(test_data_url, index=False)
         backtest_op_config_draft = operator_config.to_dict()
         backtest_spec = backtest_op_config_draft["spec"]
         backtest_spec["historical_data"]["url"] = historical_data_url
+        backtest_spec["additional_data"]["url"] = additional_data_url
         backtest_spec["test_data"]["url"] = test_data_url
         backtest_spec["model"] = model
-        backtest_spec["output_directory"]["url"] = output_file_path
+        backtest_spec["output_directory"] = {"url": output_file_path}
         backtest_spec["target_category_columns"] = [DataColumns.Series]
-        backtest_spec.pop('additional_data', None) # todo create additional data
+        backtest_spec['generate_explanations'] = False
         cleaned_config = self.remove_none_values(backtest_op_config_draft)

         backtest_op_config = ForecastOperatorConfig.from_dict(
             obj_dict=cleaned_config)
         return backtest_op_config

     def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
-        date_col = operator_config.spec.datetime_column.name
-        horizon = operator_config.spec.horizon
-        cut_offs, train_sets, test_sets = self.generate_k_fold_data(datasets, date_col, horizon)
+        cut_offs, train_sets, additional_data, test_sets = self.generate_k_fold_data(datasets, operator_config)
         metrics = {}
         for model in self.models:
             from .model.factory import ForecastOperatorModelFactory
             metrics[model] = {}
             for i in range(len(cut_offs)):
                 backtest_historical_data = train_sets[i]
+                backtest_additional_data = additional_data[i]
                 backtest_test_data = test_sets[i]
                 backtest_operator_config = self.create_operator_config(operator_config, i, model,
                                                                        backtest_historical_data,
+                                                                       backtest_additional_data,
                                                                        backtest_test_data)
                 datasets = ForecastDatasets(backtest_operator_config)
                 ForecastOperatorModelFactory.get_model(
                     backtest_operator_config, datasets
                 ).generate_report()
-                metrics_df = pd.read_csv(f"{backtest_operator_config.spec.output_directory.url}/metrics.csv")
-                metrics_df["average_accross_series"] = metrics_df.drop('metrics', axis=1).mean(axis=1)
-                metrics_average_dict = dict(zip(metrics_df['metrics'].str.lower(), metrics_df['average_accross_series']))
+                test_metrics_filename = backtest_operator_config.spec.test_metrics_filename
+                metrics_df = pd.read_csv(
+                    f"{backtest_operator_config.spec.output_directory.url}/{test_metrics_filename}")
+                metrics_df["average_across_series"] = metrics_df.drop('metrics', axis=1).mean(axis=1)
+                metrics_average_dict = dict(zip(metrics_df['metrics'].str.lower(), metrics_df['average_across_series']))
                 metrics[model][i] = metrics_average_dict[operator_config.spec.metric]
         return metrics

     def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
         metrics = self.run_all_models(datasets, operator_config)
-        avg_backtests_metrics = {key : sum(value.values()) / len(value.values()) for key, value in metrics.items()}
+        avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
         best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
         logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")
+        backtest_stats = pd.DataFrame(metrics).rename_axis('backtest')
+        backtest_stats.reset_index(inplace=True)
+        output_dir = operator_config.spec.output_directory.url
+        backtest_report_name = "backtest_stats.csv"
+        backtest_stats.to_csv(f"{output_dir}/{backtest_report_name}", index=False)
         return best_model
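
For orientation, run_all_models() returns a nested dict mapping each candidate model to its per-fold score, and find_best_model() reshapes that into the backtest_stats.csv that generate_report() later reads. A small illustrative example, with made-up metric values:

# Illustrative only: how {model: {fold: score}} becomes backtest_stats.csv.
import pandas as pd

metrics = {
    "prophet": {0: 12.3, 1: 11.8, 2: 13.1},  # hypothetical sMAPE per backtest fold
    "arima": {0: 14.0, 1: 13.2, 2: 15.4},
}

backtest_stats = pd.DataFrame(metrics).rename_axis("backtest")
backtest_stats.reset_index(inplace=True)
#    backtest  prophet  arima
# 0         0     12.3   14.0
# 1         1     11.8   13.2
# 2         2     13.1   15.4

avg = {m: sum(v.values()) / len(v.values()) for m, v in metrics.items()}
best_model = min(avg, key=avg.get)  # "prophet" here, since a lower score wins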

ads/opctl/operator/lowcode/forecast/operator_config.py

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,7 @@

 from .const import SupportedMetrics, SpeedAccuracyMode
 from .const import SupportedModels
-
+from ads.opctl.operator.lowcode.common.utils import find_output_dirname

 @dataclass(repr=True)
 class TestData(InputData):

@@ -90,6 +90,7 @@ class ForecastOperatorSpec(DataClassSerializable):

     def __post_init__(self):
         """Adjusts the specification details."""
+        self.output_directory = self.output_directory or OutputDirectory(url=find_output_dirname(self.output_directory))
         self.metric = (self.metric or "").lower() or SupportedMetrics.SMAPE.lower()
         self.model = self.model or SupportedModels.Auto
         self.confidence_interval_width = self.confidence_interval_width or 0.80

ads/opctl/operator/lowcode/forecast/schema.yaml

Lines changed: 2 additions & 2 deletions
@@ -365,14 +365,14 @@ spec:
   model:
     type: string
     required: false
-    default: auto
+    default: auto-select
     allowed:
       - prophet
       - arima
       - neuralprophet
       - automlx
       - autots
-      - auto
+      - auto-select

   model_kwargs:
     type: dict

ads/opctl/operator/lowcode/forecast/utils.py

Lines changed: 18 additions & 0 deletions
@@ -265,6 +265,24 @@ def _select_plot_list(fn, series_ids):
 def _add_unit(num, unit):
     return f"{num} {unit}"

+def get_auto_select_plot(backtest_results):
+    fig = go.Figure()
+    columns = backtest_results.columns.tolist()
+    back_test_column = "backtest"
+    columns.remove(back_test_column)
+    for i, column in enumerate(columns):
+        color = 0 #int(i * 255 / len(columns))
+        fig.add_trace(
+            go.Scatter(
+                x=backtest_results[back_test_column],
+                y=backtest_results[column],
+                mode="lines",
+                name=column,
+            ))
+
+    import report_creator as rc
+    return rc.Widget(fig)
+

 def get_forecast_plots(
     forecast_output,
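
A hypothetical usage sketch for the new helper, assuming a backtest_stats.csv produced by ModelEvaluator.find_best_model(); the path and column names below are illustrative:

# Sketch: render one line per candidate model across backtest folds.
import pandas as pd
from ads.opctl.operator.lowcode.forecast.utils import get_auto_select_plot

backtest_stats = pd.read_csv("results/backtest_stats.csv")  # columns: backtest, <one per model>
widget = get_auto_select_plot(backtest_stats)  # report_creator Widget wrapping a plotly figure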
