
Commit 53137d1

auto model evaluator added for forecasting tasks

1 parent 07f0280 commit 53137d1

File tree

5 files changed: +90 -76 lines changed


ads/opctl/operator/lowcode/common/transformations.py

Lines changed: 2 additions & 1 deletion

@@ -97,7 +97,8 @@ def _set_series_id_column(self, df):
         for value in merged_values:
             self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
 
-        df = df.drop(self.target_category_columns, axis=1)
+        if self.target_category_columns != [DataColumns.Series]:
+            df = df.drop(self.target_category_columns, axis=1)
         return df
 
     def _format_datetime_col(self, df):
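Context for the guard: when the operator synthesizes the Series column from the user's target_category_columns, the raw category columns are dropped afterwards; if the only category column already is the Series column, the old unconditional drop would have removed the series identifier itself. A minimal sketch of the guarded behavior (the DataFrame and the one-field DataColumns stand-in below are hypothetical):

    import pandas as pd

    class DataColumns:
        # Stand-in for ads.opctl.operator.lowcode.common.const.DataColumns
        Series = "Series"

    def drop_category_columns(df, target_category_columns):
        # Drop the raw category columns only when they are not the
        # synthesized Series column itself.
        if target_category_columns != [DataColumns.Series]:
            df = df.drop(target_category_columns, axis=1)
        return df

    df = pd.DataFrame({"Series": ["a", "b"], "target": [1, 2]})
    # The Series column survives when it is the only category column:
    print(drop_category_columns(df, ["Series"]).columns.tolist())  # ['Series', 'target']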

ads/opctl/operator/lowcode/forecast/model/factory.py

Lines changed: 27 additions & 2 deletions

@@ -12,7 +12,6 @@
 from .base_model import ForecastOperatorBaseModel
 from .neuralprophet import NeuralProphetOperatorModel
 from .prophet import ProphetOperatorModel
-from ..utils import select_auto_model
 from .forecast_datasets import ForecastDatasets
 
 class UnSupportedModelError(Exception):
@@ -62,7 +61,33 @@ def get_model(
         """
         model_type = operator_config.spec.model
         if model_type == "auto":
-            model_type = select_auto_model(datasets, operator_config)
+            model_type = cls.auto_select_model(datasets, operator_config)
         if model_type not in cls._MAP:
             raise UnSupportedModelError(model_type)
         return cls._MAP[model_type](config=operator_config, datasets=datasets)
+
+    @classmethod
+    def auto_select_model(
+        cls, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig
+    ) -> str:
+        """
+        Selects the best model by backtesting every registered model
+        and comparing the configured metric.
+
+        Parameters
+        ------------
+        datasets: ForecastDatasets
+            Datasets for predictions
+
+        Returns
+        --------
+        str
+            The type of the model.
+        """
+        from ..model_evaluator import ModelEvaluator
+        all_models = cls._MAP.keys()
+        model_evaluator = ModelEvaluator(all_models)
+        best_model = model_evaluator.find_best_model(datasets, operator_config)
+        return best_model
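End to end, a spec with model: "auto" now routes through this backtesting-based selection. A hedged sketch of the call path (the spec values are made up; only the classes and methods appearing in this commit are assumed):

    from ads.opctl.operator.lowcode.forecast.operator_config import ForecastOperatorConfig
    from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import ForecastDatasets
    from ads.opctl.operator.lowcode.forecast.model.factory import ForecastOperatorModelFactory

    # Keys mirror the spec fields touched in this commit; values are hypothetical.
    spec = {
        "historical_data": {"url": "data/historical.csv"},
        "datetime_column": {"name": "ds"},
        "horizon": 7,
        "metric": "smape",
        "model": "auto",
        "output_directory": {"url": "results/"},
    }
    operator_config = ForecastOperatorConfig.from_dict(obj_dict={"spec": spec})
    datasets = ForecastDatasets(operator_config)

    # "auto" triggers auto_select_model, which backtests every entry in
    # ForecastOperatorModelFactory._MAP and keeps the best-scoring one.
    ForecastOperatorModelFactory.get_model(operator_config, datasets).generate_report()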

ads/opctl/operator/lowcode/forecast/model_evaluator.py

Lines changed: 57 additions & 32 deletions

@@ -9,37 +9,42 @@
 from ads.opctl.operator.lowcode.common.utils import (
     find_output_dirname,
 )
-from .const import ForecastOutputColumns
+from ads.opctl.operator.lowcode.common.const import DataColumns
 from .model.forecast_datasets import ForecastDatasets
 from .operator_config import ForecastOperatorConfig
-
+from pathlib import Path
+import pandas as pd
 
 class ModelEvaluator:
     def __init__(self, models, k=5, subsample_ratio=0.20):
         self.models = models
         self.k = k
         self.subsample_ratio = subsample_ratio
+        self.minimum_sample_count = 5
+
+    def generate_cutoffs(self, unique_dates, horizon):
+        sorted_dates = np.sort(unique_dates)
+        train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(self.k)]
+        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 3]
+        if len(valid_train_window_size) < self.k:
+            logger.warning(f"Only {len(valid_train_window_size)} backtests can be created")
+        cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
+        return cut_offs
 
     def generate_k_fold_data(self, datasets: ForecastDatasets, date_col: str, horizon: int):
         historical_data = datasets.historical_data.data.reset_index()
-        series_col = ForecastOutputColumns.SERIES
+        series_col = DataColumns.Series
         group_counts = historical_data[series_col].value_counts()
 
-        sample_count = max(5, int(len(group_counts) * self.subsample_ratio))
+        sample_count = max(self.minimum_sample_count, int(len(group_counts) * self.subsample_ratio))
         sampled_groups = group_counts.head(sample_count)
         sampled_historical_data = historical_data[historical_data[series_col].isin(sampled_groups.index)]
 
         min_group = group_counts.idxmin()
         min_series_data = historical_data[historical_data[series_col] == min_group]
         unique_dates = min_series_data[date_col].unique()
 
-        sorted_dates = np.sort(unique_dates)
-        train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(self.k)]
-        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 3]
-        if len(valid_train_window_size) < self.k:
-            logger.warn(f"Only ${valid_train_window_size} backtests can be created")
-
-        cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
+        cut_offs = self.generate_cutoffs(unique_dates, horizon)
         training_datasets = [sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date] for cut_off_date
                              in cut_offs]
         test_datasets = [sampled_historical_data[sampled_historical_data[date_col] > cut_offs[0]]]
@@ -54,35 +59,55 @@ def remove_none_values(self, obj):
         else:
             return obj
 
+    def create_operator_config(self, operator_config, backtest, model, historical_data, test_data):
+        output_dir = find_output_dirname(operator_config.spec.output_directory)
+        output_file_path = f'{output_dir}back_testing/{model}/{backtest}'
+        Path(output_file_path).mkdir(parents=True, exist_ok=True)
+        historical_data_url = f'{output_file_path}/historical.csv'
+        test_data_url = f'{output_file_path}/test.csv'
+        historical_data.to_csv(historical_data_url, index=False)
+        test_data.to_csv(test_data_url, index=False)
+        backtest_op_config_draft = operator_config.to_dict()
+        backtest_spec = backtest_op_config_draft["spec"]
+        backtest_spec["historical_data"]["url"] = historical_data_url
+        backtest_spec["test_data"]["url"] = test_data_url
+        backtest_spec["model"] = model
+        backtest_spec["output_directory"]["url"] = output_file_path
+        backtest_spec["target_category_columns"] = [DataColumns.Series]
+        backtest_spec.pop('additional_data', None)  # TODO: create additional data
+        cleaned_config = self.remove_none_values(backtest_op_config_draft)
+
+        backtest_op_config = ForecastOperatorConfig.from_dict(
+            obj_dict=cleaned_config)
+        return backtest_op_config
+
     def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
         date_col = operator_config.spec.datetime_column.name
         horizon = operator_config.spec.horizon
         cut_offs, train_sets, test_sets = self.generate_k_fold_data(datasets, date_col, horizon)
-
+        metrics = {}
         for model in self.models:
             from .model.factory import ForecastOperatorModelFactory
+            metrics[model] = {}
             for i in range(len(cut_offs)):
                 backtest_historical_data = train_sets[i]
                 backtest_test_data = test_sets[i]
-                output_dir = find_output_dirname(operator_config.spec.output_directory)
-                output_file_path = f'{output_dir}back_test/{i}'
-                from pathlib import Path
-                Path(output_file_path).mkdir(parents=True, exist_ok=True)
-                historical_data_url = f'{output_file_path}/historical.csv'
-                test_data_url = f'{output_file_path}/test.csv'
-                backtest_historical_data.to_csv(historical_data_url, index=False)
-                backtest_test_data.to_csv(test_data_url, index=False)
-                backtest_op_config_draft = operator_config.to_dict()
-                backtest_spec = backtest_op_config_draft["spec"]
-                backtest_spec["historical_data"]["url"] = historical_data_url
-                backtest_spec["test_data"]["url"] = test_data_url
-                backtest_spec["model"] = model
-                backtest_spec["output_directory"]["url"] = output_dir
-                cleaned_config = self.remove_none_values(backtest_op_config_draft)
-                backtest_op_cofig = ForecastOperatorConfig.from_dict(
-                    obj_dict=cleaned_config)
-                datasets = ForecastDatasets(backtest_op_cofig)
-
+                backtest_operator_config = self.create_operator_config(operator_config, i, model,
+                                                                       backtest_historical_data,
+                                                                       backtest_test_data)
+                datasets = ForecastDatasets(backtest_operator_config)
                 ForecastOperatorModelFactory.get_model(
-                    operator_config, datasets
+                    backtest_operator_config, datasets
                 ).generate_report()
+                metrics_df = pd.read_csv(f"{backtest_operator_config.spec.output_directory.url}/metrics.csv")
+                metrics_df["average_across_series"] = metrics_df.drop('metrics', axis=1).mean(axis=1)
+                metrics_average_dict = dict(zip(metrics_df['metrics'].str.lower(), metrics_df['average_across_series']))
+                metrics[model][i] = metrics_average_dict[operator_config.spec.metric]
+        return metrics
+
+    def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
+        metrics = self.run_all_models(datasets, operator_config)
+        avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
+        best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
+        logger.info(f"Among models {self.models}, {best_model} shows the best performance during backtesting.")
+        return best_model
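The backwards slice in generate_cutoffs is the core of the fold construction. A worked sketch with hypothetical numbers, 60 daily dates with k=3 and horizon=7, showing which cutoffs survive the ws >= horizon * 3 filter:

    import numpy as np
    import pandas as pd

    k, horizon = 3, 7
    unique_dates = pd.date_range("2024-01-01", periods=60, freq="D").values

    sorted_dates = np.sort(unique_dates)
    # Training window length for each prospective backtest: [53, 46, 39].
    train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(k)]
    # Keep a backtest only if its training window spans >= 3 horizons (21 days).
    valid = [ws for ws in train_window_size if ws >= horizon * 3]
    # Walk back from the end in steps of `horizon`: indices 52, 45, 38,
    # i.e. cutoffs 2024-02-22, 2024-02-15 and 2024-02-08.
    cut_offs = sorted_dates[-horizon - 1:-horizon * (k + 1):-horizon][:len(valid)]
    print(cut_offs)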

ads/opctl/operator/lowcode/forecast/schema.yaml

Lines changed: 4 additions & 0 deletions

@@ -413,4 +413,8 @@ spec:
           - RMSE
           - MSE
           - SMAPE
+          - mape
+          - rmse
+          - mse
+          - smape
     type: dict
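The lowercase variants exist because the model evaluator lowercases the metric labels it reads from metrics.csv and then indexes the result with spec.metric verbatim, so an uppercase spec value would miss the lookup. A toy sketch of that lookup (the metrics.csv contents below are hypothetical):

    import pandas as pd

    metrics_df = pd.DataFrame({"metrics": ["SMAPE", "RMSE"], "series_a": [0.1, 2.0]})
    metrics_df["average_across_series"] = metrics_df.drop("metrics", axis=1).mean(axis=1)
    lookup = dict(zip(metrics_df["metrics"].str.lower(), metrics_df["average_across_series"]))

    spec_metric = "smape"       # "SMAPE" would raise a KeyError here
    print(lookup[spec_metric])  # 0.1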

ads/opctl/operator/lowcode/forecast/utils.py

Lines changed: 0 additions & 41 deletions

@@ -19,7 +19,6 @@
     mean_absolute_percentage_error,
     mean_squared_error,
 )
-
 try:
     from scipy.stats import linregress
 except:
@@ -34,7 +33,6 @@
 from .operator_config import ForecastOperatorSpec, ForecastOperatorConfig
 from ads.opctl.operator.lowcode.common.utils import merge_category_columns
 from ads.opctl.operator.lowcode.forecast.const import ForecastOutputColumns
-
 # from ads.opctl.operator.lowcode.forecast.model.forecast_datasets import TestData, ForecastOutput
 
 
@@ -371,45 +369,6 @@ def plot_forecast_plotly(s_id):
 
     return _select_plot_list(plot_forecast_plotly, forecast_output.list_series_ids())
 
-
-def select_auto_model(
-    datasets: "ForecastDatasets", operator_config: ForecastOperatorConfig
-) -> str:
-    """
-    Selects AutoMLX or Arima model based on column count.
-
-    If the number of columns is less than or equal to the maximum allowed for AutoMLX,
-    returns 'AutoMLX'. Otherwise, returns 'Arima'.
-
-    Parameters
-    ------------
-    datasets: ForecastDatasets
-        Datasets for predictions
-
-    Returns
-    --------
-    str
-        The type of the model.
-    """
-    freq_in_secs = datasets.get_datetime_frequency_in_seconds()
-    num_of_additional_cols = len(datasets.get_additional_data_column_names())
-    row_count = datasets.get_num_rows()
-    number_of_series = len(datasets.list_series_ids())
-    if (
-        num_of_additional_cols < 15
-        and row_count < 10000
-        and number_of_series < 10
-        and freq_in_secs > 3600
-    ):
-        return SupportedModels.AutoMLX
-    elif row_count < 10000 and number_of_series > 10:
-        return SupportedModels.AutoTS
-    elif row_count > 20000:
-        return SupportedModels.NeuralProphet
-    else:
-        return SupportedModels.NeuralProphet
-
-
 def convert_target(target: str, target_col: str):
     """
     Removes the target_column that got appended to target.
