from ads.opctl.operator.lowcode.common.utils import (
    find_output_dirname,
)
-from .const import ForecastOutputColumns
+from ads.opctl.operator.lowcode.common.const import DataColumns
from .model.forecast_datasets import ForecastDatasets
from .operator_config import ForecastOperatorConfig
-
+from pathlib import Path
+import pandas as pd


class ModelEvaluator:
    def __init__(self, models, k=5, subsample_ratio=0.20):
        self.models = models
        self.k = k
        self.subsample_ratio = subsample_ratio
+        self.minimum_sample_count = 5
+
+    def generate_cutoffs(self, unique_dates, horizon):
+        sorted_dates = np.sort(unique_dates)
+        train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(self.k)]
+        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 3]
+        if len(valid_train_window_size) < self.k:
+            logger.warning(f"Only {len(valid_train_window_size)} backtests can be created")
+        cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
+        return cut_offs

    def generate_k_fold_data(self, datasets: ForecastDatasets, date_col: str, horizon: int):
        historical_data = datasets.historical_data.data.reset_index()
-        series_col = ForecastOutputColumns.SERIES
+        series_col = DataColumns.Series
        group_counts = historical_data[series_col].value_counts()

-        sample_count = max(5, int(len(group_counts) * self.subsample_ratio))
+        sample_count = max(self.minimum_sample_count, int(len(group_counts) * self.subsample_ratio))
        sampled_groups = group_counts.head(sample_count)
        sampled_historical_data = historical_data[historical_data[series_col].isin(sampled_groups.index)]

        min_group = group_counts.idxmin()
        min_series_data = historical_data[historical_data[series_col] == min_group]
        unique_dates = min_series_data[date_col].unique()

-        sorted_dates = np.sort(unique_dates)
-        train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(self.k)]
-        valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 3]
-        if len(valid_train_window_size) < self.k:
-            logger.warn(f"Only ${valid_train_window_size} backtests can be created")
-
-        cut_offs = sorted_dates[-horizon - 1:-horizon * (self.k + 1):-horizon][:len(valid_train_window_size)]
+        cut_offs = self.generate_cutoffs(unique_dates, horizon)
        training_datasets = [sampled_historical_data[sampled_historical_data[date_col] <= cut_off_date] for cut_off_date
                             in cut_offs]
        test_datasets = [sampled_historical_data[sampled_historical_data[date_col] > cut_offs[0]]]
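To see what the cutoff arithmetic above produces, here is a small self-contained sketch with invented values (`k = 5`, `horizon = 3`, and the 20-point daily series are assumptions for illustration, not taken from this change):

```python
import numpy as np
import pandas as pd

k, horizon = 5, 3
# Hypothetical series of 20 daily timestamps standing in for unique_dates.
sorted_dates = np.sort(pd.date_range("2024-01-01", periods=20, freq="D").values)

# Training windows shrink by `horizon` points per fold: [17, 14, 11, 8, 5].
train_window_size = [len(sorted_dates) - (i + 1) * horizon for i in range(k)]
# Windows shorter than 3 * horizon = 9 points are dropped: [17, 14, 11].
valid = [ws for ws in train_window_size if ws >= horizon * 3]
# Step back from the end in strides of `horizon`, keeping one cutoff per
# valid window: the dates at indices 16, 13 and 10, newest first.
cut_offs = sorted_dates[-horizon - 1:-horizon * (k + 1):-horizon][:len(valid)]
print(cut_offs)  # three cutoff dates, each the last day of a training window
```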
@@ -54,35 +59,55 @@ def remove_none_values(self, obj):
        else:
            return obj

+    def create_operator_config(self, operator_config, backtest, model, historical_data, test_data):
+        output_dir = find_output_dirname(operator_config.spec.output_directory)
+        output_file_path = f'{output_dir}back_testing/{model}/{backtest}'
+        Path(output_file_path).mkdir(parents=True, exist_ok=True)
+        historical_data_url = f'{output_file_path}/historical.csv'
+        test_data_url = f'{output_file_path}/test.csv'
+        historical_data.to_csv(historical_data_url, index=False)
+        test_data.to_csv(test_data_url, index=False)
+        backtest_op_config_draft = operator_config.to_dict()
+        backtest_spec = backtest_op_config_draft["spec"]
+        backtest_spec["historical_data"]["url"] = historical_data_url
+        backtest_spec["test_data"]["url"] = test_data_url
+        backtest_spec["model"] = model
+        backtest_spec["output_directory"]["url"] = output_file_path
+        backtest_spec["target_category_columns"] = [DataColumns.Series]
+        backtest_spec.pop('additional_data', None)  # TODO: create additional data
+        cleaned_config = self.remove_none_values(backtest_op_config_draft)
+
+        backtest_op_config = ForecastOperatorConfig.from_dict(obj_dict=cleaned_config)
+        return backtest_op_config
+
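For reference, the directory layout this method creates would look roughly as follows (model name and backtest index are hypothetical; `metrics.csv` appears later, once `generate_report()` has run for that backtest):

```
<output_dir>back_testing/
└── prophet/              # one subtree per candidate model (name hypothetical)
    └── 0/                # one subtree per backtest index
        ├── historical.csv
        ├── test.csv
        └── metrics.csv   # written by the subsequent generate_report() call
```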
    def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
        date_col = operator_config.spec.datetime_column.name
        horizon = operator_config.spec.horizon
        cut_offs, train_sets, test_sets = self.generate_k_fold_data(datasets, date_col, horizon)
-
+        metrics = {}
        for model in self.models:
            from .model.factory import ForecastOperatorModelFactory
+            metrics[model] = {}
            for i in range(len(cut_offs)):
                backtest_historical_data = train_sets[i]
                backtest_test_data = test_sets[i]
-                output_dir = find_output_dirname(operator_config.spec.output_directory)
-                output_file_path = f'{output_dir}back_test/{i}'
-                from pathlib import Path
-                Path(output_file_path).mkdir(parents=True, exist_ok=True)
-                historical_data_url = f'{output_file_path}/historical.csv'
-                test_data_url = f'{output_file_path}/test.csv'
-                backtest_historical_data.to_csv(historical_data_url, index=False)
-                backtest_test_data.to_csv(test_data_url, index=False)
-                backtest_op_config_draft = operator_config.to_dict()
-                backtest_spec = backtest_op_config_draft["spec"]
-                backtest_spec["historical_data"]["url"] = historical_data_url
-                backtest_spec["test_data"]["url"] = test_data_url
-                backtest_spec["model"] = model
-                backtest_spec["output_directory"]["url"] = output_dir
-                cleaned_config = self.remove_none_values(backtest_op_config_draft)
-                backtest_op_cofig = ForecastOperatorConfig.from_dict(
-                    obj_dict=cleaned_config)
-                datasets = ForecastDatasets(backtest_op_cofig)
-
+                backtest_operator_config = self.create_operator_config(operator_config, i, model,
+                                                                       backtest_historical_data,
+                                                                       backtest_test_data)
+                datasets = ForecastDatasets(backtest_operator_config)
                ForecastOperatorModelFactory.get_model(
-                    operator_config, datasets
+                    backtest_operator_config, datasets
                ).generate_report()
+                metrics_df = pd.read_csv(f"{backtest_operator_config.spec.output_directory.url}/metrics.csv")
+                metrics_df["average_across_series"] = metrics_df.drop('metrics', axis=1).mean(axis=1)
+                metrics_average_dict = dict(zip(metrics_df['metrics'].str.lower(), metrics_df['average_across_series']))
+                metrics[model][i] = metrics_average_dict[operator_config.spec.metric]
+        return metrics
+
+    def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
+        metrics = self.run_all_models(datasets, operator_config)
+        avg_backtests_metrics = {key: sum(value.values()) / len(value.values()) for key, value in metrics.items()}
+        best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
+        logger.info(f"Among models {self.models}, {best_model} performed best during backtesting.")
+        return best_model
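To make the selection step concrete, here is a minimal sketch of how the `run_all_models` output feeds `find_best_model` (model names and scores are invented; a lower-is-better metric such as SMAPE is assumed, which is what the `min` implies):

```python
# Hypothetical per-model, per-backtest scores, shaped like run_all_models() output.
metrics = {
    "prophet": {0: 12.0, 1: 11.5, 2: 13.1},
    "arima": {0: 10.2, 1: 9.8, 2: 10.9},
}
# Average each model across its backtests, then pick the lowest mean error.
avg = {model: sum(scores.values()) / len(scores) for model, scores in metrics.items()}
best_model = min(avg, key=avg.get)  # -> "arima"
```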