Skip to content

Commit 7c5f972

Browse files
committed
fix comments
1 parent b51dd4a commit 7c5f972

File tree

5 files changed

+45
-23
lines changed

5 files changed

+45
-23
lines changed

runtime/databricks/automl_runtime/forecast/pmdarima/training.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ class ArimaEstimator:
3535
"""
3636

3737
def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_periods: List[int],
38-
num_folds: int = 20, max_steps: int = 150, exogenous_cols: Optional[List[str]] = None,
39-
split_cutoff: Optional[pd.Timestamp] = None) -> None:
38+
num_folds: int = 20, max_steps: int = 150, exogenous_cols: List[str] | None = None,
39+
split_cutoff: pd.Timestamp | None = None) -> None:
4040
"""
4141
:param horizon: Number of periods to forecast forward
4242
:param frequency_unit: Frequency of the time series
@@ -46,6 +46,10 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri
4646
:param max_steps: Max steps for stepwise auto_arima
4747
:param exogenous_cols: Optional list of column names of exogenous variables. If provided, these columns are
4848
used as additional features in arima model.
49+
:param split_cutoff: Optional cutoff specified by user. If provided,
50+
it is the starting point of cutoffs for cross validation.
51+
For tuning job, it is the cutoff between train and validate split.
52+
For training job, it is the cutoff between validate and test split.
4953
"""
5054
self._horizon = horizon
5155
self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit]

runtime/databricks/automl_runtime/forecast/prophet/forecast.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt
9292
max_eval: int = 10, trial_timeout: int = None,
9393
random_state: int = 0, is_parallel: bool = True,
9494
regressors = None,
95-
split_cutoff: Optional[pd.Timestamp] = None, **prophet_kwargs) -> None:
95+
split_cutoff: pd.Timestamp | None = None, **prophet_kwargs) -> None:
9696
"""
9797
Initialization
9898
@@ -109,6 +109,10 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt
109109
:param random_state: random seed for hyperopt
110110
:param is_parallel: Indicators to decide that whether run hyperopt in
111111
:param regressors: list of column names of external regressors
112+
:param split_cutoff: Optional cutoff specified by user. If provided,
113+
it is the starting point of cutoffs for cross validation.
114+
For tuning job, it is the cutoff between train and validate split.
115+
For training job, it is the cutoff between validate and test split.
112116
:param prophet_kwargs: Optional keyword arguments for Prophet model.
113117
For information about the parameters see:
114118
`The Prophet source code <https://github.com/facebook/prophet/blob/master/python/prophet/forecaster.py>`_.

runtime/databricks/automl_runtime/forecast/utils.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -195,21 +195,27 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
195195
:param df: pd.DataFrame of the historical data.
196196
:param horizon: int number of time periods into the future to forecast.
197197
:param unit: frequency unit of the time series, which must be a pandas offset alias.
198-
:param split_cutoff: the user-specified cutoff, as the starting point of cutoffs
198+
:param split_cutoff: the user-specified cutoff, as the starting point of cutoffs.
199+
For tuning job, it is the cutoff between train and validate split.
200+
For training job, it is the cutoff between validate and test split.
199201
:return: list of pd.Timestamp cutoffs for cross-validation.
200202
"""
201203
period = 1
202204
period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period
203205
horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon
204206

207+
# First cutoff is the cutoff between splits
205208
cutoff = split_cutoff
206209
result = [cutoff]
207-
while result[-1] <= max(df["ds"]) - horizon_dateoffset:
210+
max_cutoff = max(df["ds"]) - horizon_dateoffset
211+
while result[-1] <= max_cutoff:
208212
cutoff += period_dateoffset
209-
if not (((df["ds"] > cutoff) & (df["ds"] <= cutoff + horizon_dateoffset)).any()):
210-
if cutoff < df["ds"].max():
211-
closest_date = df[df["ds"] > cutoff].min()["ds"]
212-
cutoff = closest_date - horizon_dateoffset
213+
# If data does not exist in data range (cutoff, cutoff + horizon_dateoffset]
214+
if (not (((df["ds"] > cutoff) & (df["ds"] <= cutoff + horizon_dateoffset)).any())) and (cutoff < df["ds"].max()):
215+
# Next cutoff point is "next date after cutoff in data - horizon_dateoffset"
216+
closest_date = df[df["ds"] > cutoff].min()["ds"]
217+
cutoff = closest_date - horizon_dateoffset
218+
# else no data left, leave cutoff as is, it will be dropped.
213219
result.append(cutoff)
214220
result = result[:-1]
215221
return result

runtime/tests/automl_runtime/forecast/pmdarima/training_test.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,18 @@ def test_fit_success_with_exogenous(self):
7474
self.assertIn("pickled_model", results_pd)
7575

7676
def test_fit_success_with_split_cutoff(self):
77-
arima_estimator = ArimaEstimator(horizon=1,
78-
frequency_unit="d",
79-
metric="smape",
80-
seasonal_periods=[1, 7],
81-
num_folds=2,
82-
split_cutoff=pd.Timestamp('2020-07-17 00:00:00'))
83-
results_pd = arima_estimator.fit(self.df)
84-
self.assertIn("smape", results_pd)
85-
self.assertIn("pickled_model", results_pd)
77+
for freq, df, split_cutoff in [['d', self.df, '2020-07-17 00:00:00'],
78+
['d', self.df_string_time, '2020-07-17 00:00:00'],
79+
['month', self.df_monthly, '2020-09-07 00:00:00']]:
80+
arima_estimator = ArimaEstimator(horizon=1,
81+
frequency_unit=freq,
82+
metric="smape",
83+
seasonal_periods=[1, 7],
84+
num_folds=2,
85+
split_cutoff=pd.Timestamp(split_cutoff))
86+
results_pd = arima_estimator.fit(df)
87+
self.assertIn("smape", results_pd)
88+
self.assertIn("pickled_model", results_pd)
8689

8790
def test_fit_skip_too_long_seasonality(self):
8891
arima_estimator = ArimaEstimator(horizon=1,

runtime/tests/automl_runtime/forecast/prophet/forecast_test.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,15 @@ def test_training_with_extra_regressors(self):
141141
self.assertListEqual(model_json["extra_regressors"][0], ["f1", "f2"])
142142

143143
def test_training_with_split_cutoff(self):
144-
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
145-
frequency_unit="d",
144+
test_spaces = [['D', self.df, '2020-07-09 00:00:00'],
145+
['D', self.df_datetime_date, '2020-07-09 00:00:00'],
146+
['D', self.df_string_time, '2020-07-09 00:00:00'],
147+
['M', self.df_string_monthly_time, '2020-09-15 00:00:00'],
148+
['Q', self.df_string_quarterly_time, '2022-01-15 00:00:00'],
149+
['Y', self.df_string_annually_time, '2020-01-15 00:00:00']]
150+
for freq, df, split_cutoff in test_spaces:
151+
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
152+
frequency_unit=freq,
146153
metric="smape",
147154
interval_width=0.8,
148155
country_holidays="US",
@@ -151,9 +158,7 @@ def test_training_with_split_cutoff(self):
151158
trial_timeout=1000,
152159
random_state=0,
153160
is_parallel=False,
154-
split_cutoff=pd.Timestamp('2020-07-10 00:00:00'))
155-
156-
for df in [self.df, self.df_datetime_date, self.df_string_time]:
161+
split_cutoff=pd.Timestamp(split_cutoff))
157162
results = hyperopt_estim.fit(df)
158163
self.assertAlmostEqual(results["mse"][0], 0)
159164
self.assertAlmostEqual(results["rmse"][0], 0, delta=1e-6)

0 commit comments

Comments
 (0)