fix comments

Lanz-db · Lanz-db · commit 7c5f97272304 · 2024-07-19T23:46:44.000-07:00
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -35,8 +35,8 @@ class ArimaEstimator:
     """
 
     def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_periods: List[int],
-                 num_folds: int = 20, max_steps: int = 150, exogenous_cols: Optional[List[str]] = None,
-                 split_cutoff: Optional[pd.Timestamp] = None) -> None:
+                 num_folds: int = 20, max_steps: int = 150, exogenous_cols: List[str] | None = None,
+                 split_cutoff: pd.Timestamp | None = None) -> None:
         """
         :param horizon: Number of periods to forecast forward
         :param frequency_unit: Frequency of the time series
@@ -46,6 +46,10 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri
         :param max_steps: Max steps for stepwise auto_arima
         :param exogenous_cols: Optional list of column names of exogenous variables. If provided, these columns are
         used as additional features in arima model.
+        :param split_cutoff: Optional cutoff specified by the user. If provided,
+        it is the starting point of the cutoffs for cross validation.
+        For a tuning job, it is the cutoff between the train and validate splits.
+        For a training job, it is the cutoff between the validate and test splits.
         """
         self._horizon = horizon
         self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit]
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
@@ -92,7 +92,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt
                  max_eval: int = 10, trial_timeout: int = None,
                  random_state: int = 0, is_parallel: bool = True,
                  regressors = None, 
-                 split_cutoff: Optional[pd.Timestamp] = None, **prophet_kwargs) -> None:
+                 split_cutoff: pd.Timestamp | None = None, **prophet_kwargs) -> None:
         """
         Initialization
 
@@ -109,6 +109,10 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt
         :param random_state: random seed for hyperopt
         :param is_parallel: Indicators to decide that whether run hyperopt in 
         :param regressors: list of column names of external regressors
+        :param split_cutoff: Optional cutoff specified by the user. If provided,
+        it is the starting point of the cutoffs for cross validation.
+        For a tuning job, it is the cutoff between the train and validate splits.
+        For a training job, it is the cutoff between the validate and test splits.
         :param prophet_kwargs: Optional keyword arguments for Prophet model.
             For information about the parameters see:
             `The Prophet source code <https://github.com/facebook/prophet/blob/master/python/prophet/forecaster.py>`_.
diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -195,21 +195,27 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
     :param df: pd.DataFrame of the historical data.
     :param horizon: int number of time into the future for forecasting.
     :param unit: frequency unit of the time series, which must be a pandas offset alias.
-    :param split_cutoff: the user-specified cutoff, as the starting point of cutoffs
+    :param split_cutoff: the user-specified cutoff, as the starting point of cutoffs.
+    For a tuning job, it is the cutoff between the train and validate splits.
+    For a training job, it is the cutoff between the validate and test splits.
     :return: list of pd.Timestamp cutoffs for cross-validation.
     """
     period = 1 
     period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period
     horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon
 
+    # The first cutoff is the user-specified cutoff between splits
     cutoff = split_cutoff
     result = [cutoff]
-    while result[-1] <= max(df["ds"]) - horizon_dateoffset:
+    max_cutoff = max(df["ds"]) - horizon_dateoffset
+    while result[-1] <= max_cutoff:
         cutoff += period_dateoffset
-        if not (((df["ds"] > cutoff) & (df["ds"] <= cutoff + horizon_dateoffset)).any()):
-            if cutoff < df["ds"].max():
-                closest_date = df[df["ds"] > cutoff].min()["ds"]
-                cutoff = closest_date - horizon_dateoffset
+        # If no data falls in the window (cutoff, cutoff + horizon_dateoffset] but later data still exists
+        if (not (((df["ds"] > cutoff) & (df["ds"] <= cutoff + horizon_dateoffset)).any())) and (cutoff < df["ds"].max()):
+            # Next cutoff point is "next date after cutoff in data - horizon_dateoffset"
+            closest_date = df[df["ds"] > cutoff].min()["ds"]
+            cutoff = closest_date - horizon_dateoffset
+        # Otherwise keep cutoff as is; the last cutoff appended (the one past max_cutoff) is dropped after the loop.
         result.append(cutoff) 
     result = result[:-1]
     return result
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
@@ -74,15 +74,18 @@ def test_fit_success_with_exogenous(self):
         self.assertIn("pickled_model", results_pd)
     
     def test_fit_success_with_split_cutoff(self):
-        arima_estimator = ArimaEstimator(horizon=1,
-                                         frequency_unit="d",
-                                         metric="smape",
-                                         seasonal_periods=[1, 7],
-                                         num_folds=2,
-                                         split_cutoff=pd.Timestamp('2020-07-17 00:00:00'))
-        results_pd = arima_estimator.fit(self.df)
-        self.assertIn("smape", results_pd)
-        self.assertIn("pickled_model", results_pd)
+        for freq, df, split_cutoff in [['d', self.df, '2020-07-17 00:00:00'], 
+                         ['d', self.df_string_time, '2020-07-17 00:00:00'], 
+                         ['month', self.df_monthly, '2020-09-07 00:00:00']]:
+            arima_estimator = ArimaEstimator(horizon=1,
+                                            frequency_unit=freq,
+                                            metric="smape",
+                                            seasonal_periods=[1, 7],
+                                            num_folds=2,
+                                            split_cutoff=pd.Timestamp(split_cutoff))
+            results_pd = arima_estimator.fit(df)
+            self.assertIn("smape", results_pd)
+            self.assertIn("pickled_model", results_pd)
 
     def test_fit_skip_too_long_seasonality(self):
         arima_estimator = ArimaEstimator(horizon=1,
diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
@@ -141,8 +141,15 @@ def test_training_with_extra_regressors(self):
         self.assertListEqual(model_json["extra_regressors"][0], ["f1", "f2"])
 
     def test_training_with_split_cutoff(self):
-        hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
-                                                  frequency_unit="d",
+        test_spaces = [['D', self.df, '2020-07-09 00:00:00'], 
+                       ['D', self.df_datetime_date, '2020-07-09 00:00:00'], 
+                       ['D', self.df_string_time, '2020-07-09 00:00:00'], 
+                       ['M', self.df_string_monthly_time, '2020-09-15 00:00:00'], 
+                       ['Q', self.df_string_quarterly_time, '2022-01-15 00:00:00'], 
+                       ['Y', self.df_string_annually_time, '2020-01-15 00:00:00']]
+        for freq, df, split_cutoff in test_spaces:
+            hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
+                                                  frequency_unit=freq,
                                                   metric="smape",
                                                   interval_width=0.8,
                                                   country_holidays="US",
@@ -151,9 +158,7 @@ def test_training_with_split_cutoff(self):
                                                   trial_timeout=1000,
                                                   random_state=0,
                                                   is_parallel=False,
-                                                  split_cutoff=pd.Timestamp('2020-07-10 00:00:00'))
-
-        for df in [self.df, self.df_datetime_date, self.df_string_time]:
+                                                  split_cutoff=pd.Timestamp(split_cutoff))
             results = hyperopt_estim.fit(df)
             self.assertAlmostEqual(results["mse"][0], 0)
             self.assertAlmostEqual(results["rmse"][0], 0, delta=1e-6)