diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 36016f2..3b5c394 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -96,7 +96,7 @@ def make_single_future_dataframe( ) return pd.DataFrame(date_rng, columns=[column_name]) -def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: +def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_quantity: int = 1) -> int: """ Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta Since the seasonality period is never more than half of the dataframe's timedelta, @@ -105,10 +105,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: :param df: pd.DataFrame of the historical data :param horizon: int number of time into the future for forecasting :param unit: frequency unit of the time series, which must be a pandas offset alias + :param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s + per time step in the dataframe. This is useful when the time series has a granularity that + spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data + follows a five-minute pattern). To make it backward compatible, defaults to 1. 
:return: horizon used for validation, in terms of the input `unit` """ MIN_HORIZONS = 4 # minimum number of horizons in the dataframe - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon * frequency_quantity try: if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max(): @@ -119,14 +123,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: # In order to calculate the validation horizon, we incrementally add offset # to the start time to the quarter of total timedelta. We did this since # pd.DateOffset does not support divide by operation. - unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) + timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity max_horizon = 0 cur_timestamp = df["ds"].min() - while cur_timestamp + unit_dateoffset <= df["ds"].max(): - cur_timestamp += unit_dateoffset + while cur_timestamp + timestep_dateoffset <= df["ds"].max(): + cur_timestamp += timestep_dateoffset max_horizon += 1 _logger.info(f"Horizon {horizon_dateoffset} too long relative to dataframe's " - f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*unit_dateoffset}.") + f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.") return max_horizon // MIN_HORIZONS def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index 84bd94e..3d9c519 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -88,6 +88,40 @@ def test_truncate_logs(self): validation_horizon = get_validation_horizon(df, 10, "D") self.assertIn("too long relative to dataframe's timedelta. 
Validation horizon will be reduced to", cm.output[0]) + def test_frequency_quantity(self): + # Only 5-, 10-, 15- and 30-minute frequencies are supported for now, so only those cases are tested. + # Add more test cases here when support for more frequencies is added. + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 5) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="5T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 5) + self.assertEqual(validation_horizon, 6) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="10T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 10) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="10T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 10) + self.assertEqual(validation_horizon, 3) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="15T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 15) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="15T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 15) + self.assertEqual(validation_horizon, 2) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="30T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 30) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="30T"), columns=["ds"]) + validation_horizon = 
get_validation_horizon(df, 10, "min", 30) + self.assertEqual(validation_horizon, 1) class TestGenerateCutoffs(unittest.TestCase):