From fe2deaeab1668180b5875f312488810fd34117d6 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 4 Feb 2025 16:50:43 -0800 Subject: [PATCH 1/2] init --- .../automl_runtime/forecast/utils.py | 10 ++++-- .../automl_runtime/forecast/utils_test.py | 32 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 36016f26..8be2d2c1 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -96,7 +96,7 @@ def make_single_future_dataframe( ) return pd.DataFrame(date_rng, columns=[column_name]) -def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: +def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_quantity: int = 1) -> int: """ Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta Since the seasonality period is never more than half of the dataframe's timedelta, @@ -105,10 +105,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: :param df: pd.DataFrame of the historical data :param horizon: int number of time into the future for forecasting :param unit: frequency unit of the time series, which must be a pandas offset alias + :param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s + per time step in the dataframe. This is useful when the time series has a granularity that + spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data + follows a five-minute pattern). To make it backward compatible, defaults to 1. 
:return: horizon used for validation, in terms of the input `unit` """ MIN_HORIZONS = 4 # minimum number of horizons in the dataframe - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon * frequency_quantity try: if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max(): @@ -119,7 +123,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int: # In order to calculate the validation horizon, we incrementally add offset # to the start time to the quarter of total timedelta. We did this since # pd.DateOffset does not support divide by operation. - unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) + unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity max_horizon = 0 cur_timestamp = df["ds"].min() while cur_timestamp + unit_dateoffset <= df["ds"].max(): diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index 84bd94e0..b6d79907 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -88,6 +88,38 @@ def test_truncate_logs(self): validation_horizon = get_validation_horizon(df, 10, "D") self.assertIn("too long relative to dataframe's timedelta. 
Validation horizon will be reduced to", cm.output[0]) + def test_frequency_quantity(self): + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 5) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="5T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 5) + self.assertEqual(validation_horizon, 6) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="10T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 10) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="10T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 10) + self.assertEqual(validation_horizon, 3) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="15T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 15) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="15T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 15) + self.assertEqual(validation_horizon, 2) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="30T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 30) + self.assertEqual(validation_horizon, 10) + + df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="30T"), columns=["ds"]) + validation_horizon = get_validation_horizon(df, 10, "min", 30) + self.assertEqual(validation_horizon, 1) class TestGenerateCutoffs(unittest.TestCase): From 
f9d6d9438e3807cf73192dacb9c573ecd35e1a36 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Wed, 5 Feb 2025 14:47:44 -0800 Subject: [PATCH 2/2] fix comment --- runtime/databricks/automl_runtime/forecast/utils.py | 8 ++++---- runtime/tests/automl_runtime/forecast/utils_test.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 8be2d2c1..3b5c3942 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -123,14 +123,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ # In order to calculate the validation horizon, we incrementally add offset # to the start time to the quarter of total timedelta. We did this since # pd.DateOffset does not support divide by operation. - unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity + timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity max_horizon = 0 cur_timestamp = df["ds"].min() - while cur_timestamp + unit_dateoffset <= df["ds"].max(): - cur_timestamp += unit_dateoffset + while cur_timestamp + timestep_dateoffset <= df["ds"].max(): + cur_timestamp += timestep_dateoffset max_horizon += 1 _logger.info(f"Horizon {horizon_dateoffset} too long relative to dataframe's " - f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*unit_dateoffset}.") + f"timedelta. 
Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.") return max_horizon // MIN_HORIZONS def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index b6d79907..3d9c5195 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -89,6 +89,8 @@ def test_truncate_logs(self): self.assertIn("too long relative to dataframe's timedelta. Validation horizon will be reduced to", cm.output[0]) def test_frequency_quantity(self): + # Only 5-, 10-, 15- and 30-minute frequencies are additionally supported for now, so only those cases are tested here. + # Add more test cases when support for other frequencies is added. df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"]) validation_horizon = get_validation_horizon(df, 10, "min", 5) self.assertEqual(validation_horizon, 10)