From fe2deaeab1668180b5875f312488810fd34117d6 Mon Sep 17 00:00:00 2001
From: Lan Zhang <lan.zhang@databricks.com>
Date: Tue, 4 Feb 2025 16:50:43 -0800
Subject: [PATCH 1/2] init

---
 .../automl_runtime/forecast/utils.py          | 10 ++++--
 .../automl_runtime/forecast/utils_test.py     | 32 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index 36016f26..8be2d2c1 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -96,7 +96,7 @@ def make_single_future_dataframe(
     )
     return pd.DataFrame(date_rng, columns=[column_name])
 
-def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
+def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_quantity: int = 1) -> int:
     """
     Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta
     Since the seasonality period is never more than half of the dataframe's timedelta,
@@ -105,10 +105,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
     :param df: pd.DataFrame of the historical data
     :param horizon: int number of time into the future for forecasting
     :param unit: frequency unit of the time series, which must be a pandas offset alias
+    :param frequency_quantity: int multiplier for the frequency unit, i.e. the number of `unit`s
+        per time step in the dataframe. This is useful when the time series has a granularity that
+        spans multiple `unit`s (e.g., `unit='min'` and `frequency_quantity=5` means the data
+        follows a five-minute pattern). Defaults to 1 for backward compatibility.
     :return: horizon used for validation, in terms of the input `unit`
     """
     MIN_HORIZONS = 4  # minimum number of horizons in the dataframe
-    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon
+    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon * frequency_quantity
 
     try:
         if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max():
@@ -119,7 +123,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
     # In order to calculate the validation horizon, we incrementally add offset
     # to the start time to the quarter of total timedelta. We did this since
     # pd.DateOffset does not support divide by operation.
-    unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])
+    unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity
     max_horizon = 0
     cur_timestamp = df["ds"].min()
     while cur_timestamp + unit_dateoffset <= df["ds"].max():
diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py
index 84bd94e0..b6d79907 100644
--- a/runtime/tests/automl_runtime/forecast/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/utils_test.py
@@ -88,6 +88,38 @@ def test_truncate_logs(self):
             validation_horizon = get_validation_horizon(df, 10, "D")
             self.assertIn("too long relative to dataframe's timedelta. Validation horizon will be reduced to", cm.output[0])
 
+    def test_frequency_quantity(self):
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 5)
+        self.assertEqual(validation_horizon, 10)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="5T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 5)
+        self.assertEqual(validation_horizon, 6)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="10T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 10)
+        self.assertEqual(validation_horizon, 10)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="10T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 10)
+        self.assertEqual(validation_horizon, 3)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="15T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 15)
+        self.assertEqual(validation_horizon, 10)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="15T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 15)
+        self.assertEqual(validation_horizon, 2)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="30T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 30)
+        self.assertEqual(validation_horizon, 10)
+
+        df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="30T"), columns=["ds"])
+        validation_horizon = get_validation_horizon(df, 10, "min", 30)
+        self.assertEqual(validation_horizon, 1)
 
 class TestGenerateCutoffs(unittest.TestCase):
 

From f9d6d9438e3807cf73192dacb9c573ecd35e1a36 Mon Sep 17 00:00:00 2001
From: Lan Zhang <lan.zhang@databricks.com>
Date: Wed, 5 Feb 2025 14:47:44 -0800
Subject: [PATCH 2/2] fix comment

---
 runtime/databricks/automl_runtime/forecast/utils.py | 8 ++++----
 runtime/tests/automl_runtime/forecast/utils_test.py | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index 8be2d2c1..3b5c3942 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -123,14 +123,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_
     # In order to calculate the validation horizon, we incrementally add offset
     # to the start time to the quarter of total timedelta. We did this since
     # pd.DateOffset does not support divide by operation.
-    unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity
+    timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity
     max_horizon = 0
     cur_timestamp = df["ds"].min()
-    while cur_timestamp + unit_dateoffset <= df["ds"].max():
-        cur_timestamp += unit_dateoffset
+    while cur_timestamp + timestep_dateoffset <= df["ds"].max():
+        cur_timestamp += timestep_dateoffset
         max_horizon += 1
     _logger.info(f"Horizon {horizon_dateoffset} too long relative to dataframe's "
-    f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*unit_dateoffset}.")
+    f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.")
     return max_horizon // MIN_HORIZONS
 
 def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py
index b6d79907..3d9c5195 100644
--- a/runtime/tests/automl_runtime/forecast/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/utils_test.py
@@ -89,6 +89,8 @@ def test_truncate_logs(self):
             self.assertIn("too long relative to dataframe's timedelta. Validation horizon will be reduced to", cm.output[0])
 
     def test_frequency_quantity(self):
+        # Only 5-, 10-, 15- and 30-minute frequencies are supported for now, so only those cases are tested here.
+        # Add more test cases when support for additional frequencies is added.
         df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"])
         validation_horizon = get_validation_horizon(df, 10, "min", 5)
         self.assertEqual(validation_horizon, 10)