Skip to content

[ML-47076] Add frequency_quantity in get_validation_horizon to support custom frequency #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions runtime/databricks/automl_runtime/forecast/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def make_single_future_dataframe(
)
return pd.DataFrame(date_rng, columns=[column_name])

def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_quantity: int = 1) -> int:
"""
Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta
Since the seasonality period is never more than half of the dataframe's timedelta,
Expand All @@ -105,10 +105,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
:param df: pd.DataFrame of the historical data
:param horizon: int number of time steps into the future to forecast
:param unit: frequency unit of the time series, which must be a pandas offset alias
:param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s
per time step in the dataframe. This is useful when the time series has a granularity that
spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data
follows a five-minute pattern). To make it backward compatible, defaults to 1.
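:return: horizon used for validation, as a number of time steps, where each time step
spans `frequency_quantity` `unit`s (i.e. in terms of the input `unit` when `frequency_quantity` is 1)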
"""
MIN_HORIZONS = 4 # minimum number of horizons in the dataframe
horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon
horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon * frequency_quantity

try:
if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max():
Expand All @@ -119,14 +123,14 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str) -> int:
# To calculate the validation horizon, we repeatedly add the offset to the
# start time until the end of the data is reached, then take one quarter of
# the number of steps counted. We do this because pd.DateOffset does not
# support division.
unit_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])
timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity
max_horizon = 0
cur_timestamp = df["ds"].min()
while cur_timestamp + unit_dateoffset <= df["ds"].max():
cur_timestamp += unit_dateoffset
while cur_timestamp + timestep_dateoffset <= df["ds"].max():
cur_timestamp += timestep_dateoffset
max_horizon += 1
_logger.info(f"Horizon {horizon_dateoffset} too long relative to dataframe's "
f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*unit_dateoffset}.")
f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.")
return max_horizon // MIN_HORIZONS

def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
Expand Down
34 changes: 34 additions & 0 deletions runtime/tests/automl_runtime/forecast/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,40 @@ def test_truncate_logs(self):
validation_horizon = get_validation_horizon(df, 10, "D")
self.assertIn("too long relative to dataframe's timedelta. Validation horizon will be reduced to", cm.output[0])

def test_frequency_quantity(self):
    """
    Check get_validation_horizon when `frequency_quantity` is greater than 1.

    Every case builds a series starting at 2020-08-01 00:00:00 whose rows are
    `frequency_quantity` minutes apart and requests a horizon of 10 time steps.
    Series spanning most of a day are expected to keep the requested horizon of 10;
    series spanning two hours are expected to be reduced to a quarter of the number
    of time steps they contain.
    """
    # Only 5, 10, 15 and 30 minute frequencies are supported for now, so only those
    # cases are covered. Add more cases here when more frequencies are supported.
    start = "2020-08-01 00:00:00"
    cases = [
        # (frequency_quantity, end timestamp, expected validation horizon)
        (5, "2020-08-01 23:55:00", 10),
        (5, "2020-08-01 02:00:00", 6),
        (10, "2020-08-01 23:45:00", 10),
        (10, "2020-08-01 02:00:00", 3),
        (15, "2020-08-01 23:45:00", 10),
        (15, "2020-08-01 02:00:00", 2),
        (30, "2020-08-01 23:45:00", 10),
        (30, "2020-08-01 02:00:00", 1),
    ]
    for frequency_quantity, end, expected in cases:
        # subTest reports each failing case separately instead of stopping at the first.
        with self.subTest(frequency_quantity=frequency_quantity, end=end):
            # Use the "min" alias rather than "T": the single-letter minute alias is
            # deprecated in recent pandas releases, and "min" matches the `unit` below.
            date_rng = pd.date_range(start=start, end=end, freq=f"{frequency_quantity}min")
            df = pd.DataFrame(date_rng, columns=["ds"])
            validation_horizon = get_validation_horizon(df, 10, "min", frequency_quantity)
            self.assertEqual(validation_horizon, expected)

class TestGenerateCutoffs(unittest.TestCase):

Expand Down