From bad24657cdaf59b28a028a6f043bb3cefa46d58d Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Wed, 26 Feb 2025 16:36:17 -0800 Subject: [PATCH 1/6] init --- .../automl_runtime/forecast/deepar/model.py | 12 +- .../automl_runtime/forecast/deepar/utils.py | 27 ++-- .../automl_runtime/forecast/frequency.py | 72 ++++++++++ .../automl_runtime/forecast/pmdarima/model.py | 50 +++---- .../forecast/pmdarima/training.py | 34 +++-- .../forecast/prophet/forecast.py | 30 ++-- .../automl_runtime/forecast/prophet/model.py | 31 ++--- .../automl_runtime/forecast/utils.py | 82 +++++------ .../forecast/deepar/model_test.py | 19 +-- .../forecast/pmdarima/diagnostics_test.py | 3 +- .../forecast/pmdarima/model_test.py | 43 +++--- .../forecast/pmdarima/training_test.py | 46 +++---- .../forecast/prophet/diagnostics_test.py | 3 +- .../forecast/prophet/forecast_test.py | 16 +-- .../forecast/prophet/model_test.py | 30 ++-- .../automl_runtime/forecast/utils_test.py | 128 +++++++++--------- 16 files changed, 315 insertions(+), 311 deletions(-) create mode 100644 runtime/databricks/automl_runtime/forecast/frequency.py diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py index 137c37a..76eb9f1 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/model.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py @@ -23,6 +23,7 @@ from mlflow.utils.environment import _mlflow_conda_env from databricks.automl_runtime import version +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model from databricks.automl_runtime.forecast.deepar.utils import set_index_and_fill_missing_time_steps @@ -42,7 +43,7 @@ class DeepARModel(ForecastModel): DeepAR mlflow model wrapper for forecasting. 
""" - def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, frequency_quantity: int, + def __init__(self, model: PyTorchPredictor, horizon: int, frequency: Frequency, num_samples: int, target_col: str, time_col: str, id_cols: Optional[List[str]] = None) -> None: @@ -50,8 +51,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, f Initialize the DeepAR mlflow Python model wrapper :param model: DeepAR model :param horizon: the number of periods to forecast forward - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param num_samples: the number of samples to draw from the distribution :param target_col: the target column name :param time_col: the time column name @@ -61,8 +61,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, f super().__init__() self._model = model self._horizon = horizon - self._frequency_unit = frequency_unit - self._frequency_quantity = frequency_quantity + self._frequency = frequency self._num_samples = num_samples self._target_col = target_col self._time_col = time_col @@ -130,8 +129,7 @@ def predict_samples(self, model_input_transformed = set_index_and_fill_missing_time_steps(model_input, self._time_col, - self._frequency_unit, - self._frequency_quantity, + self._frequency, self._id_cols) test_ds = PandasDataset(model_input_transformed, target=self._target_col) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 016de93..c7593eb 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -16,12 +16,12 @@ from typing import List, Optional import pandas as pd +from databricks.automl_runtime.forecast.frequency import Frequency def validate_and_generate_index(df: pd.DataFrame, time_col: str, - frequency_unit: str, - frequency_quantity: int): + frequency: Frequency): """ Generate a complete time index for the given DataFrame based on the specified frequency. - Ensures the time column is in datetime format. @@ -29,13 +29,12 @@ def validate_and_generate_index(df: pd.DataFrame, - Generates a new time index from the minimum to the maximum timestamp in the data. :param df: The input DataFrame containing the time column. :param time_col: The name of the time column. - :param frequency_unit: The frequency unit of the time series. - :param frequency_quantity: The frequency quantity of the time series. + :param frequency: The frequency of the time series. :return: A complete time index covering the full range of the dataset. :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency. 
""" - if frequency_unit.upper() != "MS": - return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency_quantity}{frequency_unit}") + if frequency.frequency_unit.upper() != "MS": + return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency.frequency_quantity}{frequency.frequency_unit}") df[time_col] = pd.to_datetime(df[time_col]) # Ensure datetime format @@ -67,8 +66,7 @@ def validate_and_generate_index(df: pd.DataFrame, return new_index_full def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, - frequency_unit: str, - frequency_quantity: int, + frequency: Frequency, id_cols: Optional[List[str]] = None): """ Transform the input dataframe to an acceptable format for the GluonTS library. @@ -78,8 +76,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, :param df: the input dataframe that contains time_col :param time_col: time column name - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param id_cols: the column names of the identity columns for multi-series time series; None for single series :return: single-series - transformed dataframe; multi-series - dictionary of transformed dataframes, each key is the (concatenated) id of the time series @@ -88,11 +85,13 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, # We need to adjust the frequency_unit for pd.date_range if it is weekly, # otherwise it would always be "W-SUN" - if frequency_unit.upper() == "W": + if frequency.frequency_unit.upper() == "W": weekday_name = total_min.strftime("%a").upper() # e.g., "FRI" - frequency_unit = f"W-{weekday_name}" + adjusted_frequency = Frequency(frequency_unit=f"W-{weekday_name}", frequency_quantity=frequency.frequency_quantity) + else: + adjusted_frequency = Frequency(frequency_unit=frequency.frequency_unit, frequency_quantity=frequency.frequency_quantity) - valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency_unit=frequency_unit, frequency_quantity=frequency_quantity) + valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency=adjusted_frequency) if id_cols is not None: df_dict = {} @@ -111,7 +110,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, # Fill in missing time steps between the min and max time steps df = df.reindex(valid_index) - if frequency_unit.upper() == "MS": + if frequency.frequency_unit.upper() == "MS": # Truncate the day of month to avoid issues with pandas frequency check df = df.to_period("M") diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py new file mode 100644 index 0000000..b7efa89 --- /dev/null +++ b/runtime/databricks/automl_runtime/forecast/frequency.py @@ -0,0 +1,72 @@ +# +# Copyright (C) 2022 Databricks, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dataclasses import dataclass
+from typing import ClassVar, Set
+
+@dataclass(frozen=True)
+class Frequency:
+    """
+    Represents the frequency of a time series.
+
+    Attributes:
+        frequency_unit (str): The unit of time for the frequency.
+        frequency_quantity (int): The number of frequency_units in the period.
+
+    Valid frequency units: source of truth is OFFSET_ALIAS_MAP in forecast.__init__.py
+    - Weeks: "W" (anchored aliases such as "W-FRI" are generated internally for pd.date_range)
+    - Days: "d", "D", "days", "day"
+    - Hours: "hours", "hour", "hr", "h", "H"
+    - Minutes: "m", "minute", "min", "minutes", "T"
+    - Seconds: "S", "seconds", "sec", "second"
+    - Months: "M", "MS", "month", "months"
+    - Quarters: "Q", "QS", "quarter", "quarters"
+    - Years: "Y", "YS", "year", "years"
+
+    Valid frequency quantities:
+    - For minutes: {1, 5, 10, 15, 30}
+    - For all other units: {1}
+    """
+
+    VALID_FREQUENCY_UNITS: ClassVar[Set[str]] = {
+        "W", "d", "D", "days", "day", "hours", "hour", "hr", "h", "H",
+        "m", "minute", "min", "minutes", "T", "S", "seconds",
+        "sec", "second", "M", "MS", "month", "months", "Q", "QS", "quarter",
+        "quarters", "Y", "YS", "year", "years"
+    }
+
+    # set_index_and_fill_missing_time_steps rewrites weekly frequencies to an
+    # anchored alias such as "W-FRI" before re-wrapping them in a Frequency,
+    # so those aliases must pass validation as well.
+    ANCHORED_WEEKLY_UNITS: ClassVar[Set[str]] = {
+        f"W-{day}" for day in ("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
+    }
+
+    VALID_MINUTE_QUANTITIES: ClassVar[Set[int]] = {1, 5, 10, 15, 30}
+    DEFAULT_QUANTITY: ClassVar[int] = 1  # Default for non-minute units
+
+    frequency_unit: str
+    frequency_quantity: int
+
+    def __post_init__(self):
+        if (self.frequency_unit not in self.VALID_FREQUENCY_UNITS
+                and self.frequency_unit.upper() not in self.ANCHORED_WEEKLY_UNITS):
+            raise ValueError(f"Invalid frequency unit: {self.frequency_unit}")
+
+        if self.frequency_unit in {"m", "minute", "min", "minutes", "T"}:
+            if self.frequency_quantity not in self.VALID_MINUTE_QUANTITIES:
+                raise ValueError(
+                    f"Invalid frequency quantity {self.frequency_quantity} for minutes. "
+                    f"Allowed values: {sorted(self.VALID_MINUTE_QUANTITIES)}"
+                )
+        else:
+            if self.frequency_quantity != self.DEFAULT_QUANTITY:
+                raise ValueError(
+                    f"Invalid frequency quantity {self.frequency_quantity} for {self.frequency_unit}. "
+                    "Only 1 is allowed for this unit."
+                )
+
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
index cc4116c..3cf1647 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
@@ -25,6 +25,7 @@
 from mlflow.utils.environment import _mlflow_conda_env
 
 from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
 from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model
 from databricks.automl_runtime.forecast.utils import calculate_period_differences, is_frequency_consistency, \
     make_future_dataframe, make_single_future_dataframe
@@ -64,19 +65,18 @@ def model_env(self):
         return ARIMA_CONDA_ENV
 
     @staticmethod
-    def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency_unit: str, frequency_quantity: int) -> pd.DatetimeIndex:
+    def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: Frequency) -> pd.DatetimeIndex:
         """
         Create a DatetimeIndex with specified starting time and frequency, whose length is the given periods.
         :param start_ds: the pd.Timestamp as the start of the DatetimeIndex.
         :param periods: the length of the DatetimeIndex.
-        :param frequency_unit: the frequency unit of the DatetimeIndex.
-        :param frequency_quantity: the frequency quantity of the DatetimeIndex.
+        :param frequency: the frequency of the DatetimeIndex.
         :return: a DatetimeIndex.
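+        Example: start_ds=pd.Timestamp("2020-10-01"), periods=3 and
+            frequency=Frequency(frequency_unit="D", frequency_quantity=1)
+            yield DatetimeIndex(["2020-10-01", "2020-10-02", "2020-10-03"]).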
""" ds_indices = pd.date_range( start=start_ds, periods=periods, - freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity + freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity ) modified_start_ds = ds_indices.min() if start_ds != modified_start_ds: @@ -90,15 +90,13 @@ class ArimaModel(AbstractArimaModel): ARIMA mlflow model wrapper for univariate forecasting. """ - def __init__(self, pickled_model: bytes, horizon: int, frequency_unit: str, - frequency_quantity: int, start_ds: pd.Timestamp, end_ds: pd.Timestamp, + def __init__(self, pickled_model: bytes, horizon: int, frequency: Frequency, start_ds: pd.Timestamp, end_ds: pd.Timestamp, time_col: str, exogenous_cols: Optional[List[str]] = None) -> None: """ Initialize the mlflow Python model wrapper for ARIMA. :param pickled_model: the pickled ARIMA model as a bytes object. :param horizon: int number of periods to forecast forward. - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param start_ds: the start time of training data :param end_ds: the end time of training data :param time_col: the column name of the time column @@ -108,8 +106,7 @@ def __init__(self, pickled_model: bytes, horizon: int, frequency_unit: str, super().__init__() self._pickled_model = pickled_model self._horizon = horizon - self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] - self._frequency_quantity = frequency_quantity + self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity) self._start_ds = pd.to_datetime(start_ds) self._end_ds = pd.to_datetime(end_ds) self._time_col = time_col @@ -160,8 +157,7 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru start_time=self._start_ds, end_time=self._end_ds, horizon=horizon or self._horizon, - frequency_unit=self._frequency_unit, - frequency_quantity=self._frequency_quantity, + frequency=self._frequency, include_history=include_history ) @@ -196,7 +192,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) # Check if the time has correct frequency consistency = df["ds"].apply(lambda x: - is_frequency_consistency(self._start_ds, x, self._frequency_unit, self._frequency_quantity) + is_frequency_consistency(self._start_ds, x, self._frequency) ).all() if not consistency: raise MlflowException( @@ -207,7 +203,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) preds_pds = [] # Out-of-sample prediction if needed - horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency_unit, self._frequency_quantity) + horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency) if horizon > 0: X_future = df[df["ds"] > self._end_ds].set_index("ds") future_pd = self._forecast( @@ -233,8 +229,8 @@ def _predict_in_sample( end_ds: pd.Timestamp = None, X: pd.DataFrame = None) -> pd.DataFrame: if start_ds and end_ds: - start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency_unit, self._frequency_quantity) - end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity) + start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency) + end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency) else: start_ds = self._start_ds 
end_ds = self._end_ds @@ -246,8 +242,8 @@ def _predict_in_sample( start=start_idx, end=end_idx, return_conf_int=True) - periods = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity) + 1 - ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[start_idx:] + periods = calculate_period_differences(self._start_ds, end_ds, self._frequency) + 1 + ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency=self._frequency)[start_idx:] in_sample_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds_in_sample}) in_sample_pd[["yhat_lower", "yhat_upper"]] = conf_in_sample return in_sample_pd @@ -261,7 +257,7 @@ def _forecast( horizon, X=X, return_conf_int=True) - ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[1:] + ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency=self._frequency)[1:] preds_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds}) preds_pd[["yhat_lower", "yhat_upper"]] = conf return preds_pd @@ -272,15 +268,14 @@ class MultiSeriesArimaModel(AbstractArimaModel): ARIMA mlflow model wrapper for multivariate forecasting. """ - def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency_unit: str, frequency_quantity: int, + def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency: Frequency, start_ds_dict: Dict[Tuple, pd.Timestamp], end_ds_dict: Dict[Tuple, pd.Timestamp], time_col: str, id_cols: List[str], exogenous_cols: Optional[List[str]] = None) -> None: """ Initialize the mlflow Python model wrapper for multiseries ARIMA. :param pickled_model_dict: the dictionary of binarized ARIMA models for different time series. :param horizon: int number of periods to forecast forward. - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param start_ds_dict: the dictionary of the starting time of each time series in training data. :param end_ds_dict: the dictionary of the end time of each time series in training data. 
    :param time_col: the column name of the time column
@@ -291,8 +286,7 @@ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequen
         super().__init__()
         self._pickled_models = pickled_model_dict
         self._horizon = horizon
-        self._frequency_unit = frequency_unit
-        self._frequency_quantity = frequency_quantity
+        self._frequency = frequency
         self._starts = start_ds_dict
         self._ends = end_ds_dict
         self._time_col = time_col
@@ -335,8 +329,7 @@ def make_future_dataframe(
             start_time=self._starts,
             end_time=self._ends,
             horizon=horizon,
-            frequency_unit=self._frequency_unit,
-            frequency_quantity=self._frequency_quantity,
+            frequency=self._frequency,
             include_history=include_history,
             groups=groups,
             identity_column_names=self._id_cols
@@ -367,7 +360,7 @@ def _predict_timeseries_single_id(
             horizon: int,
             include_history: bool = True,
             df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
-        arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency_unit, self._frequency_quantity,
+        arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency,
                                            self._starts[id_], self._ends[id_], self._time_col, self._exogenous_cols)
         preds_df = arima_model_single_id.predict_timeseries(horizon, include_history, df)
         for id, col_name in zip(id_, self._id_cols):
@@ -408,8 +401,7 @@ def _predict_single_id(self, df: pd.DataFrame) -> pd.DataFrame:
         id_ = df["ts_id"].to_list()[0]
         arima_model_single_id = ArimaModel(self._pickled_models[id_],
                                            self._horizon,
-                                           self._frequency_unit,
-                                           self._frequency_quantity,
+                                           self._frequency,
                                            self._starts[id_],
                                            self._ends[id_],
                                            self._time_col,
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
index 7cb1b35..9eb9db4 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -24,6 +24,7 @@
 from pmdarima.arima import StepwiseContext
 from prophet.diagnostics import performance_metrics
 
+from databricks.automl_runtime.forecast.frequency import Frequency
 from databricks.automl_runtime.forecast.pmdarima.diagnostics import cross_validation
 from databricks.automl_runtime.forecast import utils, OFFSET_ALIAS_MAP
 
@@ -34,9 +35,9 @@ class ArimaEstimator:
     ARIMA estimator using pmdarima.auto_arima.
     """
 
-    def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_periods: List[int],
+    def __init__(self, horizon: int, frequency: Frequency, metric: str, seasonal_periods: List[int],
                  num_folds: int = 20, max_steps: int = 150, exogenous_cols: Optional[List[str]] = None,
-                 split_cutoff: Optional[pd.Timestamp] = None, frequency_quantity: int = 1) -> None:
+                 split_cutoff: Optional[pd.Timestamp] = None) -> None:
         """
         :param horizon: Number of periods to forecast forward
-        :param frequency_unit: Frequency of the time series
+        :param frequency: Frequency of the time series
@@ -53,8 +54,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri
         For training job, it is the cutoff between validate and test split.
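+        Example: ArimaEstimator(horizon=1,
+                     frequency=Frequency(frequency_unit="min", frequency_quantity=5),
+                     metric="smape", seasonal_periods=[1])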
""" self._horizon = horizon - self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] - self._frequency_quantity = frequency_quantity + self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity) self._metric = metric self._seasonal_periods = seasonal_periods self._num_folds = num_folds @@ -72,14 +72,14 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: history_pd["ds"] = pd.to_datetime(history_pd["ds"]) # Check if the time has consistent frequency - self._validate_ds_freq(history_pd, self._frequency_unit, self._frequency_quantity) + self._validate_ds_freq(history_pd, self._frequency) history_periods = utils.calculate_period_differences( - history_pd['ds'].min(), history_pd['ds'].max(), self._frequency_unit, self._frequency_quantity + history_pd['ds'].min(), history_pd['ds'].max(), self._frequency ) if history_periods + 1 != history_pd['ds'].size: # Impute missing time steps - history_pd = self._fill_missing_time_steps(history_pd, self._frequency_unit, self._frequency_quantity) + history_pd = self._fill_missing_time_steps(history_pd, self._frequency) # Tune seasonal periods @@ -89,28 +89,26 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: try: # this check mirrors the the default behavior by prophet if history_periods < 2 * m: - _logger.warning(f"Skipping seasonal_period={m} ({self._frequency_quantity}{self._frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""") + _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""") continue # Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the dataframe. 
                # However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each frequency step,
                # so the minimum valid seasonality period is always 1
-                validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency_unit, self._frequency_quantity)
+                validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency)
                 if self._split_cutoff:
                     cutoffs = utils.generate_custom_cutoffs(
                         history_pd,
                         horizon=validation_horizon,
-                        frequency_unit=self._frequency_unit,
+                        frequency=self._frequency,
                         split_cutoff=self._split_cutoff,
-                        frequency_quantity=self._frequency_quantity,
                     )
                 else:
                     cutoffs = utils.generate_cutoffs(
                         history_pd,
                         horizon=validation_horizon,
-                        frequency_unit=self._frequency_unit,
+                        frequency=self._frequency,
                         num_folds=self._num_folds,
-                        frequency_quantity=self._frequency_quantity,
                     )
 
                 result = self._fit_predict(history_pd, cutoffs=cutoffs, seasonal_period=m, max_steps=self._max_steps)
@@ -154,9 +152,9 @@ def _fit_predict(self, df: pd.DataFrame, cutoffs: List[pd.Timestamp], seasonal_p
         return {"metrics": metrics, "model": arima_model}
 
     @staticmethod
-    def _fill_missing_time_steps(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int):
+    def _fill_missing_time_steps(df: pd.DataFrame, frequency: Frequency):
         # Forward fill missing time steps
-        df_filled = df.set_index("ds").resample(rule=f"{frequency_quantity}{OFFSET_ALIAS_MAP[frequency_unit]}").pad().reset_index()
+        df_filled = df.set_index("ds").resample(rule=f"{frequency.frequency_quantity}{OFFSET_ALIAS_MAP[frequency.frequency_unit]}").pad().reset_index()
         start_ds, modified_start_ds = df["ds"].min(), df_filled["ds"].min()
         if start_ds != modified_start_ds:
             offset = modified_start_ds - start_ds
@@ -164,12 +162,12 @@ def _fill_missing_time_steps(df: pd.DataFrame, frequency_unit: str, frequency_qu
         return df_filled
 
     @staticmethod
-    def _validate_ds_freq(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int):
+    def _validate_ds_freq(df: pd.DataFrame, frequency: Frequency):
         start_ds = df["ds"].min()
         consistency = df["ds"].apply(lambda x:
-            utils.is_frequency_consistency(start_ds, x, frequency_unit, frequency_quantity)
+            utils.is_frequency_consistency(start_ds, x, frequency)
         ).all()
         if not consistency:
             raise ValueError(
-                f"Input time column includes different frequency than the specified frequency {frequency_quantity}{frequency_unit}."
+                f"Input time column includes a different frequency than the specified frequency {frequency.frequency_quantity}{frequency.frequency_unit}."
            )
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
index 28c4a5e..98062dd 100644
--- a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
+++ b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
@@ -28,6 +28,7 @@
 from databricks.automl_runtime.forecast.prophet.diagnostics import cross_validation
 from databricks.automl_runtime.forecast import utils, OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
 
 
 class ProphetHyperParams(Enum):
@@ -38,11 +39,10 @@ class ProphetHyperParams(Enum):
 
 
 def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame,
-                         horizon: int, frequency_unit: str, cutoffs: List[pd.Timestamp],
+                         horizon: int, frequency: Frequency, cutoffs: List[pd.Timestamp],
                          interval_width: int, primary_metric: str,
                          country_holidays: Optional[str] = None, regressors = None,
-                         frequency_quantity: int = 1,
                          **prophet_kwargs) -> Dict[str, Any]:
     """
     Training function for hyperparameter tuning with hyperopt
 
     :param params: Input hyperparameters
     :param history_pd: pd.DataFrame containing the history. Must have columns ds (date
         type) and y, the time series
     :param horizon: Forecast horizon
-    :param frequency_unit: Frequency unit of the time series
-    :param frequency_quantity: the number of time units that make up a single period of the time series. For now, only 1/5/10/15/30 minutes, 1 hour, 1 day, 1 week, 1 month, 1 quarter, 1 year are supported.
+    :param frequency: Frequency of the time series
     :param cutoffs: List of cutoff times for cross validation
     :param interval_width: Width of the uncertainty intervals provided for the forecast
     :param primary_metric: Metric that will be optimized across trials
@@ -70,8 +69,8 @@
             model.add_regressor(regressor)
     model.fit(history_pd, iter=200)
 
-    offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
-    horizon_offset = pd.DateOffset(**offset_kwarg)*frequency_quantity*horizon
+    offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
+    horizon_offset = pd.DateOffset(**offset_kwarg) * frequency.frequency_quantity * horizon
     # Evaluate Metrics
     df_cv = cross_validation(
         model, horizon=horizon_offset, cutoffs=cutoffs, disable_tqdm=True
@@ -89,20 +88,19 @@ class ProphetHyperoptEstimator(ABC):
     """
     SUPPORTED_METRICS = ["mse", "rmse", "mae", "mape", "mdape", "smape", "coverage"]
 
-    def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_width: int,
+    def __init__(self, horizon: int, frequency: Frequency, metric: str, interval_width: int,
                  country_holidays: str, search_space: Dict[str, Any],
                  algo=hyperopt.tpe.suggest, num_folds: int = 5,
                  max_eval: int = 10, trial_timeout: int = None,
                  random_state: int = 0, is_parallel: bool = True,
                  regressors = None,
                  split_cutoff: Optional[pd.Timestamp] = None,
-                 frequency_quantity: int = 1,
                  **prophet_kwargs) -> None:
         """
         Initialization
 
         :param horizon: Number of periods to forecast forward
-        :param frequency_unit: Frequency of the time series
+        :param frequency: Frequency of the time series
         :param metric: Metric that will be optimized across trials
         :param interval_width: Width of the uncertainty intervals provided for the forecast
         :param country_holidays: Built-in holidays for the specified country
@@ -123,8 +121,7 @@ def __init__(self, horizon: int,
frequency_unit: str, metric: str, interval_widt `The Prophet source code `_. """ self._horizon = horizon - self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] - self._frequency_quantity = frequency_quantity + self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity) self._metric = metric self._interval_width = interval_width self._country_holidays = country_holidays @@ -150,27 +147,24 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: seasonality_mode = ["additive", "multiplicative"] - validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency_unit, self._frequency_quantity) + validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency) if self._split_cutoff: cutoffs = utils.generate_custom_cutoffs( df.reset_index(drop=True), horizon=validation_horizon, - frequency_unit=self._frequency_unit, + frequency=self._frequency, split_cutoff=self._split_cutoff, - frequency_quantity=self._frequency_quantity, ) else: cutoffs = utils.generate_cutoffs( df.reset_index(drop=True), horizon=validation_horizon, - frequency_unit=self._frequency_unit, + frequency=self._frequency, num_folds=self._num_folds, - frequency_quantity=self._frequency_quantity, ) train_fn = partial(_prophet_fit_predict, history_pd=df, horizon=validation_horizon, - frequency_unit=self._frequency_unit, - frequency_quantity=self._frequency_quantity, + frequency=self._frequency, cutoffs=cutoffs, interval_width=self._interval_width, primary_metric=self._metric, country_holidays=self._country_holidays, diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py index 0696bef..7e9b0cd 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/model.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py @@ -24,6 +24,7 @@ from mlflow.utils.environment import _mlflow_conda_env from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model from databricks.automl_runtime import version from databricks.automl_runtime.forecast.utils import is_quaterly_alias, make_future_dataframe @@ -48,24 +49,21 @@ class ProphetModel(ForecastModel): def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, - frequency_unit: str, - frequency_quantity: int, + frequency: Frequency, time_col: str) -> None: """ Initialize the mlflow Python model wrapper for mlflow :param model_json: json string of the Prophet model or the dictionary of json strings of Prophet model for multi-series forecasting :param horizon: Int number of periods to forecast forward. 
- :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param time_col: the column name of the time column """ self._model_json = model_json self._horizon = horizon - self._frequency_unit = frequency_unit - self._frequency_quantity = frequency_quantity + self._frequency = frequency self._time_col = time_col - self._is_quaterly = is_quaterly_alias(frequency_unit) + self._is_quaterly = is_quaterly_alias(frequency.frequency_unit) super().__init__() def load_context(self, context: mlflow.pyfunc.model.PythonModelContext) -> None: @@ -98,8 +96,8 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru :return: pd.Dataframe that extends forward from the end of self.history for the requested number of periods. """ - offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency_unit]] - offset_kwarg = {key: value * self._frequency_quantity for key, value in offset_kwarg.items()} + offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency.frequency_unit]] + offset_kwarg = {key: value * self._frequency.frequency_quantity for key, value in offset_kwarg.items()} return self.model().make_future_dataframe(periods=horizon or self._horizon, freq=pd.DateOffset(**offset_kwarg), include_history=include_history) @@ -151,7 +149,7 @@ class MultiSeriesProphetModel(ProphetModel): """ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, pd.Timestamp], - timeseries_end: str, horizon: int, frequency_unit: str, frequency_quantity: int, time_col: str, id_cols: List[str], + timeseries_end: str, horizon: int, frequency: Frequency, time_col: str, id_cols: List[str], ) -> None: """ Initialize the mlflow Python model wrapper for mlflow @@ -159,14 +157,11 @@ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, :param timeseries_starts: the dictionary of pd.Timestamp as the starting time of each time series :param timeseries_end: the end time of the time series :param horizon: int number of periods to forecast forward - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the frequency quantity of the time series + :param frequency: the frequency of the time series :param time_col: the column name of the time column :param id_cols: the column names of the identity columns for multi-series time series """ - super().__init__(model_json, horizon, frequency_unit, frequency_quantity, time_col) - self._frequency_unit = frequency_unit - self._frequency_quantity = frequency_quantity + super().__init__(model_json, horizon, frequency, time_col) self._timeseries_end = timeseries_end self._timeseries_starts = timeseries_starts self._id_cols = id_cols @@ -209,8 +204,7 @@ def make_future_dataframe( start_time=self._timeseries_starts, end_time=end_time, horizon=horizon, - frequency_unit=self._frequency_unit, - frequency_quantity=self._frequency_quantity, + frequency=self._frequency, include_history=include_history, groups=groups, identity_column_names=self._id_cols @@ -245,8 +239,7 @@ def predict_timeseries(self, horizon: int = None, include_history: bool = True) start_time=self._timeseries_starts, end_time=end_time, horizon=horizon, - frequency_unit=self._frequency_unit, - frequency_quantity=self._frequency_quantity, + frequency=self._frequency, include_history=include_history, groups=self._model_json.keys(), identity_column_names=self._id_cols diff --git 
a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 195d35a..f0fff16 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -17,6 +17,7 @@ from typing import Dict, List, Optional, Tuple, Union from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP,\ QUATERLY_OFFSET_ALIAS, NON_DAILY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP +from databricks.automl_runtime.forecast.frequency import Frequency import pandas as pd @@ -26,8 +27,7 @@ def make_future_dataframe( start_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]], end_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]], horizon: int, - frequency_unit: str, - frequency_quantity: int, + frequency: Frequency, include_history: bool = True, groups: List[Tuple] = None, identity_column_names: List[str] = None, @@ -37,15 +37,14 @@ def make_future_dataframe( :param start_time: the dictionary of the starting time of each time series in training data. :param end_time: the dictionary of the end time of each time series in training data. :param horizon: int number of periods to forecast forward. - :param frequency_unit: the frequency unit of the time series - :param frequency_quantity: the multiplier for the frequency. + :param frequency: the frequency of the time series :param include_history: :param groups: the collection of group(s) to generate forecast predictions. :param identity_column_names: Column names of the identity columns :return: pd.DataFrame that extends forward """ if groups is None: - return make_single_future_dataframe(start_time, end_time, horizon, frequency_unit, frequency_quantity) + return make_single_future_dataframe(start_time, end_time, horizon, frequency) future_df_list = [] for group in groups: @@ -57,7 +56,7 @@ def make_future_dataframe( group_end_time = end_time[group] else: group_end_time = end_time - df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency_unit, frequency_quantity, include_history) + df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency, include_history) for idx, identity_column_name in enumerate(identity_column_names): df[identity_column_name] = group[idx] future_df_list.append(df) @@ -67,8 +66,7 @@ def make_single_future_dataframe( start_time: pd.Timestamp, end_time: pd.Timestamp, horizon: int, - frequency_unit: str, - frequency_quantity: int, + frequency: Frequency, include_history: bool = True, column_name: str = "ds" ) -> pd.DataFrame: @@ -77,15 +75,14 @@ def make_single_future_dataframe( :param start_time: The starting time of time series of the training data. :param end_time: The end time of time series of the training data. :param horizon: Int number of periods to forecast forward. - :param frequency_unit: The frequency unit of the time series - :param frequency_quantity: The frequency quantity of the time series + :param frequency: The frequency of the time series :param include_history: Boolean to include the historical dates in the data frame for predictions. :param column_name: column name of the time column. Default is "ds". 
:return: """ - offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] - timestep_offset = pd.DateOffset(**offset_freq) * frequency_quantity + offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]] + timestep_offset = pd.DateOffset(**offset_freq) * frequency.frequency_quantity end_time = pd.Timestamp(end_time) if include_history: @@ -100,7 +97,7 @@ def make_single_future_dataframe( ) return pd.DataFrame(date_rng, columns=[column_name]) -def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, frequency_quantity: int = 1) -> int: +def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency: Frequency) -> int: """ Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta Since the seasonality period is never more than half of the dataframe's timedelta, @@ -108,15 +105,11 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, behavior, and we enforce it for ARIMA.) :param df: pd.DataFrame of the historical data :param horizon: int number of time into the future for forecasting - :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias - :param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s - per time step in the dataframe. This is useful when the time series has a granularity that - spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data - follows a five-minute pattern). To make it backward compatible, defaults to 1. + :param frequency: frequency of the time series :return: horizon used for validation, in terms of the input `unit` """ MIN_HORIZONS = 4 # minimum number of horizons in the dataframe - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * horizon * frequency_quantity + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * horizon * frequency.frequency_quantity try: if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max(): @@ -127,7 +120,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, # In order to calculate the validation horizon, we incrementally add offset # to the start time to the quarter of total timedelta. We did this since # pd.DateOffset does not support divide by operation. - timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity + timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity max_horizon = 0 cur_timestamp = df["ds"].min() while cur_timestamp + timestep_dateoffset <= df["ds"].max(): @@ -137,38 +130,36 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.") return max_horizon // MIN_HORIZONS -def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str, +def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency: Frequency, num_folds: int, seasonal_period: int = 0, - seasonal_unit: Optional[str] = None, - frequency_quantity: int = 1) -> List[pd.Timestamp]: + seasonal_unit: Optional[str] = None) -> List[pd.Timestamp]: """ Generate cutoff times for cross validation with the control of number of folds. :param df: pd.DataFrame of the historical data. :param horizon: int number of time into the future for forecasting. 
-    :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias.
+    :param frequency: frequency of the time series.
     :param num_folds: int number of cutoffs for cross validation.
     :param seasonal_period: length of the seasonality period.
     :param seasonal_unit: Optional frequency unit for the seasonal period. If not specified, the function will use
         the same frequency unit as the time series.
-    :param frequency_quantity: frequency quantity of the time series.
     :return: list of pd.Timestamp cutoffs for cross-validation.
     """
     period = max(0.5 * horizon, 1)  # avoid empty cutoff buckets
     # avoid non-integer months, quarters and years.
-    if frequency_unit in NON_DAILY_OFFSET_ALIAS:
+    if frequency.frequency_unit in NON_DAILY_OFFSET_ALIAS:
         period = int(period)
-        period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*period
+        period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * period
     else:
-        offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency_unit])[0]: period}
-        period_dateoffset = pd.DateOffset(**offset_kwarg) * frequency_quantity
+        offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit])[0]: period}
+        period_dateoffset = pd.DateOffset(**offset_kwarg) * frequency.frequency_quantity
 
-    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*horizon
+    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * horizon
 
     if not seasonal_unit:
-        seasonal_unit = frequency_unit
+        seasonal_unit = frequency.frequency_unit
 
-    seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*seasonal_period
+    seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * seasonal_period
 
     # We cannot compare DateOffset directly, so we add to start time and compare.
     initial = seasonality_dateoffset
@@ -197,24 +188,23 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str,
     )
     return list(reversed(result))
 
-def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str,
-                            split_cutoff: pd.Timestamp, frequency_quantity: int = 1) -> List[pd.Timestamp]:
+def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency: Frequency,
+                            split_cutoff: pd.Timestamp) -> List[pd.Timestamp]:
     """
     Generate custom cutoff times for cross validation based on user-specified split cutoff.
     Period (step size) is 1.
     :param df: pd.DataFrame of the historical data.
     :param horizon: int number of time into the future for forecasting.
-    :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias.
+    :param frequency: frequency of the time series.
     :param split_cutoff: the user-specified cutoff, as the starting point of cutoffs.
-    :param frequency_quantity: frequency quantity of the time series.
        For tuning job, it is the cutoff between train and validate split.
        For training job, it is the cutoff between validate and test split.
    :return: list of pd.Timestamp cutoffs for cross-validation.
    """
    # TODO: [ML-43528] expose period as input.
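+    # Cutoffs start at split_cutoff and advance one period per fold; each fold must
+    # leave room for `horizon` periods (scaled by frequency_quantity) after its cutoff.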
    period = 1
-    period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*period*frequency_quantity
-    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*horizon*frequency_quantity
+    period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * period * frequency.frequency_quantity
+    horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * horizon * frequency.frequency_quantity
 
     # First cutoff is the cutoff between splits
     cutoff = split_cutoff
@@ -236,8 +226,7 @@ def is_quaterly_alias(freq: str):
 def is_frequency_consistency(
         start_time: pd.Timestamp,
         end_time: pd.Timestamp,
-        frequency_unit:str,
-        frequency_quantity: int) -> bool:
+        frequency: Frequency) -> bool:
     """
     Validate that the number of periods between a start time and an end time is consistent with the given frequency.
     We consider consistency as only integer frequencies between start and end time, e.g.
@@ -251,19 +240,18 @@ def is_frequency_consistency(
     :return: A boolean indicating whether the time interval is evenly divisible by the period.
     """
-    periods = calculate_period_differences(start_time, end_time, frequency_unit, frequency_quantity)
+    periods = calculate_period_differences(start_time, end_time, frequency)
     # If the difference between start and end time is divisible by the period time
     diff = (pd.to_datetime(end_time) - pd.DateOffset(
-        **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
-    ) * periods * frequency_quantity) == pd.to_datetime(start_time)
+        **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
+    ) * periods * frequency.frequency_quantity) == pd.to_datetime(start_time)
     return diff
 
 
 def calculate_period_differences(
         start_time: pd.Timestamp,
         end_time: pd.Timestamp,
-        frequency_unit:str,
-        frequency_quantity: int) -> int:
+        frequency: Frequency) -> int:
     """
     Calculate the periods given a start time, end time and period frequency.
     :param start_time: A pandas timestamp.
@@ -276,6 +264,6 @@
     """
     start_time = pd.to_datetime(start_time)
     end_time = pd.to_datetime(end_time)
-    freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
+    freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
     # It is intended to get the floor value. And in the later check we will use this floor value to find out if it is not consistent.
- return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency_quantity + return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency.frequency_quantity diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py index bee823d..101c348 100644 --- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py +++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py @@ -25,6 +25,7 @@ from gluonts.transform import InstanceSplitter, TestSplitSampler from gluonts.torch.model.predictor import PyTorchPredictor +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.deepar.model import ( DeepARModel, mlflow_deepar_log_model, @@ -94,8 +95,7 @@ def test_model_save_and_load_single_series(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), num_samples=1, target_col=target_col, time_col=time_col, @@ -139,8 +139,7 @@ def test_model_save_and_load_multi_series(self): model=self.model, horizon=self.prediction_length, num_samples=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), target_col=target_col, time_col=time_col, id_cols=[id_col], @@ -187,8 +186,7 @@ def test_model_save_and_load_multi_series_multi_id_cols(self): model=self.model, horizon=self.prediction_length, num_samples=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), target_col=target_col, time_col=time_col, id_cols=id_cols, @@ -234,8 +232,7 @@ def test_model_prediction_with_duplicate_timestamps(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), num_samples=1, target_col=target_col, time_col=time_col, @@ -278,8 +275,7 @@ def test_model_prediction_with_monthly_data(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency_unit="MS", - frequency_quantity=1, + frequency=Frequency(frequency_unit="MS", frequency_quantity=1), num_samples=1, target_col=target_col, time_col=time_col, @@ -321,8 +317,7 @@ def test_model_prediction_with_multiple_minutes_frequency(self, frequency_quanti deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency_unit="min", - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), num_samples=1, target_col=target_col, time_col=time_col, diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py index bd4380c..5b90d86 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py @@ -20,6 +20,7 @@ import pandas as pd from pmdarima.arima import auto_arima, StepwiseContext +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.utils import generate_cutoffs from databricks.automl_runtime.forecast.pmdarima.diagnostics import cross_validation, single_cutoff_forecast @@ -43,7 +44,7 @@ class TestDiagnostics(unittest.TestCase): (df_with_exogenous, ["x1", "x2"]) ]) def 
test_cross_validation_success(self, df, exogenous_cols): - cutoffs = generate_cutoffs(df, horizon=3, frequency_unit="D", seasonal_period=1, seasonal_unit="D", num_folds=3) + cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="d", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3) train_df = df[df["ds"] <= cutoffs[0]].set_index("ds") y_train = train_df[["y"]] X_train = train_df.drop(["y"], axis=1) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py index 5918d29..ae1068b 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py @@ -25,6 +25,7 @@ from mlflow.protos.databricks_pb2 import ErrorCode, INVALID_PARAMETER_VALUE from pmdarima.arima import ARIMA +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.pmdarima.model import ( ArimaModel, MultiSeriesArimaModel, @@ -52,8 +53,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency_unit=self.freq, - frequency_quantity=self.frequency_quantity, + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity), start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date") @@ -69,8 +69,7 @@ def test_predict_timeseries_success(self): expected_ds = AbstractArimaModel._get_ds_indices( self.start_ds, periods=self.num_rows + self.horizon, - frequency_unit=self.freq, - frequency_quantity=self.frequency_quantity) + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)) self.assertTrue(expected_columns.issubset(set(forecast_pd.columns))) self.assertEqual(10, forecast_pd.shape[0]) pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"]) @@ -140,7 +139,7 @@ def setUp(self) -> None: self.freq = 'W' self.frequency_quantity = 1 dates = AbstractArimaModel._get_ds_indices( - pd.to_datetime(self.start_ds), periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) + pd.to_datetime(self.start_ds), periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -150,8 +149,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency_unit=self.freq, - frequency_quantity=self.frequency_quantity, + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity), start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date") @@ -189,8 +187,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency_unit=self.freq, - frequency_quantity=self.frequency_quantity, + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity), start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date", @@ -234,8 +231,7 @@ def setUp(self) -> None: end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")} self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency_unit='month', - frequency_quantity=1, + frequency=Frequency(frequency_unit='month', 
frequency_quantity=1), start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -318,8 +314,7 @@ def test_make_future_dataframe_multi_ids(self): end_ds_dict = {(1, "1"): pd.Timestamp("2020-09-13"), (2, "1"): pd.Timestamp("2020-09-13")} arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency_unit='month', - frequency_quantity=1, + frequency=Frequency(frequency_unit='month', frequency_quantity=1), start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -359,8 +354,7 @@ def setUp(self) -> None: end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")} self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency_unit='month', - frequency_quantity=1, + frequency=Frequency(frequency_unit='month', frequency_quantity=1), start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -414,8 +408,7 @@ def test_get_ds_weekly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2022-01-01 12:30"), periods=8, - frequency_unit='W', - frequency_quantity=1) + frequency=Frequency(frequency_unit="W", frequency_quantity=1)) pd.testing.assert_index_equal(expected_ds, ds_indices) def test_get_ds_hourly(self): @@ -429,8 +422,7 @@ def test_get_ds_hourly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2021-12-10 09:23"), periods=10, - frequency_unit='H', - frequency_quantity=1) + frequency=Frequency(frequency_unit="h", frequency_quantity=1)) pd.testing.assert_index_equal(expected_ds, ds_indices) @@ -447,7 +439,7 @@ def setUp(self) -> None: self.pickled_model = pickle.dumps(model) def test_mlflow_arima_log_model(self): - arima_model = ArimaModel(self.pickled_model, horizon=1, frequency_unit='d', frequency_quantity=1, + arima_model = ArimaModel(self.pickled_model, horizon=1, frequency=Frequency(frequency_unit="d", frequency_quantity=1), start_ds=pd.to_datetime("2020-10-01"), end_ds=pd.to_datetime("2020-10-09"), time_col="date") with mlflow.start_run() as run: @@ -472,8 +464,7 @@ def test_mlflow_arima_log_model_multiseries(self): end_ds_dict = {("1",): pd.Timestamp("2020-10-09"), ("2",): pd.Timestamp("2020-10-09")} multiseries_arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency_unit='d', - frequency_quantity=1, + frequency=Frequency(frequency_unit='d', frequency_quantity=1), start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -522,7 +513,7 @@ def setUp(self) -> None: self.quantity_model_pairs = [] for frequency_quantity in frequency_quantities: - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity)) df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -532,8 +523,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.quantity_model_pairs.append((frequency_quantity, ArimaModel(pickled_model, horizon=self.horizon, - frequency_unit=self.freq, - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity), start_ds=self.start_ds, end_ds=dates.max(), time_col="date"))) @@ -551,8 +541,7 @@ def test_predict_timeseries_success(self): expected_ds = AbstractArimaModel._get_ds_indices( self.start_ds, periods=self.num_rows + 
self.horizon, - frequency_unit=self.freq, - frequency_quantity=frequency_quantity) + frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity)) self.assertTrue(expected_columns.issubset(set(forecast_pd.columns))) self.assertEqual(10, forecast_pd.shape[0]) pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"]) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py index b79ab96..c226365 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py @@ -22,6 +22,7 @@ import numpy as np import pmdarima as pm +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.pmdarima.training import ArimaEstimator from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP @@ -75,8 +76,7 @@ def test_fit_success(self): ['min', 15, self.df_with_15_minute_interval, [1]], ['min', 30, self.df_with_30_minute_interval, [1]]]: arima_estimator = ArimaEstimator(horizon=1, - frequency_unit=freq, - frequency_quantity=frequancy_quantity, + frequency=Frequency(frequency_unit=freq, frequency_quantity=frequancy_quantity), metric="smape", seasonal_periods=seasonal_periods, num_folds=2) @@ -87,8 +87,7 @@ def test_fit_success(self): def test_fit_success_with_exogenous(self): arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[1, 7], num_folds=2, @@ -107,8 +106,7 @@ def test_fit_success_with_split_cutoff(self): ['min', 15, self.df_with_15_minute_interval, '2020-07-05 01:30:00'], ['min', 30, self.df_with_30_minute_interval, '2020-07-05 03:00:00']]: arima_estimator = ArimaEstimator(horizon=1, - frequency_unit=freq, - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit=freq, frequency_quantity=frequency_quantity), metric="smape", seasonal_periods=[1, 7], num_folds=2, @@ -119,8 +117,7 @@ def test_fit_skip_too_long_seasonality(self): arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[3, 14], num_folds=2) @@ -132,8 +129,7 @@ def test_fit_horizon_truncation(self, mock_generate_cutoffs): period = 2 arima_estimator = ArimaEstimator(horizon=100, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[period], num_folds=2) @@ -150,8 +146,7 @@ def test_fit_horizon_truncation_one_cutoff(self, mock_fit_predict): period = 2 arima_estimator = ArimaEstimator(horizon=100, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[period], num_folds=2) @@ -169,8 +164,7 @@ def test_fit_success_with_failed_seasonal_periods(self): # generate_cutoffs will fail with m=30 because there is not enough data # The fit method still succeeds because m=1 succeeds arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape",
seasonal_periods=[1, 7, 30], num_folds=2) @@ -180,8 +174,7 @@ def test_fit_success_with_failed_seasonal_periods(self): def test_fit_failure_inconsistent_frequency(self): arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="W", - frequency_quantity=1, + frequency=Frequency(frequency_unit="W", frequency_quantity=1), metric="smape", seasonal_periods=[1], num_folds=2) @@ -190,8 +183,7 @@ def test_fit_failure_inconsistent_frequency(self): def test_fit_failure_no_succeeded_model(self): arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[30], num_folds=2) @@ -201,7 +193,7 @@ def test_fit_failure_no_succeeded_model(self): def test_fit_predict_success(self): cutoffs = [pd.to_datetime("2020-07-11")] arima_estimator = ArimaEstimator(horizon=1, - frequency_unit="d", + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", seasonal_periods=[30], num_folds=2) @@ -218,7 +210,7 @@ def test_fill_missing_time_steps(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit=frequency, frequency_quantity=1)) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) @@ -232,7 +224,7 @@ def test_fill_missing_time_steps_with_exogenous(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12), "x": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit=frequency, frequency_quantity=1)) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertTrue(df_filled["x"][index] == df_filled["x"][index - 1]) @@ -245,7 +237,7 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self): ds = pd.date_range(start=start_ds, periods=12, freq=pd.DateOffset(**{'minutes': quantity})) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit='min', frequency_quantity=quantity) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit="min", frequency_quantity=quantity)) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) @@ -253,14 +245,14 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self): def test_validate_ds_freq_matched_frequency(self): ArimaEstimator._validate_ds_freq(self.df, frequency_unit='D', frequency_quantity=1) ArimaEstimator._validate_ds_freq(self.df_monthly, frequency_unit='month', frequency_quantity=1) - ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=5) - ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency_unit='min', frequency_quantity=10) - 
ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency_unit='min', frequency_quantity=15) - ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency_unit='min', frequency_quantity=30) + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=5)) + ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=10)) + ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=15)) + ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=30)) def test_validate_ds_freq_unmatched_frequency(self): with pytest.raises(ValueError, match="includes different frequency"): - ArimaEstimator._validate_ds_freq(self.df, frequency_unit='W', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit="W", frequency_quantity=1)) with pytest.raises(ValueError, match="includes different frequency"): ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=10) diff --git a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py index 7b7b924..c613001 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py @@ -20,6 +20,7 @@ from prophet import Prophet +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.utils import generate_cutoffs from databricks.automl_runtime.forecast.prophet.diagnostics import cross_validation @@ -43,7 +44,7 @@ def test_cross_validation_success(self): cutoffs = generate_cutoffs( self.X, horizon=3, - frequency_unit="MS", + frequency=Frequency(frequency_unit="MS", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3, diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py index b1afbe4..5370716 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py @@ -23,6 +23,7 @@ import pandas as pd from hyperopt import hp +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.prophet.forecast import ProphetHyperoptEstimator @@ -79,7 +80,7 @@ def setUp(self) -> None: def test_sequential_training(self): hyperopt_estim = ProphetHyperoptEstimator( horizon=1, - frequency_unit="d", + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", @@ -111,7 +112,7 @@ def test_monthly_sequential_training(self): ) for freq, df in [['MS', self.df_string_monthly_time]]: hyperopt_estim = ProphetHyperoptEstimator(horizon=1, - frequency_unit=freq, + frequency=Frequency(frequency_unit=freq, frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", @@ -143,8 +144,7 @@ def test_sequential_training_with_multiple_frequency_quantities(self): [self.df_with_15_minute_interval, 15, "min"], [self.df_with_30_minute_interval, 30, "min"]]: hyperopt_estim = ProphetHyperoptEstimator(horizon=1, - frequency_unit=frequency_unit, - frequency_quantity=frequency_quantity, + 
frequency=Frequency(frequency_unit=frequency_unit, frequency_quantity=frequency_quantity), metric="smape", interval_width=0.8, country_holidays="US", @@ -174,7 +174,7 @@ def test_training_with_extra_regressors(self): pd.Series(np.random.randn(self.num_rows), name="f2"), ], axis=1) hyperopt_estim = ProphetHyperoptEstimator(horizon=1, - frequency_unit="d", + frequency=Frequency(frequency_unit="d", frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", @@ -198,7 +198,7 @@ def test_training_with_split_cutoff(self): ['Y', self.df_string_annually_time, '2021-01-15 00:00:00', 5e-1]] for freq, df, split_cutoff, delta in test_spaces: hyperopt_estim = ProphetHyperoptEstimator(horizon=1, - frequency_unit=freq, + frequency=Frequency(frequency_unit=freq, frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", @@ -227,7 +227,7 @@ def test_training_with_split_cutoff(self): def test_horizon_truncation(self, mock_partial, mock_trials, mock_fmin): hyperopt_estim = ProphetHyperoptEstimator( horizon=100, - frequency_unit="d", + frequency=Frequency(frequency_unit="D", frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", @@ -252,7 +252,7 @@ def test_no_horizon_truncation(self, mock_partial, mock_trials, mock_fmin): num_folds = 2 hyperopt_estim = ProphetHyperoptEstimator( horizon=horizon, - frequency_unit="d", + frequency=Frequency(frequency_unit="D", frequency_quantity=1), metric="smape", interval_width=0.8, country_holidays="US", diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py index 2699705..533e4d8 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py @@ -26,6 +26,7 @@ from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import ErrorCode, INTERNAL_ERROR +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.prophet.model import ( mlflow_prophet_log_model, MultiSeriesProphetModel, @@ -80,7 +81,7 @@ def setUpClass(cls) -> None: cls.model = model_from_json(cls.model_json) def test_model_save_and_load(self): - prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds") with mlflow.start_run() as run: mlflow_prophet_log_model(prophet_model) @@ -110,7 +111,7 @@ def test_make_future_dataframe(self): # don't have full support yet. 
if OFFSET_ALIAS_MAP[feq_unit] in ['YS', 'MS', 'QS']: continue - prophet_model = ProphetModel(self.model_json, 1, feq_unit, 1, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit=feq_unit, frequency_quantity=1), "ds") future_df = prophet_model.make_future_dataframe(1) offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[feq_unit]] expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg) @@ -120,7 +121,7 @@ def test_make_future_dataframe(self): def test_make_future_dataframe_with_multiple_frequency_quantities(self): for frequency_quantity in [1, 5, 10, 15, 30]: - prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), "ds") future_df = prophet_model.make_future_dataframe(1) offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP["min"]] expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg)*frequency_quantity @@ -129,7 +130,7 @@ def test_make_future_dataframe_with_multiple_frequency_quantities(self): f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}") def test_predict_success_datetime_date(self): - prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds") test_df = pd.DataFrame( {"ds": [datetime.date(2020, 10, 8), datetime.date(2020, 12, 10)]} ) @@ -141,7 +142,7 @@ def test_predict_success_datetime_date(self): ) # check the input dataframe is unchanged def test_predict_success_string(self): - prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds") test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]}) expected_test_df = test_df.copy() yhat = prophet_model.predict(None, test_df) @@ -152,7 +153,7 @@ def test_predict_success_string(self): def test_predict_multiple_frequency_quantities(self): for frequency_quantity in [1, 5, 10, 15, 30]: - prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), "ds") test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]}) expected_test_df = test_df.copy() yhat = prophet_model.predict(None, test_df) @@ -162,7 +163,7 @@ def test_predict_multiple_frequency_quantities(self): ) # check the input dataframe is unchanged def test_validate_predict_cols(self): - prophet_model = ProphetModel(self.model_json, 1, "d", 1, "time") + prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "time") test_df = pd.DataFrame( { "date": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")], @@ -194,8 +195,7 @@ def setUpClass(cls) -> None: timeseries_starts=cls.multi_series_start, timeseries_end="2020-07-25", horizon=1, - frequency_unit="days", - frequency_quantity=1, + frequency=Frequency(frequency_unit="days", frequency_quantity=1), time_col="time", id_cols=["id"], ) @@ -262,8 +262,7 @@ def test_model_save_and_load_multi_ids(self): multi_series_start, "2020-07-25", 1, - "days", - 1, + Frequency(frequency_unit="days", frequency_quantity=1), "time", ["id1", "id2"], ) @@ -325,8 +324,7 @@ def test_validate_predict_cols(self): timeseries_starts=self.multi_series_start, timeseries_end="2020-07-25", horizon=1, - 
frequency_unit="days", - frequency_quantity=1, + frequency=Frequency(frequency_unit="days", frequency_quantity=1), time_col="ds", id_cols=["id1"], ) @@ -370,8 +368,7 @@ def test_make_future_dataframe_multiple_frequency_quantities(self): timeseries_starts=self.multi_series_start, timeseries_end="2020-07-25", horizon=1, - frequency_unit="min", - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), time_col="time", id_cols=["id"], ) @@ -390,8 +387,7 @@ def test_make_future_dataframe_multi_ids(self): multi_series_start, "2020-07-25", 1, - "days", - 1, + Frequency(frequency_unit="days", frequency_quantity=1), "time", ["id1", "id2"], ) diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index c043da9..38055c0 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -20,6 +20,7 @@ import pandas as pd from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.utils import \ generate_cutoffs, get_validation_horizon, calculate_period_differences, \ is_frequency_consistency, make_future_dataframe, make_single_future_dataframe, \ @@ -31,96 +32,96 @@ class TestGetValidationHorizon(unittest.TestCase): def test_no_truncate(self): # 5 day horizon is OK for dataframe with 30 days of data df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-30", freq="D"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 5, "D") + validation_horizon = get_validation_horizon(df, 5, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertEqual(validation_horizon, 5) # 2 week horizon is OK for dataframe with ~12 weeks of data df = pd.DataFrame(pd.date_range(start="2020-01-01", end="2020-04-01", freq="W"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 2, "W") + validation_horizon = get_validation_horizon(df, 2, Frequency(frequency_unit="W", frequency_quantity=1)) self.assertEqual(validation_horizon, 2) def test_truncate(self): # for dataframe with 19 days of data, maximum horizon is 4 days df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-20", freq="D"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "D") + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertEqual(validation_horizon, 4) # for dataframe with 20 days of data, maximum horizon is 5 days df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-21", freq="D"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "D") + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertEqual(validation_horizon, 5) # for dataframe with 21 days of data, maximum horizon is 5 days df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-22", freq="D"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "D") + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertEqual(validation_horizon, 5) # for dataframe with just under one year of data, maximum horizon is 12 weeks df = pd.DataFrame(pd.date_range(start="2020-01-01", end="2020-12-31", freq="W"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 20, "W") + 
validation_horizon = get_validation_horizon(df, 20, Frequency(frequency_unit="W", frequency_quantity=1)) self.assertEqual(validation_horizon, 12) # for dataframe with just one year of data, maximum horizon is 3 months df = pd.DataFrame(pd.date_range(start="2020-01-14", periods=13, freq=pd.DateOffset(months=1)), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 17, "MS") + validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="MS", frequency_quantity=1)) self.assertEqual(validation_horizon, 3) # for dataframe with 8 years of data, maximum horizon is 2 years df = pd.DataFrame(pd.date_range(start="2012-01-14", periods=9, freq=pd.DateOffset(years=1)), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 17, "YS") + validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="YS", frequency_quantity=1)) self.assertEqual(validation_horizon, 2) # for dataframe with 12 quarters of data, maximum horizon is 3 quarters. df = pd.DataFrame(pd.date_range(start="2012-01-14", periods=13, freq=pd.DateOffset(months=3)), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 17, "QS") + validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="QS", frequency_quantity=1)) self.assertEqual(validation_horizon, 3) # prevent date overflow. There are 20 days of data, so maximum horizon is 5 days df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-21", freq="D"), columns=["ds"]) # pd.Timestamp.max = Timestamp('2262-04-11 23:47:16.854775807') - validation_horizon = get_validation_horizon(df, 1000000, "D") + validation_horizon = get_validation_horizon(df, 1000000, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertEqual(validation_horizon, 5) def test_truncate_logs(self): with self.assertLogs(logger="databricks.automl_runtime.forecast", level="INFO") as cm: df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-20", freq="D"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "D") + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1)) self.assertIn("too long relative to dataframe's timedelta. Validation horizon will be reduced to", cm.output[0]) def test_frequency_quantity(self): # Since we only add extra support for 5 min, 10 min, 15 min and 30 min for now, only these test cases are added. # We need to add more test cases when we add more support.
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 5) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=5)) self.assertEqual(validation_horizon, 10) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="5T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 5) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=5)) self.assertEqual(validation_horizon, 6) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="10T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 10) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=10)) self.assertEqual(validation_horizon, 10) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="10T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 10) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=10)) self.assertEqual(validation_horizon, 3) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="15T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 15) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=15)) self.assertEqual(validation_horizon, 10) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="15T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 15) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=15)) self.assertEqual(validation_horizon, 2) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="30T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 30) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=30)) self.assertEqual(validation_horizon, 10) df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="30T"), columns=["ds"]) - validation_horizon = get_validation_horizon(df, 10, "min", 30) + validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=30)) self.assertEqual(validation_horizon, 1) class TestGenerateCutoffs(unittest.TestCase): @@ -131,11 +132,11 @@ def setUp(self) -> None: ).rename_axis("y").reset_index() def test_generate_cutoffs_success(self): - cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=3, seasonal_period=7) + cutoffs = generate_cutoffs(self.X, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=7) self.assertEqual([pd.Timestamp('2020-08-16 00:00:00'), pd.Timestamp('2020-08-19 12:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs) def test_generate_cutoffs_success_large_num_folds(self): - cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=20, seasonal_period=1) + cutoffs = generate_cutoffs(self.X, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=20, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-22 
12:00:00'), pd.Timestamp('2020-07-26 00:00:00'), pd.Timestamp('2020-07-29 12:00:00'), @@ -151,7 +152,7 @@ def test_generate_cutoffs_success_with_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="D", num_folds=5, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=5, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-09-13 00:00:00'), pd.Timestamp('2020-09-16 00:00:00'), pd.Timestamp('2020-09-19 00:00:00'), @@ -167,10 +168,10 @@ def test_generate_cutoffs_success_hourly(self): pd.Timestamp('2020-07-07 11:00:00'), pd.Timestamp('2020-07-07 14:00:00'), pd.Timestamp('2020-07-07 17:00:00')] - cutoffs = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5, seasonal_period=24) + cutoffs = generate_cutoffs(df, horizon=6, frequency=Frequency(frequency_unit="H", frequency_quantity=1), num_folds=5, seasonal_period=24) self.assertEqual(expected_cutoffs, cutoffs) - cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5, + cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, frequency=Frequency(frequency_unit="H", frequency_quantity=1), num_folds=5, seasonal_period=1, seasonal_unit="D") self.assertEqual(expected_cutoffs, cutoffs_different_seasonal_unit) @@ -178,62 +179,62 @@ def test_generate_cutoffs_success_weekly(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=4, frequency_unit="W", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=4, frequency=Frequency(frequency_unit="W", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-16 00:00:00'), pd.Timestamp('2021-05-30 00:00:00')], cutoffs) def test_generate_cutoffs_failure_horizon_too_large(self): with self.assertRaisesRegex(ValueError, "Less data than horizon after initial window. 
" "Make horizon shorter."): - generate_cutoffs(self.X, horizon=20, frequency_unit="D", num_folds=3, seasonal_period=1) + generate_cutoffs(self.X, horizon=20, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=1) def test_generate_cutoffs_less_data(self): with self.assertRaisesRegex(ValueError, "Less data than horizon."): - generate_cutoffs(self.X, horizon=100, frequency_unit="D", num_folds=3, seasonal_period=1) + generate_cutoffs(self.X, horizon=100, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=1) def test_generate_cutoffs_success_monthly(self): df = pd.DataFrame( pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=2, frequency_unit="MS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=2, frequency=Frequency(frequency_unit="MS", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-08-12 00:00:00'), pd.Timestamp('2021-9-12 00:00:00'), pd.Timestamp('2021-10-12 00:00:00')], cutoffs) def test_generate_cutoffs_success_quaterly(self): df = pd.DataFrame( pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="QS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="QS", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-10-12 00:00:00'), pd.Timestamp('2022-01-12 00:00:00'), pd.Timestamp('2022-04-12 00:00:00')], cutoffs) def test_generate_cutoffs_success_annualy(self): df = pd.DataFrame( pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="YS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="YS", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2018-07-14 00:00:00'), pd.Timestamp('2019-07-14 00:00:00'), pd.Timestamp('2020-07-14 00:00:00')], cutoffs) def test_generate_cutoffs_success_with_multiple_frequency_quantities(self): df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:44:00'), pd.Timestamp('2020-07-01 23:49:00'), pd.Timestamp('2020-07-01 23:54:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:39:00'), pd.Timestamp('2020-07-01 23:49:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] 
).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:14:00'), pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:44:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 22:29:00'), pd.Timestamp('2020-07-01 22:59:00'), pd.Timestamp('2020-07-01 23:29:00')], cutoffs) class TestTestGenerateCustomCutoffs(unittest.TestCase): @@ -246,81 +247,81 @@ def test_generate_custom_cutoffs_success_hourly(self): pd.Timestamp('2020-07-07 14:00:00'), pd.Timestamp('2020-07-07 15:00:00'), pd.Timestamp('2020-07-07 16:00:00')] - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="H", split_cutoff=pd.Timestamp('2020-07-07 13:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="H", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-07-07 13:00:00')) self.assertEqual(expected_cutoffs, cutoffs) def test_generate_custom_cutoffs_success_daily(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", end="2020-08-30", freq='d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-21 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-08-21 00:00:00')) self.assertEqual([pd.Timestamp('2020-08-21 00:00:00'), pd.Timestamp('2020-08-22 00:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_small_horizon(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", end="2020-08-30", freq='2d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-26 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-08-26 00:00:00')) self.assertEqual([pd.Timestamp('2020-08-27 00:00:00'), pd.Timestamp('2020-08-29 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_weekly(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="W", split_cutoff=pd.Timestamp('2021-04-25 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="W", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-04-25 00:00:00')) self.assertEqual([pd.Timestamp('2021-04-25 00:00:00'), pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-09 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_monthly(self): df = pd.DataFrame( pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, 
frequency_unit="MS", split_cutoff=pd.Timestamp('2021-03-12 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="MS", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-03-12 00:00:00')) self.assertEqual([pd.Timestamp('2021-03-12 00:00:00'), pd.Timestamp('2021-04-12 00:00:00'), pd.Timestamp('2021-05-12 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_quaterly(self): df = pd.DataFrame( pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="QS", split_cutoff=pd.Timestamp('2020-07-12 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="QS", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-07-12 00:00:00')) self.assertEqual([pd.Timestamp('2020-07-12 00:00:00'), pd.Timestamp('2020-10-12 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_annualy(self): df = pd.DataFrame( pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="YS", split_cutoff=pd.Timestamp('2012-07-14 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="YS", frequency_quantity=1), split_cutoff=pd.Timestamp('2012-07-14 00:00:00')) self.assertEqual([pd.Timestamp('2012-07-14 00:00:00'), pd.Timestamp('2013-07-14 00:00:00'), pd.Timestamp('2014-07-14 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_with_multiple_frequency_quantities(self): df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=5, split_cutoff=pd.Timestamp('2020-07-01 23:45:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=5), split_cutoff=pd.Timestamp('2020-07-01 23:45:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:45:00'), pd.Timestamp('2020-07-01 23:50:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=10, split_cutoff=pd.Timestamp('2020-07-01 23:30:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=10), split_cutoff=pd.Timestamp('2020-07-01 23:30:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:30:00'), pd.Timestamp('2020-07-01 23:40:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=15, split_cutoff=pd.Timestamp('2020-07-01 23:15:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=15), split_cutoff=pd.Timestamp('2020-07-01 23:15:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:15:00'), pd.Timestamp('2020-07-01 23:30:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, 
horizon=1, frequency_unit="min", frequency_quantity=30, split_cutoff=pd.Timestamp('2020-07-01 23:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=30), split_cutoff=pd.Timestamp('2020-07-01 23:00:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:00:00')], cutoffs) def test_generate_custom_cutoffs_success_with_small_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-09-17 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-09-17 00:00:00')) self.assertEqual([pd.Timestamp('2020-09-17 00:00:00'), pd.Timestamp('2020-09-18 00:00:00'), pd.Timestamp('2020-09-19 00:00:00')], cutoffs) @@ -329,7 +330,7 @@ def test_generate_custom_cutoffs_success_with_large_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='9d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2021-03-08 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-03-08 00:00:00')) self.assertEqual([pd.Timestamp('2021-03-08 00:00:00'), pd.Timestamp('2021-03-09 00:00:00'), pd.Timestamp('2021-03-12 00:00:00')], cutoffs) @@ -349,7 +350,7 @@ def test_calculate_period_differences_evenly(self): ) }) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'month', 1 + x.start_time, x.end_time, Frequency(frequency_unit="month", frequency_quantity=1) ), axis=1) self.assertTrue((periods == pd.Series([4, 5, 12])).all()) @@ -363,11 +364,11 @@ def test_calculate_period_differences_unevenly(self): ) }) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'month', 1 + x.start_time, x.end_time, Frequency(frequency_unit="month", frequency_quantity=1) ), axis=1) self.assertTrue((periods == pd.Series([4, 5, 0])).all()) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'day', 1 + x.start_time, x.end_time, Frequency(frequency_unit="day", frequency_quantity=1) ), axis=1) self.assertTrue((periods == pd.Series([118, 151, 0])).all()) @@ -378,7 +379,7 @@ def test_calculate_period_differences_with_frequency_quantity(self): 'end_time': pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T') }) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'min', frequency_quantity + x.start_time, x.end_time, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity) ), axis=1) self.assertTrue((periods == pd.Series([240//frequency_quantity]*10)).all()) @@ -391,12 +392,12 @@ def test_frequency_consistency(self): ) start_scalar = pd.to_datetime('2021-01-14') end_scalar = pd.to_datetime('2021-05-16') - self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, 'month', 1)) + self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, Frequency(frequency_unit="month", frequency_quantity=1))) self.assertTrue(start_time.apply( - lambda x: is_frequency_consistency(x, end_scalar, 'day', 1) + lambda x: is_frequency_consistency(x, end_scalar, Frequency(frequency_unit="day", frequency_quantity=1)) ).all()) 
self.assertTrue(end_time.apply( - lambda x: is_frequency_consistency(start_scalar, x, 'month', 1) + lambda x: is_frequency_consistency(start_scalar, x, Frequency(frequency_unit="month", frequency_quantity=1)) ).all()) def test_frequency_consistency_with_frequency_quantity(self): @@ -404,10 +405,10 @@ def test_frequency_consistency_with_frequency_quantity(self): start_time = pd.date_range(start="2020-07-01 00:00:00", periods=10, freq=f'{frequency_quantity}T') end_time = pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T') self.assertTrue(start_time.to_series().apply( - lambda x: is_frequency_consistency(x, end_time[0], 'min', frequency_quantity) + lambda x: is_frequency_consistency(x, end_time[0], Frequency(frequency_unit="min", frequency_quantity=frequency_quantity)) ).all()) self.assertTrue(end_time.to_series().apply( - lambda x: is_frequency_consistency(start_time[0], x, 'min', frequency_quantity) + lambda x: is_frequency_consistency(start_time[0], x, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity)) ).all()) @@ -417,8 +418,7 @@ def test_make_single_future_dataframe(self): start_time=pd.to_datetime('2022-01-01'), end_time=pd.to_datetime('2022-01-04'), horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="D", frequency_quantity=1), include_history=False, column_name="test_date" ) @@ -430,8 +430,7 @@ def test_make_single_future_dataframe(self): start_time=pd.to_datetime('2022-01-01'), end_time=pd.to_datetime('2022-01-04'), horizon=1, - frequency_unit="d", - frequency_quantity=1, + frequency=Frequency(frequency_unit="D", frequency_quantity=1), include_history=True, column_name="test_date" ) @@ -447,8 +446,7 @@ def test_make_single_future_dataframe_with_different_freq(self): start_time=start_time, end_time=end_time, horizon=1, - frequency_unit=freq, - frequency_quantity=1, + frequency=Frequency(frequency_unit=freq, frequency_quantity=1), include_history=True, column_name="test_date" ) @@ -464,8 +462,7 @@ def test_make_single_future_dataframe_with_different_frequency_quantities(self): start_time=start_time, end_time=end_time, horizon=1, - frequency_unit="min", - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), include_history=True, column_name="test_date" ) @@ -497,8 +494,7 @@ def test_make_future_dataframe(self, start_time, end_time, start_time=start_time, end_time=end_time, horizon=1, - frequency_unit=frequency_unit, - frequency_quantity=frequency_quantity, + frequency=Frequency(frequency_unit=frequency_unit, frequency_quantity=frequency_quantity), groups=groups, identity_column_names=identity_column_names, ) From 43316acd47883480ce96926d30b050debe215ef1 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Wed, 26 Feb 2025 17:14:46 -0800 Subject: [PATCH 2/6] fix tests --- .../automl_runtime/forecast/frequency.py | 5 +++-- .../forecast/pmdarima/training.py | 2 +- .../forecast/deepar/utils_test.py | 22 ++++++++----------- .../forecast/pmdarima/diagnostics_test.py | 2 +- .../forecast/pmdarima/model_test.py | 6 ++--- .../forecast/pmdarima/training_test.py | 6 ++--- 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py index b7efa89..15e02df 100644 --- a/runtime/databricks/automl_runtime/forecast/frequency.py +++ b/runtime/databricks/automl_runtime/forecast/frequency.py @@ -26,7 +26,7 @@ class Frequency: 
frequency_quantity (int): The number of frequency_units in the period. Valid frequency units: source of truth is OFFSET_ALIAS_MAP in forecast.__init__.py - - Weeks: "W" + - Weeks: "W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT"; the "W-*" forms are aliases for "W", used for DeepAR only - Days: "d", "D", "days", "day" - Hours: "hours", "hour", "hr", "h", "H" - Minutes: "m", "minute", "min", "minutes", "T" @@ -41,7 +41,8 @@ class Frequency: """ VALID_FREQUENCY_UNITS: ClassVar[Set[str]] = { - "W", "d", "D", "days", "day", "hours", "hour", "hr", "h", "H", + "W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT", + "d", "D", "days", "day", "hours", "hour", "hr", "h", "H", "m", "minute", "min", "minutes", "T", "S", "seconds", "sec", "second", "M", "MS", "month", "months", "Q", "QS", "quarter", "quarters", "Y", "YS", "year", "years" diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py index 9eb9db4..36507ab 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py @@ -89,7 +89,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: try: # this check mirrors the default behavior of prophet if history_periods < 2 * m: - _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""") + _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only span {history_periods} {self._frequency.frequency_quantity}{self._frequency.frequency_unit}") continue # Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the dataframe.
# However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each _frequency_unit, diff --git a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py index 3aebf1e..73df15a 100644 --- a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py @@ -18,6 +18,7 @@ import pandas as pd from parameterized import parameterized +from databricks.automl_runtime.forecast.frequency import Frequency from databricks.automl_runtime.forecast.deepar.utils import set_index_and_fill_missing_time_steps @@ -40,7 +41,7 @@ def test_single_series_filled(self): ) dropped_df = base_df.drop([4, 5]).reset_index(drop=True) - transformed_df = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1) + transformed_df = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1)) expected_df = base_df.copy() expected_df.loc[[4, 5], target_col] = float('nan') @@ -69,7 +70,7 @@ def test_multi_series_filled(self): dropped_df = pd.concat([dropped_base_df.copy(), dropped_base_df.copy()], ignore_index=True) dropped_df[id_col] = [1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2) - transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=[id_col]) + transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1), id_cols=[id_col]) self.assertEqual(transformed_df_dict.keys(), {"1", "2"}) expected_first_df = base_df.copy() @@ -101,7 +102,7 @@ def test_multi_series_multi_id_cols_filled(self): dropped_df[id_cols[0]] = ([1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2)) * 2 dropped_df[id_cols[1]] = [1] * (2 * (num_rows_per_ts - 2)) + [2] * (2 * (num_rows_per_ts - 2)) - transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=id_cols) + transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1), id_cols=id_cols) self.assertEqual(transformed_df_dict.keys(), {"1-1", "1-2", "2-1", "2-2"}) expected_first_df = base_df.copy() @@ -134,8 +135,7 @@ def test_single_series_week_day_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "W", # Weekly frequency **without** specifying Friday - 1 + Frequency(frequency_unit="W", frequency_quantity=1) # Weekly frequency **without** specifying Friday ) # Create expected dataframe @@ -170,8 +170,7 @@ def test_single_series_month_start_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS", # Monthly frequency - 1 + Frequency(frequency_unit="MS", frequency_quantity=1) # Monthly frequency ) # Create expected dataframe @@ -207,8 +206,7 @@ def test_single_series_month_mid_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS", - 1 + Frequency(frequency_unit="MS", frequency_quantity=1) ) # Create expected dataframe @@ -245,8 +243,7 @@ def test_single_series_month_end_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS", # Monthly frequency - 1 + Frequency(frequency_unit="MS", frequency_quantity=1) # Monthly frequency ) # Create expected dataframe @@ -278,8 +275,7 @@ def test_single_series_with_multiple_minute_index(self, frequency_quantity): transformed_df = 
set_index_and_fill_missing_time_steps( dropped_df, time_col, - "min", - frequency_quantity + Frequency(frequency_unit="min", frequency_quantity=frequency_quantity) ) # Create expected dataframe diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py index 5b90d86..fb637a6 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py @@ -44,7 +44,7 @@ class TestDiagnostics(unittest.TestCase): (df_with_exogenous, ["x1", "x2"]) ]) def test_cross_validation_success(self, df, exogenous_cols): - cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="d", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3) + cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="D", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3) train_df = df[df["ds"] <= cutoffs[0]].set_index("ds") y_train = train_df[["y"]] X_train = train_df.drop(["y"], axis=1) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py index ae1068b..cf6f99b 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py @@ -43,7 +43,7 @@ def setUp(self) -> None: self.horizon = 1 self.freq = 'W' self.frequency_quantity=1 - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -172,7 +172,7 @@ def setUp(self) -> None: self.horizon = 1 self.freq = 'W' self.frequency_quantity = 1 - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y"), @@ -422,7 +422,7 @@ def test_get_ds_hourly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2021-12-10 09:23"), periods=10, - frequency=Frequency(frequency_unit="h", frequency_quantity=1)) + frequency=Frequency(frequency_unit="H", frequency_quantity=1)) pd.testing.assert_index_equal(expected_ds, ds_indices) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py index c226365..dff8573 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py @@ -243,8 +243,8 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self): self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) def test_validate_ds_freq_matched_frequency(self): - ArimaEstimator._validate_ds_freq(self.df, frequency_unit='D', frequency_quantity=1) - ArimaEstimator._validate_ds_freq(self.df_monthly, frequency_unit='month', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df, 
frequency=Frequency(frequency_unit='D', frequency_quantity=1)) + ArimaEstimator._validate_ds_freq(self.df_monthly, frequency=Frequency(frequency_unit='month', frequency_quantity=1)) ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=5)) ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=10)) ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=15)) @@ -255,4 +255,4 @@ def test_validate_ds_freq_unmatched_frequency(self): ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit="W", frequency_quantity=1)) with pytest.raises(ValueError, match="includes different frequency"): - ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=10) + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit='min', frequency_quantity=10)) From 13be1f0c27254a9ece5bde34d7e01256172b7b4b Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Thu, 27 Feb 2025 10:08:29 -0800 Subject: [PATCH 3/6] pr comment --- .../automl_runtime/forecast/deepar/utils.py | 4 +-- .../automl_runtime/forecast/frequency.py | 26 +++++++++++++++++++ .../forecast/pmdarima/training.py | 2 +- .../automl_runtime/forecast/utils.py | 2 +- 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index c7593eb..59f5a17 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -85,7 +85,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, # We need to adjust the frequency_unit for pd.date_range if it is weekly, # otherwise it would always be "W-SUN" - if frequency.frequency_unit.upper() == "W": + if frequency.is_weekly(): weekday_name = total_min.strftime("%a").upper() # e.g., "FRI" adjusted_frequency = Frequency(frequency_unit=f"W-{weekday_name}", frequency_quantity=frequency.frequency_quantity) else: @@ -110,7 +110,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, # Fill in missing time steps between the min and max time steps df = df.reindex(valid_index) - if frequency.frequency_unit.upper() == "MS": + if frequency.is_monthly(): # Truncate the day of month to avoid issues with pandas frequency check df = df.to_period("M") diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py index 15e02df..6c07982 100644 --- a/runtime/databricks/automl_runtime/forecast/frequency.py +++ b/runtime/databricks/automl_runtime/forecast/frequency.py @@ -54,6 +54,9 @@ class Frequency: frequency_unit: str frequency_quantity: int + def __str__(self): + return f"{self.frequency_quantity}{self.frequency_unit}" + def __post_init__(self): if self.frequency_unit not in self.VALID_FREQUENCY_UNITS: raise ValueError(f"Invalid frequency unit: {self.frequency_unit}") @@ -71,3 +74,26 @@ def __post_init__(self): "Only 1 is allowed for this unit." 
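)

A minimal usage sketch of the Frequency dataclass as defined up to this point, relying only on the constructor validation and __str__ shown above ("fortnight" is an illustrative invalid unit, not a real alias):

    from databricks.automl_runtime.forecast.frequency import Frequency

    # A 5-minute frequency: "min" is in VALID_FREQUENCY_UNITS, and the tests in
    # this series construct minute frequencies with quantities 5, 10, 15 and 30.
    freq = Frequency(frequency_unit="min", frequency_quantity=5)
    assert str(freq) == "5min"  # __str__ renders "<quantity><unit>"

    # __post_init__ rejects units outside VALID_FREQUENCY_UNITS.
    try:
        Frequency(frequency_unit="fortnight", frequency_quantity=1)
    except ValueError as err:
        print(err)  # "Invalid frequency unit: fortnight"

The is_* predicates added next let call sites test a unit family without enumerating its aliases.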

From ee35b3cdf817623f1d8e436968b7c3447e781ea7 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:10:14 -0800
Subject: [PATCH 4/6] fix

---
 runtime/databricks/automl_runtime/forecast/deepar/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
index 59f5a17..227a086 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
@@ -33,7 +33,7 @@ def validate_and_generate_index(df: pd.DataFrame,
     :return: A complete time index covering the full range of the dataset.
     :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency.
     """
-    if frequency.frequency_unit.upper() != "MS":
+    if not frequency.is_monthly():
         return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency.frequency_quantity}{frequency.frequency_unit}")
 
     df[time_col] = pd.to_datetime(df[time_col])  # Ensure datetime format
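
The one-line change in this commit is subtler than it looks: "month".upper() is "MONTH", not "MS", so the old guard sent a monthly series spelled "month" or "M" down the generic pd.date_range branch, while is_monthly() covers every spelling the class accepts. A quick comparison of the two guards:

    # Old guard vs. new guard over the monthly spellings accepted by is_monthly():
    for unit in ("MS", "M", "month", "months"):
        old_is_monthly = unit.upper() == "MS"   # True only for "MS"
        new_is_monthly = unit in {"M", "MS", "month", "months"}
        print(f"{unit!r}: old={old_is_monthly}, new={new_is_monthly}")
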

From 8f953eb04e4818603ca3bc7bc6951097b974ca29 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:16:43 -0800
Subject: [PATCH 5/6] fix

---
 .../automl_runtime/forecast/prophet/forecast_test.py     | 4 ++--
 .../tests/automl_runtime/forecast/prophet/model_test.py  | 8 ++++----
 runtime/tests/automl_runtime/forecast/utils_test.py      | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
index 5370716..0d8c0a3 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
@@ -227,7 +227,7 @@ def test_training_with_split_cutoff(self):
     def test_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
         hyperopt_estim = ProphetHyperoptEstimator(
             horizon=100,
-            frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+            frequency=Frequency(frequency_unit="d", frequency_quantity=1),
             metric="smape",
             interval_width=0.8,
             country_holidays="US",
@@ -252,7 +252,7 @@ def test_no_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
         num_folds = 2
         hyperopt_estim = ProphetHyperoptEstimator(
             horizon=horizon,
-            frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+            frequency=Frequency(frequency_unit="d", frequency_quantity=1),
             metric="smape",
             interval_width=0.8,
             country_holidays="US",
diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
index 533e4d8..d7ac89b 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
@@ -81,7 +81,7 @@ def setUpClass(cls) -> None:
         cls.model = model_from_json(cls.model_json)
 
     def test_model_save_and_load(self):
-        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
 
         with mlflow.start_run() as run:
             mlflow_prophet_log_model(prophet_model)
@@ -130,7 +130,7 @@ def test_make_future_dataframe_with_multiple_frequency_quantities(self):
                             f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}")
 
     def test_predict_success_datetime_date(self):
-        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
         test_df = pd.DataFrame(
             {"ds": [datetime.date(2020, 10, 8), datetime.date(2020, 12, 10)]}
         )
@@ -142,7 +142,7 @@ def test_predict_success_datetime_date(self):
         ) # check the input dataframe is unchanged
 
     def test_predict_success_string(self):
-        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
         test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]})
         expected_test_df = test_df.copy()
         yhat = prophet_model.predict(None, test_df)
@@ -163,7 +163,7 @@ def test_predict_multiple_frequency_quantities(self):
         ) # check the input dataframe is unchanged
 
     def test_validate_predict_cols(self):
-        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "time")
+        prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "time")
         test_df = pd.DataFrame(
             {
                 "date": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")],
diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py
index 38055c0..560b269 100644
--- a/runtime/tests/automl_runtime/forecast/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/utils_test.py
@@ -418,7 +418,7 @@ def test_make_single_future_dataframe(self):
             start_time=pd.to_datetime('2022-01-01'),
             end_time=pd.to_datetime('2022-01-04'),
             horizon=1,
-            frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+            frequency=Frequency(frequency_unit="d", frequency_quantity=1),
             include_history=False,
             column_name="test_date"
         )
@@ -430,7 +430,7 @@ def test_make_single_future_dataframe(self):
             start_time=pd.to_datetime('2022-01-01'),
             end_time=pd.to_datetime('2022-01-04'),
             horizon=1,
-            frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+            frequency=Frequency(frequency_unit="d", frequency_quantity=1),
             include_history=True,
             column_name="test_date"
         )
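
This commit only flips the daily unit spelling in the prophet and utils test fixtures from "D" to "d". Both spellings appear in is_daily() above, so the change presumably standardizes the fixtures on one spelling rather than altering behavior; OFFSET_ALIAS_MAP, imported in utils.py but not shown in this series, would be the natural place where either spelling collapses to a canonical pandas alias. A hypothetical normalization of that shape:

    # Hypothetical map in the spirit of OFFSET_ALIAS_MAP (its real contents are
    # not shown in this patch series): every accepted daily spelling maps to "D".
    TOY_OFFSET_ALIAS_MAP = {"d": "D", "D": "D", "day": "D", "days": "D"}

    def normalize_unit(unit: str) -> str:
        try:
            return TOY_OFFSET_ALIAS_MAP[unit]
        except KeyError:
            raise ValueError(f"Invalid frequency unit: {unit}") from None

    assert normalize_unit("d") == normalize_unit("D") == "D"
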

From 1f81dbf935e795af0b605c0eee33424f66bc4607 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:19:51 -0800
Subject: [PATCH 6/6] fix

---
 runtime/databricks/automl_runtime/forecast/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index d02c48b..76ebd22 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -16,7 +16,7 @@
 import logging
 from typing import Dict, List, Optional, Tuple, Union
 from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP,\
-    QUATERLY_OFFSET_ALIAS, NON_DAILY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP
+    QUATERLY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP
 from databricks.automl_runtime.forecast.frequency import Frequency
 
 import pandas as pd
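
With NON_DAILY_OFFSET_ALIAS gone, the predicates are the only thing steering the generate_cutoffs arithmetic touched in PATCH 3: the cutoff stride is a pd.DateOffset built from DATE_OFFSET_KEYWORD_MAP and scaled by frequency_quantity and period, and period must be truncated to an integer for monthly, quarterly, and yearly units because a DateOffset cannot be scaled by a fraction. A worked sketch, assuming the map's monthly entry has the shape {"months": 1}:

    import pandas as pd

    offset_kwargs = {"months": 1}   # assumed shape of a DATE_OFFSET_KEYWORD_MAP entry
    horizon, frequency_quantity = 3, 1

    period = max(0.5 * horizon, 1)  # 1.5 cutoff periods
    period = int(period)            # months must be whole, so truncate to 1
    stride = pd.DateOffset(**offset_kwargs) * frequency_quantity * period
    print(pd.Timestamp("2025-06-30") - stride)  # 2025-05-30 00:00:00

Multiplying a pd.DateOffset by 1.5 raises, which is exactly why the int(period) truncation sits on the month/quarter/year branch while the sub-daily branch keeps the fractional period.
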