From a4c11e84e68c6343cf6b46f669b3bb8b9887de7b Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Mon, 10 Feb 2025 19:51:58 -0800 Subject: [PATCH 01/11] prototype succeeded --- .../automl_runtime/forecast/deepar/model.py | 5 ++- .../automl_runtime/forecast/deepar/utils.py | 7 +-- .../automl_runtime/forecast/pmdarima/model.py | 29 +++++++------ .../forecast/pmdarima/training.py | 27 ++++++------ .../forecast/prophet/forecast.py | 22 +++++++--- .../automl_runtime/forecast/prophet/model.py | 12 ++++-- .../automl_runtime/forecast/utils.py | 43 +++++++++++-------- 7 files changed, 89 insertions(+), 56 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py index 92636406..17a99152 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/model.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py @@ -42,7 +42,7 @@ class DeepARModel(ForecastModel): DeepAR mlflow model wrapper for forecasting. """ - def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, + def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, frequency_quantity: int, num_samples: int, target_col: str, time_col: str, id_cols: Optional[List[str]] = None) -> None: @@ -51,6 +51,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, :param model: DeepAR model :param horizon: the number of periods to forecast forward :param frequency: the frequency of the time series + :param frequency_quantity: the frequency quantity of the time series :param num_samples: the number of samples to draw from the distribution :param target_col: the target column name :param time_col: the time column name @@ -61,6 +62,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, self._model = model self._horizon = horizon self._frequency = frequency + self._frequency_quantity = frequency_quantity self._num_samples = num_samples self._target_col = target_col self._time_col = time_col @@ -129,6 +131,7 @@ def predict_samples(self, model_input_transformed = set_index_and_fill_missing_time_steps(model_input, self._time_col, self._frequency, + self._frequency_quantity, self._id_cols) test_ds = PandasDataset(model_input_transformed, target=self._target_col) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index e49b5d08..a169848e 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -18,7 +18,7 @@ import pandas as pd -def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str): +def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str, frequency_quantity: int): """ Generate a complete time index for the given DataFrame based on the specified frequency. - Ensures the time column is in datetime format. @@ -31,7 +31,7 @@ def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str) :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency. 
""" if frequency.upper() != "MS": - return pd.date_range(df[time_col].min(), df[time_col].max(), freq=frequency) + return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency_quantity}{frequency}") df[time_col] = pd.to_datetime(df[time_col]) # Ensure datetime format @@ -64,6 +64,7 @@ def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str) def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, frequency: str, + frequency_quantity: int, id_cols: Optional[List[str]] = None): """ Transform the input dataframe to an acceptable format for the GluonTS library. @@ -86,7 +87,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, weekday_name = total_min.strftime("%a").upper() # e.g., "FRI" frequency = f"W-{weekday_name}" - new_index_full = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency) + new_index_full = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency, frequency_quantity=frequency_quantity) if id_cols is not None: df_dict = {} diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py index 64e5377d..d9a4627d 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py @@ -64,7 +64,7 @@ def model_env(self): return ARIMA_CONDA_ENV @staticmethod - def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: str) -> pd.DatetimeIndex: + def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: str, frequency_quantity: int) -> pd.DatetimeIndex: """ Create a DatetimeIndex with specified starting time and frequency, whose length is the given periods. :param start_ds: the pd.Timestamp as the start of the DatetimeIndex. @@ -75,7 +75,7 @@ def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: str) -> pd. ds_indices = pd.date_range( start=start_ds, periods=periods, - freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency]) + freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency]) * frequency_quantity ) modified_start_ds = ds_indices.min() if start_ds != modified_start_ds: @@ -90,7 +90,7 @@ class ArimaModel(AbstractArimaModel): """ def __init__(self, pickled_model: bytes, horizon: int, frequency: str, - start_ds: pd.Timestamp, end_ds: pd.Timestamp, + frequency_quantity: int, start_ds: pd.Timestamp, end_ds: pd.Timestamp, time_col: str, exogenous_cols: Optional[List[str]] = None) -> None: """ Initialize the mlflow Python model wrapper for ARIMA. 
@@ -107,6 +107,7 @@ def __init__(self, pickled_model: bytes, horizon: int, frequency: str, self._pickled_model = pickled_model self._horizon = horizon self._frequency = OFFSET_ALIAS_MAP[frequency] + self._frequency_quantity = frequency_quantity self._start_ds = pd.to_datetime(start_ds) self._end_ds = pd.to_datetime(end_ds) self._time_col = time_col @@ -158,6 +159,7 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru end_time=self._end_ds, horizon=horizon or self._horizon, frequency=self._frequency, + frequency_quantity=self._frequency_quantity, include_history=include_history ) @@ -192,7 +194,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) # Check if the time has correct frequency consistency = df["ds"].apply(lambda x: - is_frequency_consistency(self._start_ds, x, self._frequency) + is_frequency_consistency(self._start_ds, x, self._frequency, self._frequency_quantity) ).all() if not consistency: raise MlflowException( @@ -203,7 +205,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) preds_pds = [] # Out-of-sample prediction if needed - horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency) + horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency, self._frequency_quantity) if horizon > 0: X_future = df[df["ds"] > self._end_ds].set_index("ds") future_pd = self._forecast( @@ -229,8 +231,8 @@ def _predict_in_sample( end_ds: pd.Timestamp = None, X: pd.DataFrame = None) -> pd.DataFrame: if start_ds and end_ds: - start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency) - end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency) + start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency, self._frequency_quantity) + end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency, self._frequency_quantity) else: start_ds = self._start_ds end_ds = self._end_ds @@ -242,8 +244,8 @@ def _predict_in_sample( start=start_idx, end=end_idx, return_conf_int=True) - periods = calculate_period_differences(self._start_ds, end_ds, self._frequency) + 1 - ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency=self._frequency)[start_idx:] + periods = calculate_period_differences(self._start_ds, end_ds, self._frequency, self._frequency_quantity) + 1 + ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency=self._frequency, frequency_quantity=self._frequency_quantity)[start_idx:] in_sample_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds_in_sample}) in_sample_pd[["yhat_lower", "yhat_upper"]] = conf_in_sample return in_sample_pd @@ -257,7 +259,7 @@ def _forecast( horizon, X=X, return_conf_int=True) - ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency=self._frequency)[1:] + ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency=self._frequency, frequency_quantity=self._frequency_quantity)[1:] preds_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds}) preds_pd[["yhat_lower", "yhat_upper"]] = conf return preds_pd @@ -268,7 +270,7 @@ class MultiSeriesArimaModel(AbstractArimaModel): ARIMA mlflow model wrapper for multivariate forecasting. 
""" - def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency: str, + def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency: str, frequency_quantity: int, start_ds_dict: Dict[Tuple, pd.Timestamp], end_ds_dict: Dict[Tuple, pd.Timestamp], time_col: str, id_cols: List[str], exogenous_cols: Optional[List[str]] = None) -> None: """ @@ -287,6 +289,7 @@ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequen self._pickled_models = pickled_model_dict self._horizon = horizon self._frequency = frequency + self._frequency_quantity = frequency_quantity self._starts = start_ds_dict self._ends = end_ds_dict self._time_col = time_col @@ -330,6 +333,7 @@ def make_future_dataframe( end_time=self._ends, horizon=horizon, frequency=self._frequency, + frequency_quantity=self._frequency_quantity, include_history=include_history, groups=groups, identity_column_names=self._id_cols @@ -360,7 +364,7 @@ def _predict_timeseries_single_id( horizon: int, include_history: bool = True, df: Optional[pd.DataFrame] = None) -> pd.DataFrame: - arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency, + arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency, self._frequency_quantity, self._starts[id_], self._ends[id_], self._time_col, self._exogenous_cols) preds_df = arima_model_single_id.predict_timeseries(horizon, include_history, df) for id, col_name in zip(id_, self._id_cols): @@ -402,6 +406,7 @@ def _predict_single_id(self, df: pd.DataFrame) -> pd.DataFrame: arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency, + self._frequency_quantity, self._starts[id_], self._ends[id_], self._time_col, diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py index 23b22c9a..b73145c6 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py @@ -36,7 +36,7 @@ class ArimaEstimator: def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_periods: List[int], num_folds: int = 20, max_steps: int = 150, exogenous_cols: Optional[List[str]] = None, - split_cutoff: Optional[pd.Timestamp] = None) -> None: + split_cutoff: Optional[pd.Timestamp] = None, frequency_quantity: int = 1) -> None: """ :param horizon: Number of periods to forecast forward :param frequency_unit: Frequency of the time series @@ -53,6 +53,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri """ self._horizon = horizon self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] + self._frequency_quantity = frequency_quantity self._metric = metric self._seasonal_periods = seasonal_periods self._num_folds = num_folds @@ -70,14 +71,14 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: history_pd["ds"] = pd.to_datetime(history_pd["ds"]) # Check if the time has consistent frequency - self._validate_ds_freq(history_pd, self._frequency_unit) + self._validate_ds_freq(history_pd, self._frequency_unit, self._frequency_quantity) history_periods = utils.calculate_period_differences( - history_pd['ds'].min(), history_pd['ds'].max(), self._frequency_unit + history_pd['ds'].min(), history_pd['ds'].max(), self._frequency_unit, self._frequency_quantity ) if history_periods + 1 != history_pd['ds'].size: # Impute missing time steps - history_pd = 
self._fill_missing_time_steps(history_pd, self._frequency_unit) + history_pd = self._fill_missing_time_steps(history_pd, self._frequency_unit, self._frequency_quantity) # Tune seasonal periods @@ -87,19 +88,20 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: try: # this check mirrors the the default behavior by prophet if history_periods < 2 * m: - _logger.warning(f"Skipping seasonal_period={m} ({self._frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_unit}""") + _logger.warning(f"Skipping seasonal_period={m} ({self._frequency_quantity}{self._frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""") continue # Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the dataframe. # However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each _frequency_unit, # so the minimum valid seasonality period is always 1 - validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency_unit) + validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency_unit, self._frequency_quantity) if self._split_cutoff: cutoffs = utils.generate_custom_cutoffs( history_pd, horizon=validation_horizon, unit=self._frequency_unit, - split_cutoff=self._split_cutoff + split_cutoff=self._split_cutoff, + frequency_quantity=self._frequency_quantity, ) else: cutoffs = utils.generate_cutoffs( @@ -107,6 +109,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: horizon=validation_horizon, unit=self._frequency_unit, num_folds=self._num_folds, + frequency_quantity=self._frequency_quantity, ) result = self._fit_predict(history_pd, cutoffs=cutoffs, seasonal_period=m, max_steps=self._max_steps) @@ -150,9 +153,9 @@ def _fit_predict(self, df: pd.DataFrame, cutoffs: List[pd.Timestamp], seasonal_p return {"metrics": metrics, "model": arima_model} @staticmethod - def _fill_missing_time_steps(df: pd.DataFrame, frequency: str): + def _fill_missing_time_steps(df: pd.DataFrame, frequency: str, frequency_quantity: int): # Forward fill missing time steps - df_filled = df.set_index("ds").resample(rule=OFFSET_ALIAS_MAP[frequency]).pad().reset_index() + df_filled = df.set_index("ds").resample(rule=f"{frequency_quantity}{OFFSET_ALIAS_MAP[frequency]}").pad().reset_index() start_ds, modified_start_ds = df["ds"].min(), df_filled["ds"].min() if start_ds != modified_start_ds: offset = modified_start_ds - start_ds @@ -160,12 +163,12 @@ def _fill_missing_time_steps(df: pd.DataFrame, frequency: str): return df_filled @staticmethod - def _validate_ds_freq(df: pd.DataFrame, frequency: str): + def _validate_ds_freq(df: pd.DataFrame, frequency: str, frequency_qantity: int): start_ds = df["ds"].min() consistency = df["ds"].apply(lambda x: - utils.is_frequency_consistency(start_ds, x, frequency) + utils.is_frequency_consistency(start_ds, x, frequency, frequency_qantity) ).all() if not consistency: raise ValueError( - f"Input time column includes different frequency than the specified frequency {frequency}." + f"Input time column includes different frequency than the specified frequency {frequency_qantity}{frequency}." 
) diff --git a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py index 706700f2..8fbe771b 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py @@ -41,7 +41,9 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, horizon: int, frequency: str, cutoffs: List[pd.Timestamp], interval_width: int, primary_metric: str, country_holidays: Optional[str] = None, - regressors = None, **prophet_kwargs) -> Dict[str, Any]: + regressors = None, + frequency_quantity: int = 1, + **prophet_kwargs) -> Dict[str, Any]: """ Training function for hyperparameter tuning with hyperopt @@ -50,6 +52,7 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, type) and y, the time series :param horizon: Forecast horizon_timedelta :param frequency: Frequency of the time series + :param frequency_quantity: the number of time units that make up a single period of the time series. For now, only 1/5/10/15/30 minutes, 1 hour, 1 day, 1 week, 1 month, 1 quarter, 1 year are supported. :param num_folds: Number of folds for cross validation :param interval_width: Width of the uncertainty intervals provided for the forecast :param primary_metric: Metric that will be optimized across trials @@ -68,7 +71,7 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, model.fit(history_pd, iter=200) offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency]] - horizon_offset = pd.DateOffset(**offset_kwarg)*horizon + horizon_offset = pd.DateOffset(**offset_kwarg)*frequency_quantity*horizon # Evaluate Metrics df_cv = cross_validation( model, horizon=horizon_offset, cutoffs=cutoffs, disable_tqdm=True @@ -92,7 +95,9 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt max_eval: int = 10, trial_timeout: int = None, random_state: int = 0, is_parallel: bool = True, regressors = None, - split_cutoff: Optional[pd.Timestamp] = None, **prophet_kwargs) -> None: + split_cutoff: Optional[pd.Timestamp] = None, + frequency_quantity: int = 1, + **prophet_kwargs) -> None: """ Initialization @@ -119,6 +124,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt """ self._horizon = horizon self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] + self._frequency_quantity = frequency_quantity self._metric = metric self._interval_width = interval_width self._country_holidays = country_holidays @@ -144,13 +150,14 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: seasonality_mode = ["additive", "multiplicative"] - validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency_unit) + validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency_unit, self._frequency_quantity) if self._split_cutoff: cutoffs = utils.generate_custom_cutoffs( df.reset_index(drop=True), horizon=validation_horizon, unit=self._frequency_unit, - split_cutoff=self._split_cutoff + split_cutoff=self._split_cutoff, + frequency_quantity=self._frequency_quantity, ) else: cutoffs = utils.generate_cutoffs( @@ -158,10 +165,13 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: horizon=validation_horizon, unit=self._frequency_unit, num_folds=self._num_folds, + frequency_quantity=self._frequency_quantity, ) train_fn = partial(_prophet_fit_predict, history_pd=df, horizon=validation_horizon, - frequency=self._frequency_unit, cutoffs=cutoffs, + 
frequency=self._frequency_unit, + frequency_quantity=self._frequency_quantity, + cutoffs=cutoffs, interval_width=self._interval_width, primary_metric=self._metric, country_holidays=self._country_holidays, regressors=self._regressors, **self._prophet_kwargs) diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py index f678ce6b..fce41aad 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/model.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py @@ -45,7 +45,7 @@ class ProphetModel(ForecastModel): Prophet mlflow model wrapper for univariate forecasting. """ - def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequency: str, + def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequency: str, frequency_quantity: int, time_col: str) -> None: """ Initialize the mlflow Python model wrapper for mlflow @@ -53,11 +53,13 @@ def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequ the dictionary of json strings of Prophet model for multi-series forecasting :param horizon: Int number of periods to forecast forward. :param frequency: the frequency of the time series + :param frequency_quantity: the frequency quantity of the time series :param time_col: the column name of the time column """ self._model_json = model_json self._horizon = horizon self._frequency = frequency + self._frequency_quantity = frequency_quantity self._time_col = time_col self._is_quaterly = is_quaterly_alias(frequency) super().__init__() @@ -93,6 +95,7 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru requested number of periods. """ offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency]] + offset_kwarg = {key: value * self._frequency_quantity for key, value in offset_kwarg.items()} return self.model().make_future_dataframe(periods=horizon or self._horizon, freq=pd.DateOffset(**offset_kwarg), include_history=include_history) @@ -144,7 +147,7 @@ class MultiSeriesProphetModel(ProphetModel): """ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, pd.Timestamp], - timeseries_end: str, horizon: int, frequency: str, time_col: str, id_cols: List[str], + timeseries_end: str, horizon: int, frequency: str, frequency_quantity: int, time_col: str, id_cols: List[str], ) -> None: """ Initialize the mlflow Python model wrapper for mlflow @@ -156,8 +159,9 @@ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, :param time_col: the column name of the time column :param id_cols: the column names of the identity columns for multi-series time series """ - super().__init__(model_json, horizon, frequency, time_col) + super().__init__(model_json, horizon, frequency, frequency_quantity, time_col) self._frequency = frequency + self._frequency_quantity = frequency_quantity self._timeseries_end = timeseries_end self._timeseries_starts = timeseries_starts self._id_cols = id_cols @@ -201,6 +205,7 @@ def make_future_dataframe( end_time=end_time, horizon=horizon, frequency=self._frequency, + frequency_quantity=self._frequency_quantity, include_history=include_history, groups=groups, identity_column_names=self._id_cols @@ -236,6 +241,7 @@ def predict_timeseries(self, horizon: int = None, include_history: bool = True) end_time=end_time, horizon=horizon, frequency=self._frequency, + frequency_quantity=self._frequency_quantity, include_history=include_history, 
groups=self._model_json.keys(), identity_column_names=self._id_cols diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 3b5c3942..1f81afb8 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -27,6 +27,7 @@ def make_future_dataframe( end_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]], horizon: int, frequency: str, + frequency_quantity: int, include_history: bool = True, groups: List[Tuple] = None, identity_column_names: List[str] = None, @@ -43,7 +44,7 @@ def make_future_dataframe( :return: pd.DataFrame that extends forward """ if groups is None: - return make_single_future_dataframe(start_time, end_time, horizon, frequency) + return make_single_future_dataframe(start_time, end_time, horizon, frequency, frequency_quantity) future_df_list = [] for group in groups: @@ -55,7 +56,7 @@ def make_future_dataframe( group_end_time = end_time[group] else: group_end_time = end_time - df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency, include_history) + df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency, frequency_quantity, include_history) for idx, identity_column_name in enumerate(identity_column_names): df[identity_column_name] = group[idx] future_df_list.append(df) @@ -66,6 +67,7 @@ def make_single_future_dataframe( end_time: pd.Timestamp, horizon: int, frequency: str, + frequency_quantity: int, include_history: bool = True, column_name: str = "ds" ) -> pd.DataFrame: @@ -81,18 +83,18 @@ def make_single_future_dataframe( :return: """ offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency]] - unit_offset = pd.DateOffset(**offset_freq) + timestep_offset = pd.DateOffset(**offset_freq) * frequency_quantity end_time = pd.Timestamp(end_time) if include_history: start_time = start_time else: - start_time = end_time + unit_offset + start_time = end_time + timestep_offset date_rng = pd.date_range( start=start_time, - end=end_time + unit_offset*horizon, - freq=unit_offset + end=end_time + timestep_offset*horizon, + freq=timestep_offset ) return pd.DataFrame(date_rng, columns=[column_name]) @@ -135,7 +137,8 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, num_folds: int, seasonal_period: int = 0, - seasonal_unit: Optional[str] = None) -> List[pd.Timestamp]: + seasonal_unit: Optional[str] = None, + frequency_quantity: int = 1) -> List[pd.Timestamp]: """ Generate cutoff times for cross validation with the control of number of folds. :param df: pd.DataFrame of the historical data. @@ -152,17 +155,17 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, # avoid non-integer months, quaters ands years. 
if unit in NON_DAILY_OFFSET_ALIAS: period = int(period) - period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period + period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*period else: offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[unit])[0]: period} - period_dateoffset = pd.DateOffset(**offset_kwarg) + period_dateoffset = pd.DateOffset(**offset_kwarg)*frequency_quantity - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*horizon if not seasonal_unit: seasonal_unit = unit - seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*seasonal_period + seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*seasonal_period # We can not compare DateOffset directly, so we add to start time and compare. initial = seasonality_dateoffset @@ -192,7 +195,7 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, return list(reversed(result)) def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str, - split_cutoff: pd.Timestamp) -> List[pd.Timestamp]: + split_cutoff: pd.Timestamp, frequency_quantity: int = 1) -> List[pd.Timestamp]: """ Generate custom cutoff times for cross validation based on user-specified split cutoff. Period (step size) is 1. @@ -206,8 +209,8 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str, """ # TODO: [ML-43528] expose period as input. period = 1 - period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon + period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period*frequency_quantity + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon*frequency_quantity # First cutoff is the cutoff bewteen splits cutoff = split_cutoff @@ -229,7 +232,8 @@ def is_quaterly_alias(freq: str): def is_frequency_consistency( start_time: pd.Timestamp, end_time: pd.Timestamp, - freq:str) -> bool: + freq:str, + frequency_quantity: int) -> bool: """ Validate the periods given a start time, end time is consistent with given frequency. We consider consistency as only integer frequencies between start and end time, e.g. @@ -242,17 +246,18 @@ def is_frequency_consistency( :return: A boolean indicate whether the time interval is evenly divisible by the period. """ - periods = calculate_period_differences(start_time, end_time, freq) + periods = calculate_period_differences(start_time, end_time, freq, frequency_quantity) diff = pd.to_datetime(end_time) - pd.DateOffset( **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[freq]] - ) * periods == pd.to_datetime(start_time) + ) * periods * frequency_quantity == pd.to_datetime(start_time) return diff def calculate_period_differences( start_time: pd.Timestamp, end_time: pd.Timestamp, - freq:str) -> int: + freq:str, + frequency_quantity: int) -> int: """ Calculate the periods given a start time, end time and period frequency. :param start_time: A pandas timestamp. 
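# --- illustrative sketch, not part of the diffs -----------------------------
# The hunk below floor-divides the raw period count by frequency_quantity, so
# period differences are counted in whole quantity-sized steps, and
# is_frequency_consistency (above) accepts only timestamps an exact number of
# such steps apart. A hedged standalone sketch of the arithmetic for an
# assumed 15-minute series:
import pandas as pd

start = pd.Timestamp("2021-01-01 00:00")
end = pd.Timestamp("2021-01-01 01:00")
raw = (end.to_period("min") - start.to_period("min")).n  # 60 one-minute periods
steps = raw // 15                                        # 4 fifteen-minute steps
# Consistency means stepping back `steps` whole steps lands exactly on start:
consistent = end - pd.DateOffset(minutes=1) * steps * 15 == start  # True here
# For end = 01:07, raw // 15 is still 4, but the check above fails, flagging
# the timestamp as off-grid for a 15-minute series.
# -----------------------------------------------------------------------------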
@@ -265,4 +270,4 @@ def calculate_period_differences( start_time = pd.to_datetime(start_time) end_time = pd.to_datetime(end_time) freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[freq]] - return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n + return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency_quantity From 6e91a490ca4e5c1f297d09c2500a79492b44d6f3 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 11:17:27 -0800 Subject: [PATCH 02/11] update docstring --- runtime/databricks/automl_runtime/forecast/deepar/utils.py | 2 ++ .../databricks/automl_runtime/forecast/pmdarima/model.py | 3 +++ .../databricks/automl_runtime/forecast/pmdarima/training.py | 1 + runtime/databricks/automl_runtime/forecast/prophet/model.py | 1 + runtime/databricks/automl_runtime/forecast/utils.py | 6 ++++++ 5 files changed, 13 insertions(+) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index a169848e..40ecd9ea 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -27,6 +27,7 @@ def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str, :param df: The input DataFrame containing the time column. :param time_col: The name of the time column. :param frequency: The frequency of the time series. + :param frequency_quantity: The frequency quantity of the time series. :return: A complete time index covering the full range of the dataset. :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency. """ @@ -75,6 +76,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, :param df: the input dataframe that contains time_col :param time_col: time column name :param frequency: the frequency of the time series + :param frequency_quantity: the frequency quantity of the time series :param id_cols: the column names of the identity columns for multi-series time series; None for single series :return: single-series - transformed dataframe; multi-series - dictionary of transformed dataframes, each key is the (concatenated) id of the time series diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py index d9a4627d..b700116d 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py @@ -70,6 +70,7 @@ def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: str, freque :param start_ds: the pd.Timestamp as the start of the DatetimeIndex. :param periods: the length of the DatetimeIndex. :param frequency: the frequency of the DatetimeIndex. + :param frequency_quantity: the frequency quantity of the DatetimeIndex. :return: a DatetimeIndex. """ ds_indices = pd.date_range( @@ -97,6 +98,7 @@ def __init__(self, pickled_model: bytes, horizon: int, frequency: str, :param pickled_model: the pickled ARIMA model as a bytes object. :param horizon: int number of periods to forecast forward. 
        :param frequency: the frequency of the time series
+        :param frequency_quantity: the frequency quantity of the time series
         :param start_ds: the start time of training data
         :param end_ds: the end time of training data
         :param time_col: the column name of the time column
@@ -278,6 +280,7 @@ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequen
         :param pickled_model_dict: the dictionary of binarized ARIMA models for different time series.
         :param horizon: int number of periods to forecast forward.
         :param frequency: the frequency of the time series
+        :param frequency_quantity: the frequency quantity of the time series
         :param start_ds_dict: the dictionary of the starting time of each time series in training data.
         :param end_ds_dict: the dictionary of the end time of each time series in training data.
         :param time_col: the column name of the time column
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
index b73145c6..f02064fb 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -47,6 +47,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri
         :param exogenous_cols: Optional list of column names of exogenous variables. If provided,
             these columns are used as additional features in arima model.
         :param split_cutoff: Optional cutoff specified by user. If provided,
             it is the starting point of cutoffs for cross validation.
             For tuning job, it is the cutoff between train and validate split.
             For training job, it is the cutoff bewteen validate and test split.
+        :param frequency_quantity: The number of frequency units in the frequency. Default is 1.
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py
index fce41aad..173741ee 100644
--- a/runtime/databricks/automl_runtime/forecast/prophet/model.py
+++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py
@@ -156,6 +156,7 @@ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple,
         :param timeseries_end: the end time of the time series
         :param horizon: int number of periods to forecast forward
         :param frequency: the frequency of the time series
+        :param frequency_quantity: the frequency quantity of the time series
         :param time_col: the column name of the time column
         :param id_cols: the column names of the identity columns for multi-series time series
diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index 1f81afb8..7faab0f4 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -38,6 +38,7 @@ def make_future_dataframe(
     :param end_time: the dictionary of the end time of each time series in training data.
     :param horizon: int number of periods to forecast forward.
     :param frequency: the frequency of the time series
+    :param frequency_quantity: the multiplier for the frequency.
     :param include_history:
     :param groups: the collection of group(s) to generate forecast predictions.
     :param identity_column_names: Column names of the identity columns
@@ -77,6 +78,7 @@ def make_single_future_dataframe(
     :param end_time: The end time of time series of the training data.
     :param horizon: Int number of periods to forecast forward.
    :param frequency: The frequency of the time series
+    :param frequency_quantity: The frequency quantity of the time series
     :param include_history: Boolean to include the historical dates in the data
         frame for predictions.
     :param column_name: column name of the time column. Default is "ds".
@@ -148,6 +150,7 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
     :param seasonal_period: length of the seasonality period.
     :param seasonal_unit: Optional frequency unit for the seasonal period. If not specified, the function will
         use the same frequency unit as the time series.
+    :param frequency_quantity: frequency quantity of the time series.
     :return: list of pd.Timestamp cutoffs for cross-validation.
     """
     period = max(0.5 * horizon, 1)  # avoid empty cutoff buckets
@@ -203,6 +206,7 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str,
     :param horizon: int number of time into the future for forecasting.
     :param unit: frequency unit of the time series, which must be a pandas offset alias.
     :param split_cutoff: the user-specified cutoff, as the starting point of cutoffs.
         For tuning job, it is the cutoff between train and validate split.
         For training job, it is the cutoff bewteen validate and test split.
+    :param frequency_quantity: frequency quantity of the time series.
     :return: list of pd.Timestamp cutoffs for cross-validation.
@@ -243,6 +247,7 @@ def is_frequency_consistency(
     :param end_time: A pandas timestamp.
     :param freq: A string that is accepted by OFFSET_ALIAS_MAP, e.g.
         'day', 'month' etc.
+    :param frequency_quantity: the multiplier for the frequency.
     :return: A boolean indicate whether the time interval
         is evenly divisible by the period.
     """
@@ -264,6 +269,7 @@
     :param end_time: A pandas timestamp.
     :param freq: A string that is accepted by OFFSET_ALIAS_MAP, e.g.
         'day', 'month' etc.
+    :param frequency_quantity: An integer that is the multiplier for the frequency.
     :return: A pd.Series indicates the round-down integer period calculated.
""" From b80d08a3b4976bee50682a18632f90316286413b Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 11:55:05 -0800 Subject: [PATCH 03/11] fix tests --- .../forecast/pmdarima/training.py | 6 ++--- .../forecast/deepar/model_test.py | 5 ++++ .../forecast/deepar/utils_test.py | 18 ++++++++----- .../forecast/pmdarima/model_test.py | 27 ++++++++++++++----- .../forecast/pmdarima/training_test.py | 21 ++++++++++----- .../forecast/prophet/model_test.py | 14 ++++++---- .../automl_runtime/forecast/utils_test.py | 16 ++++++----- 7 files changed, 73 insertions(+), 34 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py index f02064fb..37b37efb 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py @@ -164,12 +164,12 @@ def _fill_missing_time_steps(df: pd.DataFrame, frequency: str, frequency_quantit return df_filled @staticmethod - def _validate_ds_freq(df: pd.DataFrame, frequency: str, frequency_qantity: int): + def _validate_ds_freq(df: pd.DataFrame, frequency: str, frequency_quantity: int): start_ds = df["ds"].min() consistency = df["ds"].apply(lambda x: - utils.is_frequency_consistency(start_ds, x, frequency, frequency_qantity) + utils.is_frequency_consistency(start_ds, x, frequency, frequency_quantity) ).all() if not consistency: raise ValueError( - f"Input time column includes different frequency than the specified frequency {frequency_qantity}{frequency}." + f"Input time column includes different frequency than the specified frequency {frequency_quantity}{frequency}." ) diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py index 44894b93..467ec3ba 100644 --- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py +++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py @@ -94,6 +94,7 @@ def test_model_save_and_load_single_series(self): model=self.model, horizon=self.prediction_length, frequency="d", + frequency_quantity=1, num_samples=1, target_col=target_col, time_col=time_col, @@ -138,6 +139,7 @@ def test_model_save_and_load_multi_series(self): horizon=self.prediction_length, num_samples=1, frequency="d", + frequency_quantity=1, target_col=target_col, time_col=time_col, id_cols=[id_col], @@ -185,6 +187,7 @@ def test_model_save_and_load_multi_series_multi_id_cols(self): horizon=self.prediction_length, num_samples=1, frequency="d", + frequency_quantity=1, target_col=target_col, time_col=time_col, id_cols=id_cols, @@ -231,6 +234,7 @@ def test_model_prediction_with_duplicate_timestamps(self): model=self.model, horizon=self.prediction_length, frequency="d", + frequency_quantity=1, num_samples=1, target_col=target_col, time_col=time_col, @@ -274,6 +278,7 @@ def test_model_prediction_with_monthly_data(self): model=self.model, horizon=self.prediction_length, frequency="MS", + frequency_quantity=1, num_samples=1, target_col=target_col, time_col=time_col, diff --git a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py index 98aa75b2..d88c567b 100644 --- a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py @@ -39,7 +39,7 @@ def test_single_series_filled(self): ) dropped_df = base_df.drop([4, 5]).reset_index(drop=True) - transformed_df = 
set_index_and_fill_missing_time_steps(dropped_df, time_col, "D") + transformed_df = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1) expected_df = base_df.copy() expected_df.loc[[4, 5], target_col] = float('nan') @@ -68,7 +68,7 @@ def test_multi_series_filled(self): dropped_df = pd.concat([dropped_base_df.copy(), dropped_base_df.copy()], ignore_index=True) dropped_df[id_col] = [1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2) - transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", id_cols=[id_col]) + transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=[id_col]) self.assertEqual(transformed_df_dict.keys(), {"1", "2"}) expected_first_df = base_df.copy() @@ -100,7 +100,7 @@ def test_multi_series_multi_id_cols_filled(self): dropped_df[id_cols[0]] = ([1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2)) * 2 dropped_df[id_cols[1]] = [1] * (2 * (num_rows_per_ts - 2)) + [2] * (2 * (num_rows_per_ts - 2)) - transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", id_cols=id_cols) + transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=id_cols) self.assertEqual(transformed_df_dict.keys(), {"1-1", "1-2", "2-1", "2-2"}) expected_first_df = base_df.copy() @@ -133,7 +133,8 @@ def test_single_series_week_day_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "W" # Weekly frequency **without** specifying Friday + "W", # Weekly frequency **without** specifying Friday + 1 ) # Create expected dataframe @@ -168,7 +169,8 @@ def test_single_series_month_start_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS" # Monthly frequency + "MS", # Monthly frequency + 1 ) # Create expected dataframe @@ -204,7 +206,8 @@ def test_single_series_month_mid_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS" + "MS", + 1 ) # Create expected dataframe @@ -241,7 +244,8 @@ def test_single_series_month_end_index(self): transformed_df = set_index_and_fill_missing_time_steps( dropped_df, time_col, - "MS" # Monthly frequency + "MS", # Monthly frequency + 1 ) # Create expected dataframe diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py index 3c6d0d40..57a0ab1b 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py @@ -41,7 +41,8 @@ def setUp(self) -> None: self.start_ds = pd.Timestamp("2020-10-01") self.horizon = 1 self.freq = 'W' - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq) + self.frequency_quantity=1 + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -52,6 +53,7 @@ def setUp(self) -> None: self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, frequency=self.freq, + frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date") @@ -67,7 +69,8 @@ def test_predict_timeseries_success(self): expected_ds = AbstractArimaModel._get_ds_indices( self.start_ds, periods=self.num_rows + self.horizon, - frequency=self.freq) + frequency=self.freq, 
+ frequency_quantity=self.frequency_quantity) self.assertTrue(expected_columns.issubset(set(forecast_pd.columns))) self.assertEqual(10, forecast_pd.shape[0]) pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"]) @@ -135,8 +138,9 @@ def setUp(self) -> None: self.start_ds = datetime.date(2020, 10, 1) self.horizon = 1 self.freq = 'W' + self.frequency_quantity = 1 dates = AbstractArimaModel._get_ds_indices( - pd.to_datetime(self.start_ds), periods=self.num_rows, frequency=self.freq) + pd.to_datetime(self.start_ds), periods=self.num_rows, frequency=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -147,6 +151,7 @@ def setUp(self) -> None: self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, frequency=self.freq, + frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date") @@ -168,7 +173,8 @@ def setUp(self) -> None: self.start_ds = pd.Timestamp("2020-10-01") self.horizon = 1 self.freq = 'W' - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq) + self.frequency_quantity = 1 + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y"), @@ -184,6 +190,7 @@ def setUp(self) -> None: self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, frequency=self.freq, + frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), time_col="date", @@ -228,6 +235,7 @@ def setUp(self) -> None: self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, frequency='month', + frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -311,6 +319,7 @@ def test_make_future_dataframe_multi_ids(self): arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, frequency='month', + frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -351,6 +360,7 @@ def setUp(self) -> None: self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, frequency='month', + frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", @@ -404,7 +414,8 @@ def test_get_ds_weekly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2022-01-01 12:30"), periods=8, - frequency='W') + frequency='W', + frequency_quantity=1) pd.testing.assert_index_equal(expected_ds, ds_indices) def test_get_ds_hourly(self): @@ -418,7 +429,8 @@ def test_get_ds_hourly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2021-12-10 09:23"), periods=10, - frequency='H') + frequency='H', + frequency_quantity=1) pd.testing.assert_index_equal(expected_ds, ds_indices) @@ -435,7 +447,7 @@ def setUp(self) -> None: self.pickled_model = pickle.dumps(model) def test_mlflow_arima_log_model(self): - arima_model = ArimaModel(self.pickled_model, horizon=1, frequency='d', + arima_model = ArimaModel(self.pickled_model, horizon=1, frequency='d', frequency_quantity=1, start_ds=pd.to_datetime("2020-10-01"), end_ds=pd.to_datetime("2020-10-09"), time_col="date") with mlflow.start_run() as run: @@ -461,6 +473,7 @@ def test_mlflow_arima_log_model_multiseries(self): multiseries_arima_model = 
MultiSeriesArimaModel(pickled_model_dict, horizon=1, frequency='d', + frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, time_col="date", diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py index 1f1d07aa..01ccfcde 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py @@ -53,6 +53,7 @@ def test_fit_success(self): for freq, df in [['d', self.df], ['d', self.df_string_time], ['month', self.df_monthly]]: arima_estimator = ArimaEstimator(horizon=1, frequency_unit=freq, + frequency_quantity=1, metric="smape", seasonal_periods=[1, 7], num_folds=2) @@ -64,6 +65,7 @@ def test_fit_success(self): def test_fit_success_with_exogenous(self): arima_estimator = ArimaEstimator(horizon=1, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[1, 7], num_folds=2, @@ -79,6 +81,7 @@ def test_fit_success_with_split_cutoff(self): ['month', self.df_monthly, '2020-09-07 00:00:00']]: arima_estimator = ArimaEstimator(horizon=1, frequency_unit=freq, + frequency_quantity=1, metric="smape", seasonal_periods=[1, 7], num_folds=2, @@ -90,18 +93,20 @@ def test_fit_success_with_split_cutoff(self): def test_fit_skip_too_long_seasonality(self): arima_estimator = ArimaEstimator(horizon=1, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[3, 14], num_folds=2) with self.assertLogs(logger="databricks.automl_runtime.forecast.pmdarima.training", level="WARNING") as cm: results_pd = arima_estimator.fit(self.df) - self.assertIn("Skipping seasonal_period=14 (D). Dataframe timestamps must span at least two seasonality periods", cm.output[0]) + self.assertIn("Skipping seasonal_period=14 (1D). 
Dataframe timestamps must span at least two seasonality periods", cm.output[0]) @patch("databricks.automl_runtime.forecast.prophet.forecast.utils.generate_cutoffs") def test_fit_horizon_truncation(self, mock_generate_cutoffs): period = 2 arima_estimator = ArimaEstimator(horizon=100, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[period], num_folds=2) @@ -119,6 +124,7 @@ def test_fit_horizon_truncation_one_cutoff(self, mock_fit_predict): period = 2 arima_estimator = ArimaEstimator(horizon=100, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[period], num_folds=2) @@ -137,6 +143,7 @@ def test_fit_success_with_failed_seasonal_periods(self): # The fit method still succeeds because m=1 succeeds arima_estimator = ArimaEstimator(horizon=1, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[1, 7, 30], num_folds=2) @@ -147,6 +154,7 @@ def test_fit_success_with_failed_seasonal_periods(self): def test_fit_failure_inconsistent_frequency(self): arima_estimator = ArimaEstimator(horizon=1, frequency_unit="W", + frequency_quantity=1, metric="smape", seasonal_periods=[1], num_folds=2) @@ -156,6 +164,7 @@ def test_fit_failure_inconsistent_frequency(self): def test_fit_failure_no_succeeded_model(self): arima_estimator = ArimaEstimator(horizon=1, frequency_unit="d", + frequency_quantity=1, metric="smape", seasonal_periods=[30], num_folds=2) @@ -182,7 +191,7 @@ def test_fill_missing_time_steps(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency, frequency_quantity=1) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) @@ -196,16 +205,16 @@ def test_fill_missing_time_steps_with_exogenous(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12), "x": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency, frequency_quantity=1) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertTrue(df_filled["x"][index] == df_filled["x"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) def test_validate_ds_freq_matched_frequency(self): - ArimaEstimator._validate_ds_freq(self.df, frequency='D') - ArimaEstimator._validate_ds_freq(self.df_monthly, frequency='month') + ArimaEstimator._validate_ds_freq(self.df, frequency='D', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df_monthly, frequency='month', frequency_quantity=1) def test_validate_ds_freq_unmatched_frequency(self): with pytest.raises(ValueError, match="includes different frequency"): - ArimaEstimator._validate_ds_freq(self.df, frequency='W') + ArimaEstimator._validate_ds_freq(self.df, frequency='W', frequency_quantity=1) diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py index 4ce65dae..30ab37b9 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py @@ -80,7 +80,7 @@ def 
setUpClass(cls) -> None: cls.model = model_from_json(cls.model_json) def test_model_save_and_load(self): - prophet_model = ProphetModel(self.model_json, 1, "d", "ds") + prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") with mlflow.start_run() as run: mlflow_prophet_log_model(prophet_model) @@ -110,7 +110,7 @@ def test_make_future_dataframe(self): # don't have full support yet. if OFFSET_ALIAS_MAP[feq_unit] in ['YS', 'MS', 'QS']: continue - prophet_model = ProphetModel(self.model_json, 1, feq_unit, "ds") + prophet_model = ProphetModel(self.model_json, 1, feq_unit, 1, "ds") future_df = prophet_model.make_future_dataframe(1) offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[feq_unit]] expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg) @@ -119,7 +119,7 @@ def test_make_future_dataframe(self): f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}") def test_predict_success_datetime_date(self): - prophet_model = ProphetModel(self.model_json, 1, "d", "ds") + prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") test_df = pd.DataFrame( {"ds": [datetime.date(2020, 10, 8), datetime.date(2020, 12, 10)]} ) @@ -131,7 +131,7 @@ def test_predict_success_datetime_date(self): ) # check the input dataframe is unchanged def test_predict_success_string(self): - prophet_model = ProphetModel(self.model_json, 1, "d", "ds") + prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]}) expected_test_df = test_df.copy() yhat = prophet_model.predict(None, test_df) @@ -141,7 +141,7 @@ def test_predict_success_string(self): ) # check the input dataframe is unchanged def test_validate_predict_cols(self): - prophet_model = ProphetModel(self.model_json, 1, "d", "time") + prophet_model = ProphetModel(self.model_json, 1, "d", 1, "time") test_df = pd.DataFrame( { "date": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")], @@ -174,6 +174,7 @@ def setUpClass(cls) -> None: timeseries_end="2020-07-25", horizon=1, frequency="days", + frequency_quantity=1, time_col="time", id_cols=["id"], ) @@ -241,6 +242,7 @@ def test_model_save_and_load_multi_ids(self): "2020-07-25", 1, "days", + 1, "time", ["id1", "id2"], ) @@ -303,6 +305,7 @@ def test_validate_predict_cols(self): timeseries_end="2020-07-25", horizon=1, frequency="days", + frequency_quantity=1, time_col="ds", id_cols=["id1"], ) @@ -350,6 +353,7 @@ def test_make_future_dataframe_multi_ids(self): "2020-07-25", 1, "days", + 1, "time", ["id1", "id2"], ) diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index 3d9c5195..edbbc2d7 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -300,7 +300,7 @@ def test_calculate_period_differences_evenly(self): ) }) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'month' + x.start_time, x.end_time, 'month', 1 ), axis=1) self.assertTrue((periods == pd.Series([4, 5, 12])).all()) @@ -314,11 +314,11 @@ def test_calculate_period_differences_unevenly(self): ) }) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'month' + x.start_time, x.end_time, 'month', 1 ), axis=1) self.assertTrue((periods == pd.Series([4, 5, 0])).all()) periods = df.apply(lambda x: calculate_period_differences( - x.start_time, x.end_time, 'day' + x.start_time, x.end_time, 'day', 1 ), axis=1) self.assertTrue((periods == 
pd.Series([118, 151, 0])).all()) @@ -331,12 +331,12 @@ def test_frequency_consistency(self): ) start_scalar = pd.to_datetime('2021-01-14') end_scalar = pd.to_datetime('2021-05-16') - self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, 'month')) + self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, 'month', 1)) self.assertTrue(start_time.apply( - lambda x: is_frequency_consistency(x, end_scalar, 'day') + lambda x: is_frequency_consistency(x, end_scalar, 'day', 1) ).all()) self.assertTrue(end_time.apply( - lambda x: is_frequency_consistency(start_scalar, x, 'month') + lambda x: is_frequency_consistency(start_scalar, x, 'month', 1) ).all()) @@ -347,6 +347,7 @@ def test_make_single_future_dataframe(self): end_time=pd.to_datetime('2022-01-04'), horizon=1, frequency="d", + frequency_quantity=1, include_history=False, column_name="test_date" ) @@ -359,6 +360,7 @@ def test_make_single_future_dataframe(self): end_time=pd.to_datetime('2022-01-04'), horizon=1, frequency="d", + frequency_quantity=1, include_history=True, column_name="test_date" ) @@ -375,6 +377,7 @@ def test_make_single_future_dataframe_with_different_freq(self): end_time=end_time, horizon=1, frequency=freq, + frequency_quantity=1, include_history=True, column_name="test_date" ) @@ -401,6 +404,7 @@ def test_make_future_dataframe(self, start_time, end_time, end_time=end_time, horizon=1, frequency="d", + frequency_quantity=1, groups=groups, identity_column_names=identity_column_names, ) From 4956f30bc54b63c416a43178f16fce1bf851880d Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 15:25:13 -0800 Subject: [PATCH 04/11] pr comment --- .../automl_runtime/forecast/deepar/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 40ecd9ea..74549628 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -18,7 +18,10 @@ import pandas as pd -def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str, frequency_quantity: int): +def validate_and_generate_index(df: pd.DataFrame, + time_col: str, + frequency: str, + frequency_quantity: int): """ Generate a complete time index for the given DataFrame based on the specified frequency. - Ensures the time column is in datetime format. 
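[Illustration only, not part of the patch: for non-"MS" frequencies the complete index is built from a pandas frequency string composed as f"{frequency_quantity}{frequency}", so a 5-minute grid behaves as sketched below; the timestamps are made up.]

    import pandas as pd

    # "5min" = frequency_quantity 5 x frequency "min": one step every 5 minutes
    idx = pd.date_range("2020-10-01 00:00", "2020-10-01 00:25", freq="5min")
    assert len(idx) == 6  # 00:00, 00:05, ..., 00:25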
@@ -89,7 +92,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
         weekday_name = total_min.strftime("%a").upper()  # e.g., "FRI"
         frequency = f"W-{weekday_name}"
 
-    new_index_full = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency, frequency_quantity=frequency_quantity)
+    valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency, frequency_quantity=frequency_quantity)
 
     if id_cols is not None:
         df_dict = {}
@@ -99,14 +102,14 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
             else:
                 ts_id = str(grouped_id)
             df_dict[ts_id] = (grouped_df.set_index(time_col).sort_index()
-                              .reindex(new_index_full).drop(id_cols, axis=1))
+                              .reindex(valid_index).drop(id_cols, axis=1))
 
         return df_dict
 
     df = df.set_index(time_col).sort_index()
 
     # Fill in missing time steps between the min and max time steps
-    df = df.reindex(new_index_full)
+    df = df.reindex(valid_index)
 
     if frequency.upper() == "MS":
         # Truncate the day of month to avoid issues with pandas frequency check

From d9e51445879ab09d165d212d01eeedc202027979 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Tue, 11 Feb 2025 17:30:53 -0800
Subject: [PATCH 05/11] add tests

---
 .../forecast/deepar/model_test.py   |  43 ++++++-
 .../forecast/deepar/utils_test.py   |  33 ++++++
 .../forecast/pmdarima/model_test.py | 109 ++++++++++++++++++
 3 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
index 467ec3ba..0ebf1971 100644
--- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
@@ -20,6 +20,7 @@
 import pandas as pd
 import torch
 import torch.nn as nn
+from parameterized import parameterized
 from gluonts.dataset.field_names import FieldName
 from gluonts.transform import InstanceSplitter, TestSplitSampler
 from gluonts.torch.model.predictor import PyTorchPredictor
@@ -310,4 +311,44 @@ def test_model_prediction_with_monthly_data(self):
         # Verify the prediction output format
         self.assertEqual(pred_df.columns.tolist(), [time_col, "yhat"])
         self.assertEqual(len(pred_df), self.prediction_length)
-        self.assertGreater(pred_df[time_col].min(), sample_input[time_col].max())
\ No newline at end of file
+        self.assertGreater(pred_df[time_col].min(), sample_input[time_col].max())
+
+    @parameterized.expand([(1,), (5,), (10,), (15,), (30,)])
+    def test_model_prediction_with_multiple_minutes_frequency(self, frequency_quantity):
+        target_col = "sales"
+        time_col = "date"
+
+        deepar_model = DeepARModel(
+            model=self.model,
+            horizon=self.prediction_length,
+            frequency="min",
+            frequency_quantity=frequency_quantity,
+            num_samples=1,
+            target_col=target_col,
+            time_col=time_col,
+        )
+
+        # Create sample input at the given minute-level frequency
+        dates = pd.date_range(start="2020-10-01", periods=6, freq=f"{frequency_quantity}min")
+
+        sales = [10, 20, 30,
+                 60, 90, 100]
+
+        sample_input = pd.DataFrame({
+            time_col: dates,
+            target_col: sales
+        })
+
+        with mlflow.start_run() as run:
+            mlflow_deepar_log_model(deepar_model, sample_input)
+
+        run_id = run.info.run_id
+
+        # Load the model and predict
+        loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
+        pred_df = loaded_model.predict(sample_input)
+
+        # Verify the prediction output format
+        self.assertEqual(pred_df.columns.tolist(), [time_col, "yhat"])
+        self.assertEqual(len(pred_df), self.prediction_length)
+        self.assertGreater(pred_df[time_col].min(), sample_input[time_col].max())
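[For orientation, an illustrative sketch that is not part of the patch; the frame, column names, and values are invented. The gap-filling these tests pin down reduces to reindexing onto the full frequency grid.]

    import pandas as pd

    # A 5-minute series with the 10:10 step missing
    dates = pd.date_range("2020-10-01 10:00", periods=4, freq="5min").delete(2)
    df = pd.DataFrame({"date": dates, "sales": [1.0, 2.0, 4.0]})

    # Reindexing on the complete 5-minute grid restores the gap as NaN,
    # mirroring what set_index_and_fill_missing_time_steps does for
    # frequency="min", frequency_quantity=5.
    full_index = pd.date_range(df["date"].min(), df["date"].max(), freq="5min")
    filled = df.set_index("date").reindex(full_index)
    assert filled["sales"].isna().sum() == 1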
diff --git a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
index d88c567b..3aebf1ea 100644
--- a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
@@ -16,6 +16,7 @@
 import unittest
 
 import pandas as pd
+from parameterized import parameterized
 
 from databricks.automl_runtime.forecast.deepar.utils import set_index_and_fill_missing_time_steps
 
@@ -256,3 +257,35 @@ def test_single_series_month_end_index(self):
 
         # Assert equality
         pd.testing.assert_frame_equal(transformed_df, expected_df)
+
+    @parameterized.expand([(1,), (5,), (10,), (15,), (30,)])
+    def test_single_series_with_multiple_minute_index(self, frequency_quantity):
+        target_col = "sales"
+        time_col = "date"
+        num_rows = 6
+
+        base_dates = pd.date_range(start="2020-10-01", periods=num_rows, freq=f"{frequency_quantity}min")
+
+        base_df = pd.DataFrame({
+            time_col: base_dates,
+            target_col: range(num_rows)
+        })
+
+        # Create a dataframe with missing time steps (drop rows 3 and 4)
+        dropped_df = base_df.drop([3, 4]).reset_index(drop=True)
+
+        # Transform the dataframe
+        transformed_df = set_index_and_fill_missing_time_steps(
+            dropped_df,
+            time_col,
+            "min",
+            frequency_quantity
+        )
+
+        # Create expected dataframe
+        expected_df = base_df.copy()
+        expected_df.loc[[3, 4], target_col] = float('nan')
+        expected_df = expected_df.set_index(time_col).rename_axis(None).asfreq(f"{frequency_quantity}min")
+
+        # Assert equality
+        pd.testing.assert_frame_equal(transformed_df, expected_df)
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
index 57a0ab1b..b6583752 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
@@ -510,3 +510,112 @@ def _check_requirements(self, run_id: str):
         # check if all additional dependencies are logged
         for dependency in ARIMA_ADDITIONAL_PIP_DEPS:
             self.assertIn(dependency, requirements, f"requirements.txt should contain {dependency} but got {requirements}")
+
+class TestArimaModelFrequencyQuantity(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.num_rows = 9
+        self.start_ds = pd.Timestamp("2020-10-01")
+        self.horizon = 1
+        self.freq = 'min'
+        frequency_quantities = [1, 5, 10, 15, 30]
+        self.quantity_model_pairs = []
+
+        for frequency_quantity in frequency_quantities:
+            dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq, frequency_quantity=frequency_quantity)
+            df = pd.concat([
+                pd.Series(dates, name='date'),
+                pd.Series(range(self.num_rows), name="y")
+            ], axis=1)
+            model = ARIMA(order=(2, 0, 2), suppress_warnings=True)
+            model.fit(df.set_index("date"))
+            pickled_model = pickle.dumps(model)
+            self.quantity_model_pairs.append((frequency_quantity, ArimaModel(pickled_model,
+                                                                             horizon=self.horizon,
+                                                                             frequency=self.freq,
+                                                                             frequency_quantity=frequency_quantity,
+                                                                             start_ds=self.start_ds,
+                                                                             end_ds=dates.max(),
+                                                                             time_col="date")))
+
+    def test_make_future_dataframe(self):
+        for frequency_quantity, arima_model in self.quantity_model_pairs:
+            future_df = arima_model.make_future_dataframe(include_history=False)
+            self.assertCountEqual(future_df.columns, {"ds"})
+            self.assertEqual(1, future_df.shape[0])
+
+    def test_predict_timeseries_success(self):
+        for frequency_quantity, arima_model in self.quantity_model_pairs:
+            forecast_pd = arima_model.predict_timeseries()
+            expected_columns = {"yhat", "yhat_lower", "yhat_upper"}
+            expected_ds = AbstractArimaModel._get_ds_indices(
+                self.start_ds,
+                periods=self.num_rows + self.horizon,
+                frequency=self.freq,
+                frequency_quantity=frequency_quantity)
+            self.assertTrue(expected_columns.issubset(set(forecast_pd.columns)))
+            self.assertEqual(10, forecast_pd.shape[0])
+            pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"])
+            # Test forecast without history data
+            forecast_future_pd = arima_model.predict_timeseries(include_history=False)
+            self.assertEqual(len(forecast_future_pd), self.horizon)
+
+    def test_predict_success(self):
+        for frequency_quantity, arima_model in self.quantity_model_pairs:
+            test_df = pd.DataFrame({
+                "date": [pd.to_datetime("2020-10-01") + self.num_rows*pd.DateOffset(minutes=frequency_quantity),
+                         pd.to_datetime("2020-10-01") + (self.num_rows+1)*pd.DateOffset(minutes=frequency_quantity)]
+            })
+            expected_test_df = test_df.copy()
+            yhat = arima_model.predict(context=None, model_input=test_df)
+            self.assertEqual(2, len(yhat))
+            pd.testing.assert_frame_equal(test_df, expected_test_df)  # check the input dataframe is unchanged
+
+    def test_predict_success_datetime_date(self):
+        for _, arima_model in self.quantity_model_pairs:
+            test_df = pd.DataFrame({
+                "date": [datetime.datetime(2020, 10, 1, 6, 0, 0), datetime.datetime(2020, 10, 1, 6, 30, 0)]
+            })
+            expected_test_df = test_df.copy()
+            yhat = arima_model.predict(context=None, model_input=test_df)
+            self.assertEqual(2, len(yhat))
+            pd.testing.assert_frame_equal(test_df, expected_test_df)  # check the input dataframe is unchanged
+
+    def test_predict_success_string(self):
+        for _, arima_model in self.quantity_model_pairs:
+            test_df = pd.DataFrame({
+                "date": ["2020-10-01 06:00:00", "2020-10-01 06:30:00"]
+            })
+            expected_test_df = test_df.copy()
+            yhat = arima_model.predict(context=None, model_input=test_df)
+            self.assertEqual(2, len(yhat))
+            pd.testing.assert_frame_equal(test_df, expected_test_df)  # check the input dataframe is unchanged
+
+    def test_predict_failure_unmatched_frequency(self):
+        for frequency_quantity, arima_model in self.quantity_model_pairs:
+            if frequency_quantity == 1:
+                continue
+            test_df = pd.DataFrame({
+                "date": [pd.to_datetime("2020-10-01 00:00:00"), pd.to_datetime("2020-10-01 00:01:00"), pd.to_datetime("2020-10-01 00:04:00")]
+            })
+            with pytest.raises(MlflowException, match="includes different frequency") as e:
+                arima_model.predict(context=None, model_input=test_df)
+            assert e.value.error_code == ErrorCode.Name(INVALID_PARAMETER_VALUE)
+
+    def test_predict_failure_invalid_time_range(self):
+        for _, arima_model in self.quantity_model_pairs:
+            test_df = pd.DataFrame({
+                "date": [pd.to_datetime("2020-09-30 00:00:00"), pd.to_datetime("2020-10-01 00:01:00")]
+            })
+            with pytest.raises(MlflowException, match="includes time earlier than the history data that the model was "
+                                                      "trained on") as e:
+                arima_model.predict(context=None, model_input=test_df)
+            assert e.value.error_code == ErrorCode.Name(INVALID_PARAMETER_VALUE)
+
+    def test_predict_failure_invalid_time_col_name(self):
+        for _, arima_model in self.quantity_model_pairs:
+            test_df = pd.DataFrame({
+                "invalid_time_col_name": [pd.to_datetime("2020-10-08"), pd.to_datetime("2020-12-10")]
+            })
+            with pytest.raises(MlflowException, match="Input data columns") as e:
+                arima_model.predict(context=None, model_input=test_df)
+            assert e.value.error_code == ErrorCode.Name(INVALID_PARAMETER_VALUE)
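[The "includes different frequency" failures above come down to an on-grid check. Conceptually, a simplified standalone sketch of the consistency rule, not the library code; the start time and values are invented.]

    import pandas as pd

    start = pd.Timestamp("2020-10-01 00:00:00")

    def on_grid(ts: pd.Timestamp, minutes: int) -> bool:
        # A timestamp is consistent iff it sits a whole number of
        # frequency_quantity-sized steps after the series start.
        return (ts - start) % pd.Timedelta(minutes=minutes) == pd.Timedelta(0)

    assert on_grid(pd.Timestamp("2020-10-01 00:05:00"), minutes=5)
    assert not on_grid(pd.Timestamp("2020-10-01 00:04:00"), minutes=5)  # rejected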
From c2fa88ab13ba6d7ee4bdb204b905853e3c5a0d6e Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Tue, 11 Feb 2025 18:08:19 -0800
Subject: [PATCH 06/11] add tests

---
 .../forecast/pmdarima/training_test.py | 62 ++++++++++++++++---
 1 file changed, 54 insertions(+), 8 deletions(-)

diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
index 01ccfcde..43869184 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
@@ -48,14 +48,37 @@ def setUp(self) -> None:
             pd.Series(np.random.rand(self.num_rows), name="x1"),
             pd.Series(np.random.rand(self.num_rows), name="x2")
         ], axis=1)
+        self.df_with_5_minute_interval = pd.concat([
+            pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="5min"), name="ds"),
+            pd.Series(np.random.rand(self.num_rows), name="y"),
+        ], axis=1)
+        self.df_with_10_minute_interval = pd.concat([
+            pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="10min"), name="ds"),
+            pd.Series(np.random.rand(self.num_rows), name="y"),
+        ], axis=1)
+        self.df_with_15_minute_interval = pd.concat([
+            pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="15min"), name="ds"),
+            pd.Series(np.random.rand(self.num_rows), name="y"),
+        ], axis=1)
+        self.df_with_30_minute_interval = pd.concat([
+            pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="30min"), name="ds"),
+            pd.Series(np.random.rand(self.num_rows), name="y"),
+        ], axis=1)
 
     def test_fit_success(self):
-        for freq, df in [['d', self.df], ['d', self.df_string_time], ['month', self.df_monthly]]:
+        for freq, frequency_quantity, df, seasonal_periods in [
+            ['d', 1, self.df, [1, 7]],
+            ['d', 1, self.df_string_time, [1, 7]],
+            ['month', 1, self.df_monthly, [1, 12]],
+            ['min', 5, self.df_with_5_minute_interval, [1]],
+            ['min', 10, self.df_with_10_minute_interval, [1]],
+            ['min', 15, self.df_with_15_minute_interval, [1]],
+            ['min', 30, self.df_with_30_minute_interval, [1]]]:
             arima_estimator = ArimaEstimator(horizon=1,
                                              frequency_unit=freq,
-                                             frequency_quantity=1,
+                                             frequency_quantity=frequency_quantity,
                                              metric="smape",
-                                             seasonal_periods=[1, 7],
+                                             seasonal_periods=seasonal_periods,
                                              num_folds=2)
 
             results_pd = arima_estimator.fit(df)
@@ -76,14 +99,18 @@ def test_fit_success_with_exogenous(self):
             self.assertIn("pickled_model", results_pd)
 
     def test_fit_success_with_split_cutoff(self):
-        for freq, df, split_cutoff in [['d', self.df, '2020-07-17 00:00:00'],
-                                       ['d', self.df_string_time, '2020-07-17 00:00:00'],
-                                       ['month', self.df_monthly, '2020-09-07 00:00:00']]:
+        for freq, frequency_quantity, df, split_cutoff in [['d', 1, self.df, '2020-07-17 00:00:00'],
+                                                           ['d', 1, self.df_string_time, '2020-07-17 00:00:00'],
+                                                           ['month', 1, self.df_monthly, '2020-09-07 00:00:00'],
+                                                           ['min', 5, self.df_with_5_minute_interval, '2020-07-05 00:30:00'],
+                                                           ['min', 10, self.df_with_10_minute_interval, '2020-07-05 01:00:00'],
+                                                           ['min', 15, self.df_with_15_minute_interval, '2020-07-05 01:30:00'],
+                                                           ['min', 30, self.df_with_30_minute_interval, '2020-07-05 03:00:00']]:
             arima_estimator = ArimaEstimator(horizon=1,
                                              frequency_unit=freq,
-                                             frequency_quantity=1,
+                                             frequency_quantity=frequency_quantity,
                                              metric="smape",
-                                             seasonal_periods=[1, 7],
+                                             seasonal_periods=[1],
                                              num_folds=2,
                                              split_cutoff=pd.Timestamp(split_cutoff))
             results_pd = arima_estimator.fit(df)
@@ -211,10 +238,29 @@ def
test_fill_missing_time_steps_with_exogenous(self): self.assertTrue(df_filled["x"][index] == df_filled["x"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) + def test_fill_missing_time_steps_with_multiple_frequency_quantities(self): + supported_quantities = [1, 5, 10, 15, 30] + start_ds = pd.Timestamp("2020-07-05 00:00:00") + for quantity in supported_quantities: + ds = pd.date_range(start=start_ds, periods=12, freq=pd.DateOffset(**{'minutes': quantity})) + indices_to_drop = [5, 8] + df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency='min', frequency_quantity=quantity) + for index in indices_to_drop: + self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) + self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) + def test_validate_ds_freq_matched_frequency(self): ArimaEstimator._validate_ds_freq(self.df, frequency='D', frequency_quantity=1) ArimaEstimator._validate_ds_freq(self.df_monthly, frequency='month', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency='min', frequency_quantity=5) + ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency='min', frequency_quantity=10) + ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency='min', frequency_quantity=15) + ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency='min', frequency_quantity=30) def test_validate_ds_freq_unmatched_frequency(self): with pytest.raises(ValueError, match="includes different frequency"): ArimaEstimator._validate_ds_freq(self.df, frequency='W', frequency_quantity=1) + + with pytest.raises(ValueError, match="includes different frequency"): + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency='min', frequency_quantity=10) From a8b1c974b5d22836ff1cc7d8580a402935104a9c Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 21:00:06 -0800 Subject: [PATCH 07/11] add tests --- .../forecast/prophet/forecast_test.py | 49 ++++++++ .../forecast/prophet/model_test.py | 38 ++++++ .../automl_runtime/forecast/utils_test.py | 110 ++++++++++++++++-- 3 files changed, 189 insertions(+), 8 deletions(-) diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py index d3b9478d..b1afbe4e 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py @@ -58,6 +58,22 @@ def setUp(self) -> None: pd.Series(range(self.num_rows), name="ds").apply(lambda i: f"{2012+i:04d}-01-15"), y_series ], axis=1) + self.df_with_5_minute_interval = pd.concat([ + pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="5min"), name="ds"), + y_series, + ], axis=1) + self.df_with_10_minute_interval = pd.concat([ + pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="10min"), name="ds"), + y_series, + ], axis=1) + self.df_with_15_minute_interval = pd.concat([ + pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="15min"), name="ds"), + y_series, + ], axis=1) + self.df_with_30_minute_interval = pd.concat([ + pd.Series(pd.date_range(start="2020-07-05 00:00:00", periods=self.num_rows, freq="30min"), name="ds"), + y_series, + ], axis=1) self.search_space = {"changepoint_prior_scale": hp.loguniform("changepoint_prior_scale", 
-2.3, -0.7)} def test_sequential_training(self): @@ -116,6 +132,39 @@ def test_monthly_sequential_training(self): model_json = json.loads(results["model_json"][0]) self.assertGreaterEqual(model_json["changepoint_prior_scale"], 0.1) self.assertLessEqual(model_json["changepoint_prior_scale"], 0.5) + + def test_sequential_training_with_multiple_frequency_quantities(self): + search_space = {"changepoint_prior_scale": hp.loguniform("changepoint_prior_scale", -2.3, -0.7)} + search_space["seasonality_mode"] = hp.choice( + 'seasonality_mode', ['additive', 'multiplicative'] + ) + for df, frequency_quantity, frequency_unit in [[self.df_with_5_minute_interval, 5, "min"], + [self.df_with_10_minute_interval, 10, "min"], + [self.df_with_15_minute_interval, 15, "min"], + [self.df_with_30_minute_interval, 30, "min"]]: + hyperopt_estim = ProphetHyperoptEstimator(horizon=1, + frequency_unit=frequency_unit, + frequency_quantity=frequency_quantity, + metric="smape", + interval_width=0.8, + country_holidays="US", + search_space=search_space, + num_folds=2, + trial_timeout=1000, + random_state=0, + is_parallel=False) + results = hyperopt_estim.fit(df) + self.assertAlmostEqual(results["mse"][0], 0, delta=0.0002) + self.assertAlmostEqual(results["rmse"][0], 0, delta=0.02) + self.assertAlmostEqual(results["mae"][0], 0, delta=0.02) + self.assertAlmostEqual(results["mape"][0], 0, delta=0.002) + self.assertAlmostEqual(results["mdape"][0], 0, delta=0.002) + self.assertAlmostEqual(results["smape"][0], 0, delta=0.002) + self.assertGreaterEqual(results["coverage"][0], 0.5) + # check the best result parameter is inside the search space + model_json = json.loads(results["model_json"][0]) + self.assertGreaterEqual(model_json["changepoint_prior_scale"], 0.1) + self.assertLessEqual(model_json["changepoint_prior_scale"], 0.5) def test_training_with_extra_regressors(self): df = pd.concat([ diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py index 30ab37b9..7f15aaa0 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py @@ -118,6 +118,16 @@ def test_make_future_dataframe(self): f"Wrong future dataframe generated with frequency {feq_unit}:" f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}") + def test_make_future_dataframe_with_multiple_frequency_quantities(self): + for frequency_quantity in [1, 5, 10, 15, 30]: + prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds") + future_df = prophet_model.make_future_dataframe(1) + offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP["min"]] + expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg)*frequency_quantity + self.assertEqual(future_df.iloc[-1]["ds"], expected_time, + f"Wrong future dataframe generated with frequency min:" + f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}") + def test_predict_success_datetime_date(self): prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds") test_df = pd.DataFrame( @@ -140,6 +150,17 @@ def test_predict_success_string(self): test_df, expected_test_df ) # check the input dataframe is unchanged + def test_predict_multiple_frequency_quantities(self): + for frequency_quantity in [1, 5, 10, 15, 30]: + prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds") + test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]}) + expected_test_df = test_df.copy() + yhat = 
prophet_model.predict(None, test_df) + self.assertEqual(2, len(yhat)) + pd.testing.assert_frame_equal( + test_df, expected_test_df + ) # check the input dataframe is unchanged + def test_validate_predict_cols(self): prophet_model = ProphetModel(self.model_json, 1, "d", 1, "time") test_df = pd.DataFrame( @@ -341,6 +362,23 @@ def test_make_future_dataframe(self): self.assertCountEqual(future_df.columns, {"ds", "id"}) self.assertEqual(2, future_df.shape[0]) + def test_make_future_dataframe_multiple_frequency_quantities(self): + + for frequency_quantity in [1, 5, 10, 15, 30]: + prophet_model = MultiSeriesProphetModel( + model_json=self.multi_series_model_json, + timeseries_starts=self.multi_series_start, + timeseries_end="2020-07-25", + horizon=1, + frequency="min", + frequency_quantity=frequency_quantity, + time_col="time", + id_cols=["id"], + ) + future_df = prophet_model.make_future_dataframe(include_history=False) + self.assertCountEqual(future_df.columns, {"ds", "id"}) + self.assertEqual(2, future_df.shape[0]) + def test_make_future_dataframe_multi_ids(self): multi_series_model_json = {(1, "1"): self.model_json, (2, "1"): self.model_json} multi_series_start = { diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index edbbc2d7..147e6c96 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -211,6 +211,30 @@ def test_generate_cutoffs_success_annualy(self): cutoffs = generate_cutoffs(df, horizon=1, unit="YS", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2018-07-14 00:00:00'), pd.Timestamp('2019-07-14 00:00:00'), pd.Timestamp('2020-07-14 00:00:00')], cutoffs) + def test_generate_cutoffs_success_with_multiple_frequency_quantities(self): + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + self.assertEqual([pd.Timestamp('2020-07-01 23:44:00'), pd.Timestamp('2020-07-01 23:49:00'), pd.Timestamp('2020-07-01 23:54:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + self.assertEqual([pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:39:00'), pd.Timestamp('2020-07-01 23:49:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + self.assertEqual([pd.Timestamp('2020-07-01 23:14:00'), pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:44:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + self.assertEqual([pd.Timestamp('2020-07-01 22:29:00'), pd.Timestamp('2020-07-01 22:59:00'), pd.Timestamp('2020-07-01 23:29:00')], cutoffs) class TestTestGenerateCustomCutoffs(unittest.TestCase): @@ -267,6 +291,31 @@ def test_generate_custom_cutoffs_success_annualy(self): cutoffs = generate_custom_cutoffs(df, horizon=7, 
unit="YS", split_cutoff=pd.Timestamp('2012-07-14 00:00:00')) self.assertEqual([pd.Timestamp('2012-07-14 00:00:00'), pd.Timestamp('2013-07-14 00:00:00'), pd.Timestamp('2014-07-14 00:00:00')], cutoffs) + def test_generate_custom_cutoffs_success_with_multiple_frequency_quantities(self): + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=5, split_cutoff=pd.Timestamp('2020-07-01 23:45:00')) + self.assertEqual([pd.Timestamp('2020-07-01 23:45:00'), pd.Timestamp('2020-07-01 23:50:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=10, split_cutoff=pd.Timestamp('2020-07-01 23:30:00')) + self.assertEqual([pd.Timestamp('2020-07-01 23:30:00'), pd.Timestamp('2020-07-01 23:40:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=15, split_cutoff=pd.Timestamp('2020-07-01 23:15:00')) + self.assertEqual([pd.Timestamp('2020-07-01 23:15:00'), pd.Timestamp('2020-07-01 23:30:00')], cutoffs) + + df = pd.DataFrame( + pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] + ).rename_axis("y").reset_index() + cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=30, split_cutoff=pd.Timestamp('2020-07-01 23:00:00')) + self.assertEqual([pd.Timestamp('2020-07-01 23:00:00')], cutoffs) + def test_generate_custom_cutoffs_success_with_small_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"] @@ -321,6 +370,17 @@ def test_calculate_period_differences_unevenly(self): x.start_time, x.end_time, 'day', 1 ), axis=1) self.assertTrue((periods == pd.Series([118, 151, 0])).all()) + + def test_calculate_period_differences_with_frequency_quantity(self): + for frequency_quantity in [1, 5, 10, 15, 30]: + df = pd.DataFrame({ + 'start_time': pd.date_range(start="2020-07-01 00:00:00", periods=10, freq=f'{frequency_quantity}T'), + 'end_time': pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T') + }) + periods = df.apply(lambda x: calculate_period_differences( + x.start_time, x.end_time, 'min', frequency_quantity + ), axis=1) + self.assertTrue((periods == pd.Series([240//frequency_quantity]*10)).all()) def test_frequency_consistency(self): start_time = pd.Series( @@ -339,6 +399,17 @@ def test_frequency_consistency(self): lambda x: is_frequency_consistency(start_scalar, x, 'month', 1) ).all()) + def test_frequency_consistency_with_frequency_quantity(self): + for frequency_quantity in [1, 5, 10, 15, 30]: + start_time = pd.date_range(start="2020-07-01 00:00:00", periods=10, freq=f'{frequency_quantity}T') + end_time = pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T') + self.assertTrue(start_time.to_series().apply( + lambda x: is_frequency_consistency(x, end_time[0], 'min', frequency_quantity) + ).all()) + self.assertTrue(end_time.to_series().apply( + lambda x: is_frequency_consistency(start_time[0], x, 'min', frequency_quantity) + ).all()) + class 
TestMakeFutureDataFrame(unittest.TestCase): def test_make_single_future_dataframe(self): @@ -385,26 +456,49 @@ def test_make_single_future_dataframe_with_different_freq(self): expected_columns = { "test_date" } self.assertTrue(expected_columns.issubset(set(future_df.columns))) + def test_make_single_future_dataframe_with_different_frequency_quantities(self): + for frequency_quantity in [1, 5, 10, 15, 30]: + start_time = pd.to_datetime('2022-01-01') + end_time = start_time + pd.DateOffset(**{'minutes': 1}) * frequency_quantity + future_df = make_single_future_dataframe( + start_time=start_time, + end_time=end_time, + horizon=1, + frequency="min", + frequency_quantity=frequency_quantity, + include_history=True, + column_name="test_date" + ) + self.assertEqual(len(future_df), 3) + expected_columns = { "test_date" } + self.assertTrue(expected_columns.issubset(set(future_df.columns))) + @parameterized.expand([ - (pd.to_datetime('2022-01-01'), pd.to_datetime('2022-01-05'), None, None, { "ds" }, ), - (pd.to_datetime('2022-01-01'), pd.to_datetime('2022-01-05'), ["id"], [("1", )], { "ds", "id" }), + (pd.to_datetime('2022-01-01'), pd.to_datetime('2022-01-05'), None, None, { "ds" }, "d", 1), + (pd.to_datetime('2022-01-01'), pd.to_datetime('2022-01-05'), ["id"], [("1", )], { "ds", "id" }, "d", 1), (pd.to_datetime('2022-01-01'), pd.to_datetime('2022-01-05'), ["id1", "id2"], - [("1", 1)], { "ds", "id1", "id2" }), + [("1", 1)], { "ds", "id1", "id2" }, "d", 1), ({("1", ): pd.to_datetime('2022-01-01'), ("2", ): pd.to_datetime('2022-01-01')}, - pd.to_datetime('2022-01-02'), ["id"], [("1", ), ("2", )], { "ds", "id" }), + pd.to_datetime('2022-01-02'), ["id"], [("1", ), ("2", )], { "ds", "id" }, "d", 1), ({("1", ): pd.to_datetime('2022-01-01'), ("2", ): pd.to_datetime('2022-01-01')}, {("1", ): pd.to_datetime('2022-01-02'), ("2", ): pd.to_datetime('2022-01-02')}, - ["id"], [("1", ), ("2", )], { "ds", "id" }), + ["id"], [("1", ), ("2", )], { "ds", "id" }, "d", 1), + (pd.Timestamp('2022-01-01 00:00:00'), pd.Timestamp('2022-01-01 00:20:00'), None, None, { "ds" }, "min", 5), + (pd.Timestamp('2022-01-01 00:00:00'), pd.Timestamp('2022-01-01 00:40:00'), None, None, { "ds" }, "min", 10), + (pd.Timestamp('2022-01-01 00:00:00'), pd.Timestamp('2022-01-01 01:00:00'), None, None, { "ds" }, "min", 15), + (pd.Timestamp('2022-01-01 00:00:00'), pd.Timestamp('2022-01-01 02:00:00'), None, None, { "ds" }, "min", 30), ]) def test_make_future_dataframe(self, start_time, end_time, identity_column_names, - groups, expected_columns): + groups, expected_columns, + frequency_unit, + frequency_quantity): future_df = make_future_dataframe( start_time=start_time, end_time=end_time, horizon=1, - frequency="d", - frequency_quantity=1, + frequency=frequency_unit, + frequency_quantity=frequency_quantity, groups=groups, identity_column_names=identity_column_names, ) From 03cd681780f12d3d915a05b260c1c2ccf07826dd Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 21:27:09 -0800 Subject: [PATCH 08/11] refactor naming --- .../automl_runtime/forecast/deepar/model.py | 8 +-- .../automl_runtime/forecast/deepar/utils.py | 22 +++---- .../automl_runtime/forecast/pmdarima/model.py | 40 +++++------ .../forecast/pmdarima/training.py | 14 ++-- .../forecast/prophet/forecast.py | 12 ++-- .../automl_runtime/forecast/prophet/model.py | 22 +++---- .../automl_runtime/forecast/utils.py | 56 ++++++++-------- .../forecast/deepar/model_test.py | 12 ++-- .../forecast/pmdarima/diagnostics_test.py | 2 +- .../forecast/pmdarima/model_test.py | 34 +++++----- 
.../forecast/pmdarima/training_test.py | 22 +++---- .../forecast/prophet/diagnostics_test.py | 2 +- .../forecast/prophet/model_test.py | 6 +- .../automl_runtime/forecast/utils_test.py | 66 +++++++++---------- 14 files changed, 159 insertions(+), 159 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py index 17a99152..137c37a5 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/model.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py @@ -42,7 +42,7 @@ class DeepARModel(ForecastModel): DeepAR mlflow model wrapper for forecasting. """ - def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, frequency_quantity: int, + def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, frequency_quantity: int, num_samples: int, target_col: str, time_col: str, id_cols: Optional[List[str]] = None) -> None: @@ -50,7 +50,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, freque Initialize the DeepAR mlflow Python model wrapper :param model: DeepAR model :param horizon: the number of periods to forecast forward - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param num_samples: the number of samples to draw from the distribution :param target_col: the target column name @@ -61,7 +61,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency: str, freque super().__init__() self._model = model self._horizon = horizon - self._frequency = frequency + self._frequency_unit = frequency_unit self._frequency_quantity = frequency_quantity self._num_samples = num_samples self._target_col = target_col @@ -130,7 +130,7 @@ def predict_samples(self, model_input_transformed = set_index_and_fill_missing_time_steps(model_input, self._time_col, - self._frequency, + self._frequency_unit, self._frequency_quantity, self._id_cols) diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py index 74549628..016de93b 100644 --- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py +++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py @@ -20,7 +20,7 @@ def validate_and_generate_index(df: pd.DataFrame, time_col: str, - frequency: str, + frequency_unit: str, frequency_quantity: int): """ Generate a complete time index for the given DataFrame based on the specified frequency. @@ -29,13 +29,13 @@ def validate_and_generate_index(df: pd.DataFrame, - Generates a new time index from the minimum to the maximum timestamp in the data. :param df: The input DataFrame containing the time column. :param time_col: The name of the time column. - :param frequency: The frequency of the time series. + :param frequency_unit: The frequency unit of the time series. :param frequency_quantity: The frequency quantity of the time series. :return: A complete time index covering the full range of the dataset. :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency. 
""" - if frequency.upper() != "MS": - return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency_quantity}{frequency}") + if frequency_unit.upper() != "MS": + return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency_quantity}{frequency_unit}") df[time_col] = pd.to_datetime(df[time_col]) # Ensure datetime format @@ -67,7 +67,7 @@ def validate_and_generate_index(df: pd.DataFrame, return new_index_full def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, - frequency: str, + frequency_unit: str, frequency_quantity: int, id_cols: Optional[List[str]] = None): """ @@ -78,7 +78,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, :param df: the input dataframe that contains time_col :param time_col: time column name - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param id_cols: the column names of the identity columns for multi-series time series; None for single series :return: single-series - transformed dataframe; @@ -86,13 +86,13 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, """ total_min, total_max = df[time_col].min(), df[time_col].max() - # We need to adjust the frequency for pd.date_range if it is weekly, + # We need to adjust the frequency_unit for pd.date_range if it is weekly, # otherwise it would always be "W-SUN" - if frequency.upper() == "W": + if frequency_unit.upper() == "W": weekday_name = total_min.strftime("%a").upper() # e.g., "FRI" - frequency = f"W-{weekday_name}" + frequency_unit = f"W-{weekday_name}" - valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency, frequency_quantity=frequency_quantity) + valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency_unit=frequency_unit, frequency_quantity=frequency_quantity) if id_cols is not None: df_dict = {} @@ -111,7 +111,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str, # Fill in missing time steps between the min and max time steps df = df.reindex(valid_index) - if frequency.upper() == "MS": + if frequency_unit.upper() == "MS": # Truncate the day of month to avoid issues with pandas frequency check df = df.to_period("M") diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py index b700116d..cc4116c9 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py @@ -64,19 +64,19 @@ def model_env(self): return ARIMA_CONDA_ENV @staticmethod - def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: str, frequency_quantity: int) -> pd.DatetimeIndex: + def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency_unit: str, frequency_quantity: int) -> pd.DatetimeIndex: """ Create a DatetimeIndex with specified starting time and frequency, whose length is the given periods. :param start_ds: the pd.Timestamp as the start of the DatetimeIndex. :param periods: the length of the DatetimeIndex. - :param frequency: the frequency of the DatetimeIndex. + :param frequency_unit: the frequency unit of the DatetimeIndex. :param frequency_quantity: the frequency quantity of the DatetimeIndex. :return: a DatetimeIndex. 
""" ds_indices = pd.date_range( start=start_ds, periods=periods, - freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency]) * frequency_quantity + freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity ) modified_start_ds = ds_indices.min() if start_ds != modified_start_ds: @@ -90,14 +90,14 @@ class ArimaModel(AbstractArimaModel): ARIMA mlflow model wrapper for univariate forecasting. """ - def __init__(self, pickled_model: bytes, horizon: int, frequency: str, + def __init__(self, pickled_model: bytes, horizon: int, frequency_unit: str, frequency_quantity: int, start_ds: pd.Timestamp, end_ds: pd.Timestamp, time_col: str, exogenous_cols: Optional[List[str]] = None) -> None: """ Initialize the mlflow Python model wrapper for ARIMA. :param pickled_model: the pickled ARIMA model as a bytes object. :param horizon: int number of periods to forecast forward. - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param start_ds: the start time of training data :param end_ds: the end time of training data @@ -108,7 +108,7 @@ def __init__(self, pickled_model: bytes, horizon: int, frequency: str, super().__init__() self._pickled_model = pickled_model self._horizon = horizon - self._frequency = OFFSET_ALIAS_MAP[frequency] + self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit] self._frequency_quantity = frequency_quantity self._start_ds = pd.to_datetime(start_ds) self._end_ds = pd.to_datetime(end_ds) @@ -160,7 +160,7 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru start_time=self._start_ds, end_time=self._end_ds, horizon=horizon or self._horizon, - frequency=self._frequency, + frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity, include_history=include_history ) @@ -196,7 +196,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) # Check if the time has correct frequency consistency = df["ds"].apply(lambda x: - is_frequency_consistency(self._start_ds, x, self._frequency, self._frequency_quantity) + is_frequency_consistency(self._start_ds, x, self._frequency_unit, self._frequency_quantity) ).all() if not consistency: raise MlflowException( @@ -207,7 +207,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame: ) preds_pds = [] # Out-of-sample prediction if needed - horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency, self._frequency_quantity) + horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency_unit, self._frequency_quantity) if horizon > 0: X_future = df[df["ds"] > self._end_ds].set_index("ds") future_pd = self._forecast( @@ -233,8 +233,8 @@ def _predict_in_sample( end_ds: pd.Timestamp = None, X: pd.DataFrame = None) -> pd.DataFrame: if start_ds and end_ds: - start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency, self._frequency_quantity) - end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency, self._frequency_quantity) + start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency_unit, self._frequency_quantity) + end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity) else: start_ds = self._start_ds end_ds = self._end_ds @@ -246,8 +246,8 @@ def _predict_in_sample( start=start_idx, end=end_idx, return_conf_int=True) - periods = 
calculate_period_differences(self._start_ds, end_ds, self._frequency, self._frequency_quantity) + 1 - ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency=self._frequency, frequency_quantity=self._frequency_quantity)[start_idx:] + periods = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity) + 1 + ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[start_idx:] in_sample_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds_in_sample}) in_sample_pd[["yhat_lower", "yhat_upper"]] = conf_in_sample return in_sample_pd @@ -261,7 +261,7 @@ def _forecast( horizon, X=X, return_conf_int=True) - ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency=self._frequency, frequency_quantity=self._frequency_quantity)[1:] + ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[1:] preds_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds}) preds_pd[["yhat_lower", "yhat_upper"]] = conf return preds_pd @@ -272,14 +272,14 @@ class MultiSeriesArimaModel(AbstractArimaModel): ARIMA mlflow model wrapper for multivariate forecasting. """ - def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency: str, frequency_quantity: int, + def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency_unit: str, frequency_quantity: int, start_ds_dict: Dict[Tuple, pd.Timestamp], end_ds_dict: Dict[Tuple, pd.Timestamp], time_col: str, id_cols: List[str], exogenous_cols: Optional[List[str]] = None) -> None: """ Initialize the mlflow Python model wrapper for multiseries ARIMA. :param pickled_model_dict: the dictionary of binarized ARIMA models for different time series. :param horizon: int number of periods to forecast forward. - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param start_ds_dict: the dictionary of the starting time of each time series in training data. :param end_ds_dict: the dictionary of the end time of each time series in training data. 
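[As a back-of-the-envelope check on the period arithmetic threaded through these wrappers (plain pandas, invented timestamps, not the library code): with frequency_unit='min' and frequency_quantity=15, the number of periods between two timestamps is the elapsed time floor-divided by one 15-minute step.]

    import pandas as pd

    start = pd.Timestamp("2020-07-01 00:00")
    end = pd.Timestamp("2020-07-01 04:00")
    step = pd.Timedelta(minutes=15)  # one frequency_quantity-sized period
    assert (end - start) // step == 16  # 4 hours / 15 minutes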
@@ -291,7 +291,7 @@ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequen super().__init__() self._pickled_models = pickled_model_dict self._horizon = horizon - self._frequency = frequency + self._frequency_unit = frequency_unit self._frequency_quantity = frequency_quantity self._starts = start_ds_dict self._ends = end_ds_dict @@ -335,7 +335,7 @@ def make_future_dataframe( start_time=self._starts, end_time=self._ends, horizon=horizon, - frequency=self._frequency, + frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity, include_history=include_history, groups=groups, @@ -367,7 +367,7 @@ def _predict_timeseries_single_id( horizon: int, include_history: bool = True, df: Optional[pd.DataFrame] = None) -> pd.DataFrame: - arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency, self._frequency_quantity, + arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency_unit, self._frequency_quantity, self._starts[id_], self._ends[id_], self._time_col, self._exogenous_cols) preds_df = arima_model_single_id.predict_timeseries(horizon, include_history, df) for id, col_name in zip(id_, self._id_cols): @@ -408,7 +408,7 @@ def _predict_single_id(self, df: pd.DataFrame) -> pd.DataFrame: id_ = df["ts_id"].to_list()[0] arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, - self._frequency, + self._frequency_unit, self._frequency_quantity, self._starts[id_], self._ends[id_], diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py index 37b37efb..7cb1b359 100644 --- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py +++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py @@ -100,7 +100,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: cutoffs = utils.generate_custom_cutoffs( history_pd, horizon=validation_horizon, - unit=self._frequency_unit, + frequency_unit=self._frequency_unit, split_cutoff=self._split_cutoff, frequency_quantity=self._frequency_quantity, ) @@ -108,7 +108,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: cutoffs = utils.generate_cutoffs( history_pd, horizon=validation_horizon, - unit=self._frequency_unit, + frequency_unit=self._frequency_unit, num_folds=self._num_folds, frequency_quantity=self._frequency_quantity, ) @@ -154,9 +154,9 @@ def _fit_predict(self, df: pd.DataFrame, cutoffs: List[pd.Timestamp], seasonal_p return {"metrics": metrics, "model": arima_model} @staticmethod - def _fill_missing_time_steps(df: pd.DataFrame, frequency: str, frequency_quantity: int): + def _fill_missing_time_steps(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int): # Forward fill missing time steps - df_filled = df.set_index("ds").resample(rule=f"{frequency_quantity}{OFFSET_ALIAS_MAP[frequency]}").pad().reset_index() + df_filled = df.set_index("ds").resample(rule=f"{frequency_quantity}{OFFSET_ALIAS_MAP[frequency_unit]}").pad().reset_index() start_ds, modified_start_ds = df["ds"].min(), df_filled["ds"].min() if start_ds != modified_start_ds: offset = modified_start_ds - start_ds @@ -164,12 +164,12 @@ def _fill_missing_time_steps(df: pd.DataFrame, frequency: str, frequency_quantit return df_filled @staticmethod - def _validate_ds_freq(df: pd.DataFrame, frequency: str, frequency_quantity: int): + def _validate_ds_freq(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int): start_ds = df["ds"].min() consistency = 
df["ds"].apply(lambda x: - utils.is_frequency_consistency(start_ds, x, frequency, frequency_quantity) + utils.is_frequency_consistency(start_ds, x, frequency_unit, frequency_quantity) ).all() if not consistency: raise ValueError( - f"Input time column includes different frequency than the specified frequency {frequency_quantity}{frequency}." + f"Input time column includes different frequency than the specified frequency {frequency_quantity}{frequency_unit}." ) diff --git a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py index 8fbe771b..28c4a5eb 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py @@ -38,7 +38,7 @@ class ProphetHyperParams(Enum): def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, - horizon: int, frequency: str, cutoffs: List[pd.Timestamp], + horizon: int, frequency_unit: str, cutoffs: List[pd.Timestamp], interval_width: int, primary_metric: str, country_holidays: Optional[str] = None, regressors = None, @@ -51,7 +51,7 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, :param history_pd: pd.DataFrame containing the history. Must have columns ds (date type) and y, the time series :param horizon: Forecast horizon_timedelta - :param frequency: Frequency of the time series + :param frequency_unit: Frequency unit of the time series :param frequency_quantity: the number of time units that make up a single period of the time series. For now, only 1/5/10/15/30 minutes, 1 hour, 1 day, 1 week, 1 month, 1 quarter, 1 year are supported. :param num_folds: Number of folds for cross validation :param interval_width: Width of the uncertainty intervals provided for the forecast @@ -70,7 +70,7 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame, model.add_regressor(regressor) model.fit(history_pd, iter=200) - offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency]] + offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] horizon_offset = pd.DateOffset(**offset_kwarg)*frequency_quantity*horizon # Evaluate Metrics df_cv = cross_validation( @@ -155,7 +155,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: cutoffs = utils.generate_custom_cutoffs( df.reset_index(drop=True), horizon=validation_horizon, - unit=self._frequency_unit, + frequency_unit=self._frequency_unit, split_cutoff=self._split_cutoff, frequency_quantity=self._frequency_quantity, ) @@ -163,13 +163,13 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame: cutoffs = utils.generate_cutoffs( df.reset_index(drop=True), horizon=validation_horizon, - unit=self._frequency_unit, + frequency_unit=self._frequency_unit, num_folds=self._num_folds, frequency_quantity=self._frequency_quantity, ) train_fn = partial(_prophet_fit_predict, history_pd=df, horizon=validation_horizon, - frequency=self._frequency_unit, + frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity, cutoffs=cutoffs, interval_width=self._interval_width, diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py index 173741ee..72463aee 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/model.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py @@ -45,23 +45,23 @@ class ProphetModel(ForecastModel): Prophet mlflow model wrapper for univariate forecasting. 
""" - def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequency: str, frequency_quantity: int, + def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequency_unit: str, frequency_quantity: int, time_col: str) -> None: """ Initialize the mlflow Python model wrapper for mlflow :param model_json: json string of the Prophet model or the dictionary of json strings of Prophet model for multi-series forecasting :param horizon: Int number of periods to forecast forward. - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param time_col: the column name of the time column """ self._model_json = model_json self._horizon = horizon - self._frequency = frequency + self._frequency_unit = frequency_unit self._frequency_quantity = frequency_quantity self._time_col = time_col - self._is_quaterly = is_quaterly_alias(frequency) + self._is_quaterly = is_quaterly_alias(frequency_unit) super().__init__() def load_context(self, context: mlflow.pyfunc.model.PythonModelContext) -> None: @@ -94,7 +94,7 @@ def make_future_dataframe(self, horizon: int = None, include_history: bool = Tru :return: pd.Dataframe that extends forward from the end of self.history for the requested number of periods. """ - offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency]] + offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency_unit]] offset_kwarg = {key: value * self._frequency_quantity for key, value in offset_kwarg.items()} return self.model().make_future_dataframe(periods=horizon or self._horizon, freq=pd.DateOffset(**offset_kwarg), @@ -147,7 +147,7 @@ class MultiSeriesProphetModel(ProphetModel): """ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, pd.Timestamp], - timeseries_end: str, horizon: int, frequency: str, frequency_quantity: int, time_col: str, id_cols: List[str], + timeseries_end: str, horizon: int, frequency_unit: str, frequency_quantity: int, time_col: str, id_cols: List[str], ) -> None: """ Initialize the mlflow Python model wrapper for mlflow @@ -155,13 +155,13 @@ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, :param timeseries_starts: the dictionary of pd.Timestamp as the starting time of each time series :param timeseries_end: the end time of the time series :param horizon: int number of periods to forecast forward - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the frequency quantity of the time series :param time_col: the column name of the time column :param id_cols: the column names of the identity columns for multi-series time series """ - super().__init__(model_json, horizon, frequency, frequency_quantity, time_col) - self._frequency = frequency + super().__init__(model_json, horizon, frequency_unit, frequency_quantity, time_col) + self._frequency_unit = frequency_unit self._frequency_quantity = frequency_quantity self._timeseries_end = timeseries_end self._timeseries_starts = timeseries_starts @@ -205,7 +205,7 @@ def make_future_dataframe( start_time=self._timeseries_starts, end_time=end_time, horizon=horizon, - frequency=self._frequency, + frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity, include_history=include_history, groups=groups, @@ -241,7 +241,7 @@ def predict_timeseries(self, horizon: int = 
None, include_history: bool = True) start_time=self._timeseries_starts, end_time=end_time, horizon=horizon, - frequency=self._frequency, + frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity, include_history=include_history, groups=self._model_json.keys(), diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 7faab0f4..e8165cec 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -26,7 +26,7 @@ def make_future_dataframe( start_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]], end_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]], horizon: int, - frequency: str, + frequency_unit: str, frequency_quantity: int, include_history: bool = True, groups: List[Tuple] = None, @@ -37,7 +37,7 @@ def make_future_dataframe( :param start_time: the dictionary of the starting time of each time series in training data. :param end_time: the dictionary of the end time of each time series in training data. :param horizon: int number of periods to forecast forward. - :param frequency: the frequency of the time series + :param frequency_unit: the frequency unit of the time series :param frequency_quantity: the multiplier for the frequency. :param include_history: :param groups: the collection of group(s) to generate forecast predictions. @@ -45,7 +45,7 @@ def make_future_dataframe( :return: pd.DataFrame that extends forward """ if groups is None: - return make_single_future_dataframe(start_time, end_time, horizon, frequency, frequency_quantity) + return make_single_future_dataframe(start_time, end_time, horizon, frequency_unit, frequency_quantity) future_df_list = [] for group in groups: @@ -57,7 +57,7 @@ def make_future_dataframe( group_end_time = end_time[group] else: group_end_time = end_time - df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency, frequency_quantity, include_history) + df = make_single_future_dataframe(group_start_time, group_end_time, horizon, frequency_unit, frequency_quantity, include_history) for idx, identity_column_name in enumerate(identity_column_names): df[identity_column_name] = group[idx] future_df_list.append(df) @@ -67,7 +67,7 @@ def make_single_future_dataframe( start_time: pd.Timestamp, end_time: pd.Timestamp, horizon: int, - frequency: str, + frequency_unit: str, frequency_quantity: int, include_history: bool = True, column_name: str = "ds" @@ -77,14 +77,14 @@ def make_single_future_dataframe( :param start_time: The starting time of time series of the training data. :param end_time: The end time of time series of the training data. :param horizon: Int number of periods to forecast forward. - :param frequency: The frequency of the time series + :param frequency_unit: The frequency unit of the time series :param frequency_quantity: The frequency quantity of the time series :param include_history: Boolean to include the historical dates in the data frame for predictions. :param column_name: column name of the time column. Default is "ds". 
:return: """ - offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency]] + offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] timestep_offset = pd.DateOffset(**offset_freq) * frequency_quantity end_time = pd.Timestamp(end_time) @@ -100,7 +100,7 @@ def make_single_future_dataframe( ) return pd.DataFrame(date_rng, columns=[column_name]) -def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_quantity: int = 1) -> int: +def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, frequency_quantity: int = 1) -> int: """ Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta Since the seasonality period is never more than half of the dataframe's timedelta, @@ -108,7 +108,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ behavior, and we enforce it for ARIMA.) :param df: pd.DataFrame of the historical data :param horizon: int number of time into the future for forecasting - :param unit: frequency unit of the time series, which must be a pandas offset alias + :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias :param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s per time step in the dataframe. This is useful when the time series has a granularity that spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data @@ -116,7 +116,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ :return: horizon used for validation, in terms of the input `unit` """ MIN_HORIZONS = 4 # minimum number of horizons in the dataframe - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * horizon * frequency_quantity + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * horizon * frequency_quantity try: if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max(): @@ -127,7 +127,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ # In order to calculate the validation horizon, we incrementally add offset # to the start time to the quarter of total timedelta. We did this since # pd.DateOffset does not support divide by operation. - timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit]) * frequency_quantity + timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity max_horizon = 0 cur_timestamp = df["ds"].min() while cur_timestamp + timestep_dateoffset <= df["ds"].max(): @@ -137,7 +137,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, unit: str, frequency_ f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.") return max_horizon // MIN_HORIZONS -def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, +def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str, num_folds: int, seasonal_period: int = 0, seasonal_unit: Optional[str] = None, frequency_quantity: int = 1) -> List[pd.Timestamp]: @@ -145,7 +145,7 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, Generate cutoff times for cross validation with the control of number of folds. :param df: pd.DataFrame of the historical data. :param horizon: int number of time into the future for forecasting. - :param unit: frequency unit of the time series, which must be a pandas offset alias. 
+ :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias. :param num_folds: int number of cutoffs for cross validation. :param seasonal_period: length of the seasonality period. :param seasonal_unit: Optional frequency unit for the seasonal period. If not specified, the function will use @@ -156,19 +156,19 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, period = max(0.5 * horizon, 1) # avoid empty cutoff buckets # avoid non-integer months, quaters ands years. - if unit in NON_DAILY_OFFSET_ALIAS: + if frequency_unit in NON_DAILY_OFFSET_ALIAS: period = int(period) - period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*period + period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*period else: - offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[unit])[0]: period} + offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency_unit])[0]: period} period_dateoffset = pd.DateOffset(**offset_kwarg)*frequency_quantity - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*horizon + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*horizon if not seasonal_unit: - seasonal_unit = unit + seasonal_unit = frequency_unit - seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*frequency_quantity*seasonal_period + seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*seasonal_period # We can not compare DateOffset directly, so we add to start time and compare. initial = seasonality_dateoffset @@ -197,14 +197,14 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, unit: str, ) return list(reversed(result)) -def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str, +def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str, split_cutoff: pd.Timestamp, frequency_quantity: int = 1) -> List[pd.Timestamp]: """ Generate custom cutoff times for cross validation based on user-specified split cutoff. Period (step size) is 1. :param df: pd.DataFrame of the historical data. :param horizon: int number of time into the future for forecasting. - :param unit: frequency unit of the time series, which must be a pandas offset alias. + :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias. :param split_cutoff: the user-specified cutoff, as the starting point of cutoffs. :param frequency_quantity: frequency quantity of the time series. For tuning job, it is the cutoff between train and validate split. @@ -213,8 +213,8 @@ def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, unit: str, """ # TODO: [ML-43528] expose period as input. 
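    # With period fixed at 1 below, consecutive cutoffs are exactly one
    # timestep (frequency_quantity * frequency_unit) apart.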
period = 1 - period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*period*frequency_quantity - horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[unit])*horizon*frequency_quantity + period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*period*frequency_quantity + horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*horizon*frequency_quantity # First cutoff is the cutoff bewteen splits cutoff = split_cutoff @@ -236,7 +236,7 @@ def is_quaterly_alias(freq: str): def is_frequency_consistency( start_time: pd.Timestamp, end_time: pd.Timestamp, - freq:str, + frequency_unit:str, frequency_quantity: int) -> bool: """ Validate the periods given a start time, end time is consistent with given frequency. @@ -251,9 +251,9 @@ def is_frequency_consistency( :return: A boolean indicate whether the time interval is evenly divisible by the period. """ - periods = calculate_period_differences(start_time, end_time, freq, frequency_quantity) + periods = calculate_period_differences(start_time, end_time, frequency_unit, frequency_quantity) diff = pd.to_datetime(end_time) - pd.DateOffset( - **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[freq]] + **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] ) * periods * frequency_quantity == pd.to_datetime(start_time) return diff @@ -261,7 +261,7 @@ def is_frequency_consistency( def calculate_period_differences( start_time: pd.Timestamp, end_time: pd.Timestamp, - freq:str, + frequency_unit:str, frequency_quantity: int) -> int: """ Calculate the periods given a start time, end time and period frequency. @@ -275,5 +275,5 @@ def calculate_period_differences( """ start_time = pd.to_datetime(start_time) end_time = pd.to_datetime(end_time) - freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[freq]] + freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency_unit]] return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency_quantity diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py index 0ebf1971..bee823d6 100644 --- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py +++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py @@ -94,7 +94,7 @@ def test_model_save_and_load_single_series(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency="d", + frequency_unit="d", frequency_quantity=1, num_samples=1, target_col=target_col, @@ -139,7 +139,7 @@ def test_model_save_and_load_multi_series(self): model=self.model, horizon=self.prediction_length, num_samples=1, - frequency="d", + frequency_unit="d", frequency_quantity=1, target_col=target_col, time_col=time_col, @@ -187,7 +187,7 @@ def test_model_save_and_load_multi_series_multi_id_cols(self): model=self.model, horizon=self.prediction_length, num_samples=1, - frequency="d", + frequency_unit="d", frequency_quantity=1, target_col=target_col, time_col=time_col, @@ -234,7 +234,7 @@ def test_model_prediction_with_duplicate_timestamps(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency="d", + frequency_unit="d", frequency_quantity=1, num_samples=1, target_col=target_col, @@ -278,7 +278,7 @@ def test_model_prediction_with_monthly_data(self): deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency="MS", + frequency_unit="MS", frequency_quantity=1, num_samples=1, target_col=target_col, @@ -321,7 +321,7 @@ def 
test_model_prediction_with_multiple_minutes_frequency(self, frequency_quanti deepar_model = DeepARModel( model=self.model, horizon=self.prediction_length, - frequency="min", + frequency_unit="min", frequency_quantity=frequency_quantity, num_samples=1, target_col=target_col, diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py index f15f5480..bd4380cc 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py @@ -43,7 +43,7 @@ class TestDiagnostics(unittest.TestCase): (df_with_exogenous, ["x1", "x2"]) ]) def test_cross_validation_success(self, df, exogenous_cols): - cutoffs = generate_cutoffs(df, horizon=3, unit="D", seasonal_period=1, seasonal_unit="D", num_folds=3) + cutoffs = generate_cutoffs(df, horizon=3, frequency_unit="D", seasonal_period=1, seasonal_unit="D", num_folds=3) train_df = df[df["ds"] <= cutoffs[0]].set_index("ds") y_train = train_df[["y"]] X_train = train_df.drop(["y"], axis=1) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py index b6583752..5918d29c 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py @@ -42,7 +42,7 @@ def setUp(self) -> None: self.horizon = 1 self.freq = 'W' self.frequency_quantity=1 - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq, frequency_quantity=self.frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -52,7 +52,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), @@ -69,7 +69,7 @@ def test_predict_timeseries_success(self): expected_ds = AbstractArimaModel._get_ds_indices( self.start_ds, periods=self.num_rows + self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) self.assertTrue(expected_columns.issubset(set(forecast_pd.columns))) self.assertEqual(10, forecast_pd.shape[0]) @@ -140,7 +140,7 @@ def setUp(self) -> None: self.freq = 'W' self.frequency_quantity = 1 dates = AbstractArimaModel._get_ds_indices( - pd.to_datetime(self.start_ds), periods=self.num_rows, frequency=self.freq, frequency_quantity=self.frequency_quantity) + pd.to_datetime(self.start_ds), periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -150,7 +150,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), @@ -174,7 +174,7 @@ def setUp(self) -> None: self.horizon = 1 self.freq = 'W' self.frequency_quantity = 1 - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, 
frequency=self.freq, frequency_quantity=self.frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity) self.df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y"), @@ -189,7 +189,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) self.arima_model = ArimaModel(pickled_model, horizon=self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=self.frequency_quantity, start_ds=self.start_ds, end_ds=pd.Timestamp("2020-11-26"), @@ -234,7 +234,7 @@ def setUp(self) -> None: end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")} self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency='month', + frequency_unit='month', frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, @@ -318,7 +318,7 @@ def test_make_future_dataframe_multi_ids(self): end_ds_dict = {(1, "1"): pd.Timestamp("2020-09-13"), (2, "1"): pd.Timestamp("2020-09-13")} arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency='month', + frequency_unit='month', frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, @@ -359,7 +359,7 @@ def setUp(self) -> None: end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")} self.arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency='month', + frequency_unit='month', frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, @@ -414,7 +414,7 @@ def test_get_ds_weekly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2022-01-01 12:30"), periods=8, - frequency='W', + frequency_unit='W', frequency_quantity=1) pd.testing.assert_index_equal(expected_ds, ds_indices) @@ -429,7 +429,7 @@ def test_get_ds_hourly(self): ds_indices = AbstractArimaModel._get_ds_indices( start_ds=pd.Timestamp("2021-12-10 09:23"), periods=10, - frequency='H', + frequency_unit='H', frequency_quantity=1) pd.testing.assert_index_equal(expected_ds, ds_indices) @@ -447,7 +447,7 @@ def setUp(self) -> None: self.pickled_model = pickle.dumps(model) def test_mlflow_arima_log_model(self): - arima_model = ArimaModel(self.pickled_model, horizon=1, frequency='d', frequency_quantity=1, + arima_model = ArimaModel(self.pickled_model, horizon=1, frequency_unit='d', frequency_quantity=1, start_ds=pd.to_datetime("2020-10-01"), end_ds=pd.to_datetime("2020-10-09"), time_col="date") with mlflow.start_run() as run: @@ -472,7 +472,7 @@ def test_mlflow_arima_log_model_multiseries(self): end_ds_dict = {("1",): pd.Timestamp("2020-10-09"), ("2",): pd.Timestamp("2020-10-09")} multiseries_arima_model = MultiSeriesArimaModel(pickled_model_dict, horizon=1, - frequency='d', + frequency_unit='d', frequency_quantity=1, start_ds_dict=start_ds_dict, end_ds_dict=end_ds_dict, @@ -522,7 +522,7 @@ def setUp(self) -> None: self.quantity_model_pairs = [] for frequency_quantity in frequency_quantities: - dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=self.freq, frequency_quantity=frequency_quantity) + dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=frequency_quantity) df = pd.concat([ pd.Series(dates, name='date'), pd.Series(range(self.num_rows), name="y") @@ -532,7 +532,7 @@ def setUp(self) -> None: pickled_model = pickle.dumps(model) 
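            # Pair each frequency_quantity with its fitted model so the tests
            # below can iterate over every supported multiple of the base unit.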
self.quantity_model_pairs.append((frequency_quantity, ArimaModel(pickled_model, horizon=self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=frequency_quantity, start_ds=self.start_ds, end_ds=dates.max(), @@ -551,7 +551,7 @@ def test_predict_timeseries_success(self): expected_ds = AbstractArimaModel._get_ds_indices( self.start_ds, periods=self.num_rows + self.horizon, - frequency=self.freq, + frequency_unit=self.freq, frequency_quantity=frequency_quantity) self.assertTrue(expected_columns.issubset(set(forecast_pd.columns))) self.assertEqual(10, forecast_pd.shape[0]) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py index 43869184..b06b2c36 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py @@ -218,7 +218,7 @@ def test_fill_missing_time_steps(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency, frequency_quantity=1) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) @@ -232,7 +232,7 @@ def test_fill_missing_time_steps_with_exogenous(self): ) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12), "x": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=frequency, frequency_quantity=1) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertTrue(df_filled["x"][index] == df_filled["x"][index - 1]) @@ -245,22 +245,22 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self): ds = pd.date_range(start=start_ds, periods=12, freq=pd.DateOffset(**{'minutes': quantity})) indices_to_drop = [5, 8] df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True) - df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency='min', frequency_quantity=quantity) + df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit='min', frequency_quantity=quantity) for index in indices_to_drop: self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1]) self.assertEqual(ds.to_list(), df_filled["ds"].to_list()) def test_validate_ds_freq_matched_frequency(self): - ArimaEstimator._validate_ds_freq(self.df, frequency='D', frequency_quantity=1) - ArimaEstimator._validate_ds_freq(self.df_monthly, frequency='month', frequency_quantity=1) - ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency='min', frequency_quantity=5) - ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency='min', frequency_quantity=10) - ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency='min', frequency_quantity=15) - ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency='min', frequency_quantity=30) + ArimaEstimator._validate_ds_freq(self.df, frequency_unit='D', frequency_quantity=1) + 
ArimaEstimator._validate_ds_freq(self.df_monthly, frequency_unit='month', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=5) + ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency_unit='min', frequency_quantity=10) + ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency_unit='min', frequency_quantity=15) + ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency_unit='min', frequency_quantity=30) def test_validate_ds_freq_unmatched_frequency(self): with pytest.raises(ValueError, match="includes different frequency"): - ArimaEstimator._validate_ds_freq(self.df, frequency='W', frequency_quantity=1) + ArimaEstimator._validate_ds_freq(self.df, frequency_unit='W', frequency_quantity=1) with pytest.raises(ValueError, match="includes different frequency"): - ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency='min', frequency_quantity=10) + ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=10) diff --git a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py index 0018f056..7b7b9245 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py @@ -43,7 +43,7 @@ def test_cross_validation_success(self): cutoffs = generate_cutoffs( self.X, horizon=3, - unit="MS", + frequency_unit="MS", seasonal_period=1, seasonal_unit="D", num_folds=3, diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py index 7f15aaa0..2699705f 100644 --- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py +++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py @@ -194,7 +194,7 @@ def setUpClass(cls) -> None: timeseries_starts=cls.multi_series_start, timeseries_end="2020-07-25", horizon=1, - frequency="days", + frequency_unit="days", frequency_quantity=1, time_col="time", id_cols=["id"], @@ -325,7 +325,7 @@ def test_validate_predict_cols(self): timeseries_starts=self.multi_series_start, timeseries_end="2020-07-25", horizon=1, - frequency="days", + frequency_unit="days", frequency_quantity=1, time_col="ds", id_cols=["id1"], @@ -370,7 +370,7 @@ def test_make_future_dataframe_multiple_frequency_quantities(self): timeseries_starts=self.multi_series_start, timeseries_end="2020-07-25", horizon=1, - frequency="min", + frequency_unit="min", frequency_quantity=frequency_quantity, time_col="time", id_cols=["id"], diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py index 147e6c96..c043da9a 100644 --- a/runtime/tests/automl_runtime/forecast/utils_test.py +++ b/runtime/tests/automl_runtime/forecast/utils_test.py @@ -131,11 +131,11 @@ def setUp(self) -> None: ).rename_axis("y").reset_index() def test_generate_cutoffs_success(self): - cutoffs = generate_cutoffs(self.X, horizon=7, unit="D", num_folds=3, seasonal_period=7) + cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=3, seasonal_period=7) self.assertEqual([pd.Timestamp('2020-08-16 00:00:00'), pd.Timestamp('2020-08-19 12:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs) def test_generate_cutoffs_success_large_num_folds(self): - cutoffs = generate_cutoffs(self.X, horizon=7, unit="D", num_folds=20, 
seasonal_period=1) + cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=20, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-22 12:00:00'), pd.Timestamp('2020-07-26 00:00:00'), pd.Timestamp('2020-07-29 12:00:00'), @@ -151,7 +151,7 @@ def test_generate_cutoffs_success_with_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="D", num_folds=5, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="D", num_folds=5, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-09-13 00:00:00'), pd.Timestamp('2020-09-16 00:00:00'), pd.Timestamp('2020-09-19 00:00:00'), @@ -167,10 +167,10 @@ def test_generate_cutoffs_success_hourly(self): pd.Timestamp('2020-07-07 11:00:00'), pd.Timestamp('2020-07-07 14:00:00'), pd.Timestamp('2020-07-07 17:00:00')] - cutoffs = generate_cutoffs(df, horizon=6, unit="H", num_folds=5, seasonal_period=24) + cutoffs = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5, seasonal_period=24) self.assertEqual(expected_cutoffs, cutoffs) - cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, unit="H", num_folds=5, + cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5, seasonal_period=1, seasonal_unit="D") self.assertEqual(expected_cutoffs, cutoffs_different_seasonal_unit) @@ -178,62 +178,62 @@ def test_generate_cutoffs_success_weekly(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=4, unit="W", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=4, frequency_unit="W", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-16 00:00:00'), pd.Timestamp('2021-05-30 00:00:00')], cutoffs) def test_generate_cutoffs_failure_horizon_too_large(self): with self.assertRaisesRegex(ValueError, "Less data than horizon after initial window. 
" "Make horizon shorter."): - generate_cutoffs(self.X, horizon=20, unit="D", num_folds=3, seasonal_period=1) + generate_cutoffs(self.X, horizon=20, frequency_unit="D", num_folds=3, seasonal_period=1) def test_generate_cutoffs_less_data(self): with self.assertRaisesRegex(ValueError, "Less data than horizon."): - generate_cutoffs(self.X, horizon=100, unit="D", num_folds=3, seasonal_period=1) + generate_cutoffs(self.X, horizon=100, frequency_unit="D", num_folds=3, seasonal_period=1) def test_generate_cutoffs_success_monthly(self): df = pd.DataFrame( pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=2, unit="MS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=2, frequency_unit="MS", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-08-12 00:00:00'), pd.Timestamp('2021-9-12 00:00:00'), pd.Timestamp('2021-10-12 00:00:00')], cutoffs) def test_generate_cutoffs_success_quaterly(self): df = pd.DataFrame( pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="QS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="QS", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2021-10-12 00:00:00'), pd.Timestamp('2022-01-12 00:00:00'), pd.Timestamp('2022-04-12 00:00:00')], cutoffs) def test_generate_cutoffs_success_annualy(self): df = pd.DataFrame( pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="YS", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="YS", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2018-07-14 00:00:00'), pd.Timestamp('2019-07-14 00:00:00'), pd.Timestamp('2020-07-14 00:00:00')], cutoffs) def test_generate_cutoffs_success_with_multiple_frequency_quantities(self): df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:44:00'), pd.Timestamp('2020-07-01 23:49:00'), pd.Timestamp('2020-07-01 23:54:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:39:00'), pd.Timestamp('2020-07-01 23:49:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 23:14:00'), pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:44:00')], cutoffs) df = pd.DataFrame( 
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_cutoffs(df, horizon=1, unit="min", num_folds=3, seasonal_period=1) + cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1) self.assertEqual([pd.Timestamp('2020-07-01 22:29:00'), pd.Timestamp('2020-07-01 22:59:00'), pd.Timestamp('2020-07-01 23:29:00')], cutoffs) class TestTestGenerateCustomCutoffs(unittest.TestCase): @@ -246,81 +246,81 @@ def test_generate_custom_cutoffs_success_hourly(self): pd.Timestamp('2020-07-07 14:00:00'), pd.Timestamp('2020-07-07 15:00:00'), pd.Timestamp('2020-07-07 16:00:00')] - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="H", split_cutoff=pd.Timestamp('2020-07-07 13:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="H", split_cutoff=pd.Timestamp('2020-07-07 13:00:00')) self.assertEqual(expected_cutoffs, cutoffs) def test_generate_custom_cutoffs_success_daily(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", end="2020-08-30", freq='d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="D", split_cutoff=pd.Timestamp('2020-08-21 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-21 00:00:00')) self.assertEqual([pd.Timestamp('2020-08-21 00:00:00'), pd.Timestamp('2020-08-22 00:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_small_horizon(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", end="2020-08-30", freq='2d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, unit="D", split_cutoff=pd.Timestamp('2020-08-26 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-26 00:00:00')) self.assertEqual([pd.Timestamp('2020-08-27 00:00:00'), pd.Timestamp('2020-08-29 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_weekly(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="W", split_cutoff=pd.Timestamp('2021-04-25 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="W", split_cutoff=pd.Timestamp('2021-04-25 00:00:00')) self.assertEqual([pd.Timestamp('2021-04-25 00:00:00'), pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-09 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_monthly(self): df = pd.DataFrame( pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="MS", split_cutoff=pd.Timestamp('2021-03-12 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="MS", split_cutoff=pd.Timestamp('2021-03-12 00:00:00')) self.assertEqual([pd.Timestamp('2021-03-12 00:00:00'), pd.Timestamp('2021-04-12 00:00:00'), pd.Timestamp('2021-05-12 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_quaterly(self): df = pd.DataFrame( pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="QS", split_cutoff=pd.Timestamp('2020-07-12 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, 
frequency_unit="QS", split_cutoff=pd.Timestamp('2020-07-12 00:00:00')) self.assertEqual([pd.Timestamp('2020-07-12 00:00:00'), pd.Timestamp('2020-10-12 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_annualy(self): df = pd.DataFrame( pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="YS", split_cutoff=pd.Timestamp('2012-07-14 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="YS", split_cutoff=pd.Timestamp('2012-07-14 00:00:00')) self.assertEqual([pd.Timestamp('2012-07-14 00:00:00'), pd.Timestamp('2013-07-14 00:00:00'), pd.Timestamp('2014-07-14 00:00:00')], cutoffs) def test_generate_custom_cutoffs_success_with_multiple_frequency_quantities(self): df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=5, split_cutoff=pd.Timestamp('2020-07-01 23:45:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=5, split_cutoff=pd.Timestamp('2020-07-01 23:45:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:45:00'), pd.Timestamp('2020-07-01 23:50:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=10, split_cutoff=pd.Timestamp('2020-07-01 23:30:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=10, split_cutoff=pd.Timestamp('2020-07-01 23:30:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:30:00'), pd.Timestamp('2020-07-01 23:40:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=15, split_cutoff=pd.Timestamp('2020-07-01 23:15:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=15, split_cutoff=pd.Timestamp('2020-07-01 23:15:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:15:00'), pd.Timestamp('2020-07-01 23:30:00')], cutoffs) df = pd.DataFrame( pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=1, unit="min", frequency_quantity=30, split_cutoff=pd.Timestamp('2020-07-01 23:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=30, split_cutoff=pd.Timestamp('2020-07-01 23:00:00')) self.assertEqual([pd.Timestamp('2020-07-01 23:00:00')], cutoffs) def test_generate_custom_cutoffs_success_with_small_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="D", split_cutoff=pd.Timestamp('2020-09-17 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-09-17 00:00:00')) self.assertEqual([pd.Timestamp('2020-09-17 00:00:00'), pd.Timestamp('2020-09-18 00:00:00'), pd.Timestamp('2020-09-19 00:00:00')], cutoffs) @@ -329,7 +329,7 @@ def 
test_generate_custom_cutoffs_success_with_large_gaps(self): df = pd.DataFrame( pd.date_range(start="2020-07-01", periods=30, freq='9d'), columns=["ds"] ).rename_axis("y").reset_index() - cutoffs = generate_custom_cutoffs(df, horizon=7, unit="D", split_cutoff=pd.Timestamp('2021-03-08 00:00:00')) + cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2021-03-08 00:00:00')) self.assertEqual([pd.Timestamp('2021-03-08 00:00:00'), pd.Timestamp('2021-03-09 00:00:00'), pd.Timestamp('2021-03-12 00:00:00')], cutoffs) @@ -417,7 +417,7 @@ def test_make_single_future_dataframe(self): start_time=pd.to_datetime('2022-01-01'), end_time=pd.to_datetime('2022-01-04'), horizon=1, - frequency="d", + frequency_unit="d", frequency_quantity=1, include_history=False, column_name="test_date" @@ -430,7 +430,7 @@ def test_make_single_future_dataframe(self): start_time=pd.to_datetime('2022-01-01'), end_time=pd.to_datetime('2022-01-04'), horizon=1, - frequency="d", + frequency_unit="d", frequency_quantity=1, include_history=True, column_name="test_date" @@ -447,7 +447,7 @@ def test_make_single_future_dataframe_with_different_freq(self): start_time=start_time, end_time=end_time, horizon=1, - frequency=freq, + frequency_unit=freq, frequency_quantity=1, include_history=True, column_name="test_date" @@ -464,7 +464,7 @@ def test_make_single_future_dataframe_with_different_frequency_quantities(self): start_time=start_time, end_time=end_time, horizon=1, - frequency="min", + frequency_unit="min", frequency_quantity=frequency_quantity, include_history=True, column_name="test_date" @@ -497,7 +497,7 @@ def test_make_future_dataframe(self, start_time, end_time, start_time=start_time, end_time=end_time, horizon=1, - frequency=frequency_unit, + frequency_unit=frequency_unit, frequency_quantity=frequency_quantity, groups=groups, identity_column_names=identity_column_names, From ef7511575ee171bd1a0fef241cf00d6fd90316fb Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 23:10:51 -0800 Subject: [PATCH 09/11] pr comment --- runtime/databricks/automl_runtime/forecast/prophet/model.py | 6 +++++- runtime/databricks/automl_runtime/forecast/utils.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py index 72463aee..0696bef3 100644 --- a/runtime/databricks/automl_runtime/forecast/prophet/model.py +++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py @@ -45,7 +45,11 @@ class ProphetModel(ForecastModel): Prophet mlflow model wrapper for univariate forecasting. 
""" - def __init__(self, model_json: Union[Dict[Tuple, str], str], horizon: int, frequency_unit: str, frequency_quantity: int, + def __init__(self, + model_json: Union[Dict[Tuple, str], str], + horizon: int, + frequency_unit: str, + frequency_quantity: int, time_col: str) -> None: """ Initialize the mlflow Python model wrapper for mlflow diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index e8165cec..844f527a 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -95,7 +95,7 @@ def make_single_future_dataframe( date_rng = pd.date_range( start=start_time, - end=end_time + timestep_offset*horizon, + end=end_time + timestep_offset * horizon, freq=timestep_offset ) return pd.DataFrame(date_rng, columns=[column_name]) @@ -161,7 +161,7 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str, period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*period else: offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency_unit])[0]: period} - period_dateoffset = pd.DateOffset(**offset_kwarg)*frequency_quantity + period_dateoffset = pd.DateOffset(**offset_kwarg) * frequency_quantity horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*horizon @@ -252,6 +252,7 @@ def is_frequency_consistency( evenly divisible by the period. """ periods = calculate_period_differences(start_time, end_time, frequency_unit, frequency_quantity) + # If the difference between start and end time is divisible by the period time diff = pd.to_datetime(end_time) - pd.DateOffset( **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] ) * periods * frequency_quantity == pd.to_datetime(start_time) From b02ab4912d03db075e2b1bc650d67406641bed87 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Tue, 11 Feb 2025 23:14:46 -0800 Subject: [PATCH 10/11] pr comment --- runtime/tests/automl_runtime/forecast/pmdarima/training_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py index b06b2c36..b79ab967 100644 --- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py +++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py @@ -110,7 +110,7 @@ def test_fit_success_with_split_cutoff(self): frequency_unit=freq, frequency_quantity=frequency_quantity, metric="smape", - seasonal_periods=[1], + seasonal_periods=[1, 7], num_folds=2, split_cutoff=pd.Timestamp(split_cutoff)) results_pd = arima_estimator.fit(df) From 5356b0440af0e019168107a52560329a20a483e7 Mon Sep 17 00:00:00 2001 From: Lan Zhang Date: Wed, 12 Feb 2025 11:23:09 -0800 Subject: [PATCH 11/11] pr comment --- runtime/databricks/automl_runtime/forecast/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py index 844f527a..195d35a9 100644 --- a/runtime/databricks/automl_runtime/forecast/utils.py +++ b/runtime/databricks/automl_runtime/forecast/utils.py @@ -253,9 +253,9 @@ def is_frequency_consistency( """ periods = calculate_period_differences(start_time, end_time, frequency_unit, frequency_quantity) # If the difference between start and end time is divisible by the period time - diff = pd.to_datetime(end_time) - pd.DateOffset( + diff = 
(pd.to_datetime(end_time) - pd.DateOffset( **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]] - ) * periods * frequency_quantity == pd.to_datetime(start_time) + ) * periods * frequency_quantity) == pd.to_datetime(start_time) return diff @@ -277,4 +277,5 @@ def calculate_period_differences( start_time = pd.to_datetime(start_time) end_time = pd.to_datetime(end_time) freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency_unit]] + # It is intended to get the floor value. And in the later check we will use this floor value to find out if it is not consistent. return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency_quantity
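
A minimal sketch of how the two helpers patched above interact, assuming they
are importable as databricks.automl_runtime.forecast.utils and that the "min"
alias resolves through OFFSET_ALIAS_MAP/PERIOD_ALIAS_MAP, as the tests in this
series exercise:

    import pandas as pd
    from databricks.automl_runtime.forecast.utils import (
        calculate_period_differences,
        is_frequency_consistency,
    )

    start = pd.Timestamp("2020-07-01 00:00:00")

    # 50 minutes at a 10-minute granularity is exactly 5 periods, so the
    # interval divides evenly and the frequency is consistent.
    calculate_period_differences(start, pd.Timestamp("2020-07-01 00:50:00"), "min", 10)  # 5
    is_frequency_consistency(start, pd.Timestamp("2020-07-01 00:50:00"), "min", 10)      # True

    # 55 minutes floors to 5 periods; stepping 5 * 10 minutes back from the
    # end does not land on the start, so the frequency is flagged inconsistent.
    is_frequency_consistency(start, pd.Timestamp("2020-07-01 00:55:00"), "min", 10)      # False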