Skip to content

[ML-49316] Support MonthMid and MonthEnd for DeepAR #159

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion runtime/databricks/automl_runtime/forecast/deepar/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,50 @@
import pandas as pd


def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str):
    """
    Generate a complete time index for the given DataFrame based on the specified frequency.

    For any frequency other than "MS" (compared case-insensitively) the index is simply
    ``pd.date_range`` from the minimum to the maximum value of ``time_col``.

    For "MS" the day-of-month pattern of the data is detected and preserved:

    - If every timestamp falls on the same day of the month, one timestamp per month is
      generated on that day. In months that are shorter than that day (e.g. day 30 or 31
      in February) the last day of the month is used instead of raising an error.
    - Otherwise, if every timestamp is the last day of its month, one month-end timestamp
      per month is generated.
    - Otherwise the pattern is considered inconsistent and a ``ValueError`` is raised.

    Side effect: for "MS" frequency, ``df[time_col]`` is converted to datetime in place.

    :param df: The input DataFrame containing the time column.
    :param time_col: The name of the time column.
    :param frequency: The frequency of the time series.
    :return: A complete time index covering the full range of the dataset.
    :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency.
    """
    if frequency.upper() != "MS":
        return pd.date_range(df[time_col].min(), df[time_col].max(), freq=frequency)

    # Ensure datetime format. This assignment intentionally stays in place on the
    # caller's DataFrame so downstream indexing sees datetime values.
    df[time_col] = pd.to_datetime(df[time_col])
    timestamps = df[time_col]

    unique_days = timestamps.dt.day.unique()

    if len(unique_days) == 1:
        # All dates share the same day-of-month, considered consistent.
        day_of_month = int(unique_days[0])
    elif ((timestamps + pd.offsets.MonthEnd(0)) == timestamps).all():
        # Every date is the last day of its month; None marks the month-end pattern.
        day_of_month = None
    else:
        raise ValueError("Inconsistent day of the month found in time column.")

    # One anchor per calendar month between the first and last observation.
    total_min, total_max = timestamps.min(), timestamps.max()
    month_starts = pd.date_range(start=total_min.to_period("M").to_timestamp(),
                                 end=total_max.to_period("M").to_timestamp(),
                                 freq="MS")

    if day_of_month is None:
        return month_starts + pd.offsets.MonthEnd(0)

    # Clamp to the month length so that e.g. day 31 maps to Feb 28/29 rather than
    # failing with "day is out of range for month" when filling a short month.
    return month_starts.map(
        lambda d: d.replace(day=min(day_of_month, d.days_in_month))
    )

def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
frequency: str,
id_cols: Optional[List[str]] = None):
Expand All @@ -42,7 +86,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
weekday_name = total_min.strftime("%a").upper() # e.g., "FRI"
frequency = f"W-{weekday_name}"

new_index_full = pd.date_range(total_min, total_max, freq=frequency)
new_index_full = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency)

if id_cols is not None:
df_dict = {}
Expand Down
105 changes: 105 additions & 0 deletions runtime/tests/automl_runtime/forecast/deepar/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,108 @@ def test_single_series_week_day_index(self):

# Assert equality
pd.testing.assert_frame_equal(transformed_df, expected_df)

def test_single_series_month_start_index(self):
    """Gaps in a month-start series are re-inserted as NaN rows on the 1st of each month."""
    time_col = "date"
    target_col = "sales"
    num_months = 24
    missing_rows = [3, 4]

    # Complete monthly series: first day of each month, starting January 2020
    full_df = pd.DataFrame({
        time_col: pd.date_range(start="2020-01-01", periods=num_months, freq="MS"),
        target_col: range(num_months),
    })

    # Input with two consecutive months removed
    observed_df = full_df.drop(missing_rows).reset_index(drop=True)

    transformed_df = set_index_and_fill_missing_time_steps(
        observed_df,
        time_col,
        "MS",  # Monthly frequency
    )

    # Expected: the full series with the removed months present but empty
    expected_df = full_df.copy()
    expected_df.loc[missing_rows, target_col] = float("nan")
    expected_df = expected_df.set_index(time_col).rename_axis(None)

    pd.testing.assert_frame_equal(transformed_df, expected_df)

def test_single_series_month_mid_index(self):
    """Gaps in a series observed on the 15th of each month are re-inserted on the 15th."""
    time_col = "date"
    target_col = "sales"
    num_months = 24
    missing_rows = [3, 4]

    # Complete monthly series: fifteenth day of each month, starting January 2020
    month_starts = pd.date_range(start="2020-01-01", periods=num_months, freq="MS")
    mid_month_dates = month_starts + pd.DateOffset(days=14)
    full_df = pd.DataFrame({
        time_col: mid_month_dates,
        target_col: range(num_months),
    })

    # Input with two consecutive months removed
    observed_df = full_df.drop(missing_rows).reset_index(drop=True)

    transformed_df = set_index_and_fill_missing_time_steps(
        observed_df,
        time_col,
        "MS",
    )

    # Expected: the full series with the removed months present but empty
    expected_df = full_df.copy()
    expected_df.loc[missing_rows, target_col] = float("nan")
    expected_df = expected_df.set_index(time_col).rename_axis(None)

    pd.testing.assert_frame_equal(transformed_df, expected_df)

def test_single_series_month_end_index(self):
    """Gaps in a month-end series are re-inserted as NaN rows on the last day of each month."""
    time_col = "date"
    target_col = "sales"
    num_months = 24
    missing_rows = [3, 4]

    # Complete monthly series: last day of each month, starting January 2020
    full_df = pd.DataFrame({
        time_col: pd.date_range(start="2020-01-01", periods=num_months, freq="M"),
        target_col: range(num_months),
    })

    # Input with two consecutive months removed
    observed_df = full_df.drop(missing_rows).reset_index(drop=True)

    transformed_df = set_index_and_fill_missing_time_steps(
        observed_df,
        time_col,
        "MS",  # Monthly frequency
    )

    # Expected: the full series with the removed months present but empty
    expected_df = full_df.copy()
    expected_df.loc[missing_rows, target_col] = float("nan")
    expected_df = expected_df.set_index(time_col).rename_axis(None)

    pd.testing.assert_frame_equal(transformed_df, expected_df)