Skip to content

[ML-49316] Support MonthMid and MonthEnd for DeepAR #160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions runtime/databricks/automl_runtime/forecast/deepar/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,50 @@
import pandas as pd


def validate_and_generate_index(df: pd.DataFrame, time_col: str, frequency: str):
    """
    Generate a complete time index for the given DataFrame based on the specified frequency.

    For any frequency other than "MS" (compared case-insensitively) the index is
    simply ``pd.date_range`` from the minimum to the maximum value of the time column.

    For "MS" the observed dates must follow a single monthly pattern:

    - Every date falls on the same day of the month: the index contains that day
      for each month in the range. In months that are shorter than that day
      (e.g. day 30 in February) the last day of the month is used instead.
    - Otherwise, every date is the last day of its month: the index contains the
      month end of each month in the range.

    Note: for "MS" the time column of ``df`` is converted to datetime in place.

    :param df: The input DataFrame containing the time column.
    :param time_col: The name of the time column.
    :param frequency: The frequency of the time series.
    :return: A complete time index covering the full range of the dataset.
    :raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency.
    """
    if frequency.upper() != "MS":
        return pd.date_range(df[time_col].min(), df[time_col].max(), freq=frequency)

    df[time_col] = pd.to_datetime(df[time_col])  # Ensure datetime format
    timestamps = df[time_col]

    # Detect the monthly pattern; ``None`` stands for "last day of each month".
    unique_days = timestamps.dt.day.unique()
    if len(unique_days) == 1:
        # All dates have the same day-of-month, considered consistent
        day_of_month = int(unique_days[0])
    elif ((timestamps + pd.offsets.MonthEnd(0)) == timestamps).all():
        # Days differ, but every date is the last day of its month
        day_of_month = None
    else:
        raise ValueError("Inconsistent day of the month found in time column.")

    # One entry per calendar month between the first and last observation
    first_month = timestamps.min().to_period("M").to_timestamp()
    last_month = timestamps.max().to_period("M").to_timestamp()
    month_starts = pd.date_range(start=first_month, end=last_month, freq="MS")

    if day_of_month is None:
        return month_starts + pd.offsets.MonthEnd(0)

    # Clamp to the month length so that days 29-31 do not raise
    # "day is out of range for month" when the range crosses a shorter month.
    return month_starts.map(
        lambda d: d.replace(day=min(day_of_month, d.days_in_month))
    )

def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
frequency: str,
id_cols: Optional[List[str]] = None):
Expand All @@ -42,7 +86,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
weekday_name = total_min.strftime("%a").upper() # e.g., "FRI"
frequency = f"W-{weekday_name}"

new_index_full = pd.date_range(total_min, total_max, freq=frequency)
new_index_full = validate_and_generate_index(df=df, time_col=time_col, frequency=frequency)

if id_cols is not None:
df_dict = {}
Expand All @@ -59,4 +103,10 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
df = df.set_index(time_col).sort_index()

# Fill in missing time steps between the min and max time steps
return df.reindex(new_index_full)
df = df.reindex(new_index_full)

if frequency.upper() == "MS":
# Truncate the day of month to avoid issues with pandas frequency check
df = df.to_period("M")

return df
109 changes: 109 additions & 0 deletions runtime/tests/automl_runtime/forecast/deepar/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,112 @@ def test_single_series_week_day_index(self):

# Assert equality
pd.testing.assert_frame_equal(transformed_df, expected_df)

def test_single_series_month_start_index(self):
    """Missing month-start rows are re-inserted as NaN and the index becomes monthly periods."""
    time_col, target_col = "date", "sales"
    num_months = 24
    missing_rows = [3, 4]

    # Complete series: first day of each month, starting January 2020
    full_df = pd.DataFrame({
        time_col: pd.date_range(start='2020-01-01', periods=num_months, freq='MS'),
        target_col: range(num_months),
    })

    # Input under test: the complete series with two rows removed
    sparse_df = full_df.drop(missing_rows).reset_index(drop=True)
    actual_df = set_index_and_fill_missing_time_steps(sparse_df, time_col, "MS")

    # Expectation: the removed rows come back with NaN targets, indexed by month period
    wanted_df = full_df.copy()
    wanted_df.loc[missing_rows, target_col] = float('nan')
    wanted_df = wanted_df.set_index(time_col).rename_axis(None).to_period("M")

    pd.testing.assert_frame_equal(actual_df, wanted_df)

def test_single_series_month_mid_index(self):
    """Missing mid-month (15th) rows are re-inserted as NaN and the index becomes monthly periods."""
    time_col, target_col = "date", "sales"
    num_months = 24
    missing_rows = [3, 4]

    # Complete series: the fifteenth of each month, starting January 2020
    month_starts = pd.date_range(start='2020-01-01', periods=num_months, freq='MS')
    full_df = pd.DataFrame({
        time_col: month_starts + pd.DateOffset(days=14),
        target_col: range(num_months),
    })

    # Input under test: the complete series with two rows removed
    sparse_df = full_df.drop(missing_rows).reset_index(drop=True)
    actual_df = set_index_and_fill_missing_time_steps(sparse_df, time_col, "MS")

    # Expectation: the removed rows come back with NaN targets, indexed by month period
    wanted_df = full_df.copy()
    wanted_df.loc[missing_rows, target_col] = float('nan')
    wanted_df = wanted_df.set_index(time_col).rename_axis(None).to_period("M")

    pd.testing.assert_frame_equal(actual_df, wanted_df)

def test_single_series_month_end_index(self):
    """Missing month-end rows are re-inserted as NaN and the index becomes monthly periods."""
    target_col = "sales"
    time_col = "date"
    num_months = 24

    # Last calendar day of each month, starting from 2020-01-31.
    # The dates are built by rolling each month start forward with MonthEnd(0)
    # rather than with freq='M': newer pandas releases deprecate the 'M' alias
    # in favour of 'ME', while older releases do not know 'ME'.
    base_dates = pd.date_range(
        start='2020-01-01',
        periods=num_months,
        freq='MS'
    ) + pd.offsets.MonthEnd(0)

    base_df = pd.DataFrame({
        time_col: base_dates,
        target_col: range(num_months)
    })

    # Create a dataframe with missing months (drop rows 3 and 4)
    dropped_df = base_df.drop([3, 4]).reset_index(drop=True)

    # Transform the dataframe
    transformed_df = set_index_and_fill_missing_time_steps(
        dropped_df,
        time_col,
        "MS"  # Monthly frequency
    )

    # Create expected dataframe
    expected_df = base_df.copy()
    expected_df.loc[[3, 4], target_col] = float('nan')
    expected_df = expected_df.set_index(time_col).rename_axis(None)
    expected_df = expected_df.to_period("M")

    # Assert equality
    pd.testing.assert_frame_equal(transformed_df, expected_df)