diff --git a/aeon/datasets/Final Dataset Selection.csv b/aeon/datasets/Final Dataset Selection.csv new file mode 100644 index 0000000000..c336db5a22 --- /dev/null +++ b/aeon/datasets/Final Dataset Selection.csv @@ -0,0 +1,101 @@ +Dataset,Series,Category +weather_dataset,T1,Weather +weather_dataset,T2,Weather +weather_dataset,T3,Weather +weather_dataset,T4,Weather +weather_dataset,T5,Weather +solar_10_minutes_dataset,T1,Energy Production +solar_10_minutes_dataset,T2,Energy Production +solar_10_minutes_dataset,T3,Energy Production +solar_10_minutes_dataset,T4,Energy Production +solar_10_minutes_dataset,T5,Energy Production +sunspot_dataset_without_missing_values,T1,Other +wind_farms_minutely_dataset_without_missing_values,T1,Energy Production +wind_farms_minutely_dataset_without_missing_values,T2,Energy Production +wind_farms_minutely_dataset_without_missing_values,T3,Energy Production +wind_farms_minutely_dataset_without_missing_values,T4,Energy Production +wind_farms_minutely_dataset_without_missing_values,T5,Energy Production +elecdemand_dataset,T1,Energy Demand +us_births_dataset,T1,Demographic +saugeenday_dataset,T1,Weather +london_smart_meters_dataset_without_missing_values,T1,Energy Demand +london_smart_meters_dataset_without_missing_values,T2,Energy Demand +london_smart_meters_dataset_without_missing_values,T3,Energy Demand +traffic_hourly_dataset,T1,Transportation +traffic_hourly_dataset,T2,Transportation +traffic_hourly_dataset,T3,Transportation +traffic_hourly_dataset,T4,Transportation +traffic_hourly_dataset,T5,Transportation +electricity_hourly_dataset,T1,Energy Demand +electricity_hourly_dataset,T2,Energy Demand +electricity_hourly_dataset,T3,Energy Demand +pedestrian_counts_dataset,T1,Transportation +pedestrian_counts_dataset,T2,Transportation +pedestrian_counts_dataset,T3,Transportation +pedestrian_counts_dataset,T4,Transportation +pedestrian_counts_dataset,T5,Transportation +kdd_cup_2018_dataset_without_missing_values,T1,Other +australian_electricity_demand_dataset,T1,Energy Demand +australian_electricity_demand_dataset,T2,Energy Demand +australian_electricity_demand_dataset,T3,Energy Demand +oikolab_weather_dataset,T1,Weather +oikolab_weather_dataset,T2,Weather +oikolab_weather_dataset,T3,Weather +oikolab_weather_dataset,T4,Weather +m4_monthly_dataset,T122,Macro +m4_monthly_dataset,T145,Macro +m4_monthly_dataset,T180,Macro +m4_monthly_dataset,T186,Macro +m4_monthly_dataset,T17051,Micro +m4_monthly_dataset,T17088,Micro +m4_monthly_dataset,T17132,Micro +m4_monthly_dataset,T17146,Micro +m4_monthly_dataset,T26710,Demographic +m4_monthly_dataset,T27138,Industry +m4_monthly_dataset,T27170,Industry +m4_monthly_dataset,T27175,Industry +m4_monthly_dataset,T27186,Industry +m4_monthly_dataset,T37009,Finance +m4_monthly_dataset,T37070,Finance +m4_monthly_dataset,T37238,Finance +m4_monthly_dataset,T37248,Finance +m4_monthly_dataset,T47915,Other +m4_weekly_dataset,T1,Other +m4_weekly_dataset,T2,Other +m4_weekly_dataset,T19,Macro +m4_weekly_dataset,T20,Macro +m4_weekly_dataset,T21,Macro +m4_weekly_dataset,T55,Industry +m4_weekly_dataset,T56,Industry +m4_weekly_dataset,T60,Finance +m4_weekly_dataset,T61,Finance +m4_weekly_dataset,T62,Finance +m4_weekly_dataset,T224,Demographic +m4_weekly_dataset,T225,Demographic +m4_weekly_dataset,T226,Demographic +m4_weekly_dataset,T227,Demographic +m4_weekly_dataset,T248,Micro +m4_weekly_dataset,T249,Micro +m4_weekly_dataset,T250,Micro +m4_daily_dataset,T1,Macro +m4_daily_dataset,T2,Macro +m4_daily_dataset,T6,Macro +m4_daily_dataset,T130,Micro 
+m4_daily_dataset,T131,Micro
+m4_daily_dataset,T145,Micro
+m4_daily_dataset,T1604,Demographic
+m4_daily_dataset,T1605,Demographic
+m4_daily_dataset,T1606,Demographic
+m4_daily_dataset,T1607,Demographic
+m4_daily_dataset,T1614,Industry
+m4_daily_dataset,T1615,Industry
+m4_daily_dataset,T1634,Industry
+m4_daily_dataset,T1650,Industry
+m4_daily_dataset,T2036,Finance
+m4_daily_dataset,T2037,Finance
+m4_daily_dataset,T2041,Finance
+m4_daily_dataset,T3595,Other
+m4_daily_dataset,T3597,Other
+m4_hourly_dataset,T170,Other
+m4_hourly_dataset,T171,Other
+m4_hourly_dataset,T172,Other
diff --git a/aeon/datasets/__init__.py b/aeon/datasets/__init__.py
index a35419017e..f2207900ed 100644
--- a/aeon/datasets/__init__.py
+++ b/aeon/datasets/__init__.py
@@ -16,7 +16,10 @@
     "load_human_activity_segmentation_datasets",
     # Write functions
     "write_to_ts_file",
+    "write_to_tsf_file",
     "write_to_arff_file",
+    "write_regression_dataset",
+    "write_forecasting_dataset",
     # Single problem loaders
     "load_airline",
     "load_arrow_head",
@@ -58,7 +61,13 @@
     load_from_tsv_file,
     load_regression,
 )
-from aeon.datasets._data_writers import write_to_arff_file, write_to_ts_file
+from aeon.datasets._data_writers import (
+    write_forecasting_dataset,
+    write_regression_dataset,
+    write_to_arff_file,
+    write_to_ts_file,
+    write_to_tsf_file,
+)
 from aeon.datasets._single_problem_loaders import (
     load_acsf1,
     load_airline,
diff --git a/aeon/datasets/_data_loaders.py b/aeon/datasets/_data_loaders.py
index 9f87e1d623..0ade092f16 100644
--- a/aeon/datasets/_data_loaders.py
+++ b/aeon/datasets/_data_loaders.py
@@ -979,7 +979,7 @@ def load_forecasting(name, extract_path=None, return_metadata=False):
     >>> X=load_forecasting("m1_yearly_dataset") # doctest: +SKIP
     """
     # Allow user to have non standard extract path
-    from aeon.datasets.tsf_datasets import tsf_all
+    from aeon.datasets.tsf_datasets import tsf_monash

     if extract_path is not None:
         local_module = extract_path
@@ -995,8 +995,8 @@
     if name not in get_downloaded_tsf_datasets(extract_path):
         # Dataset is not already present in the datasets directory provided.
         # If it is not there, download and install it.
-        if name in tsf_all.keys():
-            id = tsf_all[name]
+        if name in tsf_monash.keys():
+            id = tsf_monash[name]
             if extract_path is None:
                 local_dirname = "local_data"
                 if not os.path.exists(os.path.join(local_module, local_dirname)):
diff --git a/aeon/datasets/_data_writers.py b/aeon/datasets/_data_writers.py
index 29ec83e648..fef3e86d3d 100644
--- a/aeon/datasets/_data_writers.py
+++ b/aeon/datasets/_data_writers.py
@@ -1,9 +1,20 @@
+"""Dataset writing functions."""
+
 import os
 import textwrap
+from datetime import datetime

 import numpy as np
+import pandas as pd
+
+from aeon.transformations.format import SlidingWindowTransformer, TrainTestTransformer
+from aeon.transformations.series._difference import DifferencingSeriesTransformer

-__all__ = ["write_to_ts_file", "write_to_arff_file"]
+__all__ = [
+    "write_to_ts_file",
+    "write_to_tsf_file",
+    "write_to_arff_file",
+]


 def write_to_ts_file(
@@ -83,7 +94,6 @@
         class_labels=class_labels,
         comment=header,
         regression=regression,
-        extension=None,
     )
     missing_values = "NaN"
     for i in range(n_cases):
@@ -99,6 +109,192 @@
     file.close()


+def write_to_tsf_file(
+    df,
+    full_file_path,
+    metadata,
+    value_column_name="series_value",
+    attributes_types=None,
+    missing_val_symbol="?",
+):
+    """
+    Save a pandas DataFrame in TSF format.
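+
+    The output follows the TSF layout written below: the ``@attribute`` lines
+    come first, then the ``@frequency``, ``@horizon``, ``@missing`` and
+    ``@equallength`` metadata tags, then one line per series after ``@data``.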
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        The DataFrame to be saved. It is assumed that one column contains the series
+        (by default, named "series_value") and all other columns are series attributes.
+    full_file_path : str
+        The full path (including file name) where the TSF file will be saved.
+    metadata : dict
+        A dictionary containing metadata for the forecasting problem. It must
+        include the following keys:
+        - "frequency" (str)
+        - "forecast_horizon" (int)
+        - "contain_missing_values" (bool)
+        - "contain_equal_length" (bool)
+    value_column_name : str, optional (default="series_value")
+        The name of the column that contains the time series values.
+    attributes_types : dict, optional
+        A dictionary mapping attribute column names to their TSF type
+        (one of "numeric", "string", "date").
+        If not provided, the type is inferred from the DataFrame dtypes as follows:
+        - numeric dtypes -> "numeric"
+        - datetime dtypes -> "date"
+        - all others -> "string"
+    missing_val_symbol : str, optional (default="?")
+        The symbol to be used in the file to represent missing values in the series.
+
+    Raises
+    ------
+    AttributeError or ValueError
+        If required metadata or an attribute type is missing, or a value is invalid.
+    """
+    # Validate metadata keys
+    required_meta = [
+        "frequency",
+        "forecast_horizon",
+        "contain_missing_values",
+        "contain_equal_length",
+    ]
+    for key in required_meta:
+        if key not in metadata:
+            raise AttributeError(f"Missing metadata entry: {key}")
+
+    # Determine attribute columns (all columns except the series column)
+    attribute_columns = [col for col in df.columns if col != value_column_name]
+
+    # If no attributes are present, raise an error.
+    if not attribute_columns:
+        raise AttributeError(
+            "The DataFrame must contain at least one "
+            "attribute column besides the series column."
+        )
+
+    # Determine attribute types if not provided.
+    # For each attribute, assign a type:
+    #   - numeric dtypes -> "numeric"
+    #   - datetime dtypes -> "date" (and will be formatted as "%Y-%m-%d %H-%M-%S")
+    #   - all others -> "string"
+    if attributes_types is None:
+        attributes_types = {}
+        for col in attribute_columns:
+            if pd.api.types.is_numeric_dtype(df[col]):
+                attributes_types[col] = "numeric"
+            elif pd.api.types.is_datetime64_any_dtype(df[col]):
+                attributes_types[col] = "date"
+            else:
+                attributes_types[col] = "string"
+    else:
+        # Ensure that a type is provided for each attribute column
+        for col in attribute_columns:
+            if col not in attributes_types:
+                raise ValueError(
+                    f"Attribute type for column '{col}' is "
+                    "missing in attributes_types."
+                )
+
+    # Build header lines for the TSF file.
+    header_lines = []
+    # First, write the attribute lines (order matters!)
+    for col in attribute_columns:
+        att_type = attributes_types[col]
+        if att_type not in {"numeric", "string", "date"}:
+            raise ValueError(
+                f"Unsupported attribute type '{att_type}' for column '{col}'."
+            )
+        header_lines.append(f"@attribute {col} {att_type}")
+
+    # Now add the metadata lines. (The order here is flexible,
+    # but must appear before @data.)
+    header_lines.append(f"@frequency {metadata['frequency']}")
+    header_lines.append(f"@horizon {metadata['forecast_horizon']}")
+    header_lines.append(
+        f"@missing {'true' if metadata['contain_missing_values'] else 'false'}"
+    )
+    header_lines.append(
+        f"@equallength {'true' if metadata['contain_equal_length'] else 'false'}"
+    )

+    # Add the data section tag.
+    header_lines.append("@data")
+    # Open file for writing using the same encoding as the loader.
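+    # Write the header, then one line per DataFrame row in the form
+    # "attr1:attr2:...:v1,v2,...,vn" (see the loop below).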
+    with open(full_file_path, "w", encoding="cp1252") as f:
+        # Write header lines.
+        for line in header_lines:
+            f.write(line + "\n")
+
+        # Process each row to write the data lines.
+        for idx, row in df.iterrows():
+            parts = []
+            # Process each attribute value.
+            for col in attribute_columns:
+                val = row[col]
+                col_type = attributes_types[col]
+                if pd.isna(val):
+                    raise ValueError(
+                        f"Missing value in attribute column '{col}' at row {idx}."
+                    )
+                if col_type == "numeric":
+                    try:
+                        val_str = str(int(val))
+                    except Exception as e:
+                        raise ValueError(
+                            f"Error converting value in column '{col}' "
+                            f"at row {idx} to integer: {e}"
+                        ) from e
+                elif col_type == "date":
+                    # Ensure val is a datetime; if not, attempt conversion.
+                    if not isinstance(val, datetime):
+                        try:
+                            val = pd.to_datetime(val)
+                        except Exception as e:
+                            raise ValueError(
+                                f"Error converting value in column '{col}' "
+                                f"at row {idx} to datetime: {e}"
+                            ) from e
+                    val_str = val.strftime("%Y-%m-%d %H-%M-%S")
+                elif col_type == "string":
+                    val_str = str(val)
+                else:
+                    # Should not get here because we validated types earlier.
+                    raise ValueError(
+                        f"Unsupported attribute type '{col_type}' for column '{col}'."
+                    )
+                parts.append(val_str)
+
+            # Process the series data from value_column_name.
+            series_val = row[value_column_name]
+            if not hasattr(series_val, "__iter__"):
+                raise ValueError(
+                    f"The series in column '{value_column_name}' "
+                    f"at row {idx} is not iterable."
+                )
+
+            series_str_parts = []
+            for s in series_val:
+                # Check for missing values in the series.
+                if pd.isna(s):
+                    series_str_parts.append(missing_val_symbol)
+                else:
+                    series_str_parts.append(str(s).removesuffix(".0"))
+            # Join series values with commas.
+            series_str = ",".join(series_str_parts)
+            parts.append(series_str)
+
+            # The data line consists of the attribute values and
+            # then the series, separated by colons.
+            line_data = ":".join(parts)
+            f.write(line_data + "\n")
+
+
 def _write_header(
     path,
     problem_name,
@@ -108,25 +304,24 @@
     comment=None,
     regression=False,
     class_labels=None,
-    extension=None,
 ):
     if class_labels is not None and regression:
         raise ValueError("Cannot have class_labels true for a regression problem")
     # create path if it does not exist
-    dir = os.path.join(path, "")
+    dir_path = os.path.join(path, "")
     try:
-        os.makedirs(dir, exist_ok=True)
-    except OSError:
-        raise ValueError(f"Error trying to access {dir} in _write_header")
+        os.makedirs(dir_path, exist_ok=True)
+    except OSError as exc:
+        raise ValueError(f"Error trying to access {dir_path} in _write_header") from exc
     # create ts file in the path
-    load_path = os.path.join(dir, problem_name)
-    file = open(load_path, "w")
+    load_path = os.path.join(dir_path, problem_name)
+    file = open(load_path, "w", encoding="utf-8")
     # write comment if any as a block at start of file
     if comment is not None:
         file.write("\n# ".join(textwrap.wrap("# " + comment)))
         file.write("\n")
-    """ Writes the header info for a ts file"""
+    # Writes the header info for a ts file
     file.write(f"@problemName {problem_name}\n")
     file.write("@timestamps false\n")
     file.write(f"@univariate {str(univariate).lower()}\n")
@@ -175,7 +370,7 @@ def write_to_arff_file(
     -------
     None
     """
-    if not (isinstance(X, np.ndarray)):
+    if not isinstance(X, np.ndarray):
         raise TypeError(
             f" Wrong input data type {type(X)}. Convert to np.ndarray (n_cases, "
             f"n_channels, n_timepoints) if possible."
@@ -187,31 +382,94 @@
         f"received {X.shape}"
     )

-    file = open(f"{path}/{problem_name}.arff", "w")
+    with open(f"{path}/{problem_name}.arff", "w", encoding="utf-8") as file:

-    # write comment if any as a block at start of file
-    if header is not None:
-        file.write("\n% ".join(textwrap.wrap("% " + header)))
-        file.write("\n")
+        # write comment if any as a block at start of file
+        if header is not None:
+            file.write("\n% ".join(textwrap.wrap("% " + header)))
+            file.write("\n")

-    # begin writing header information
-    file.write(f"@Relation {problem_name}\n")
+        # begin writing header information
+        file.write(f"@Relation {problem_name}\n")

-    # write each attribute
-    for i in range(X.shape[2]):
-        file.write(f"@attribute att{str(i)} numeric\n")
+        # write each attribute
+        for i in range(X.shape[2]):
+            file.write(f"@attribute att{str(i)} numeric\n")

-    # class attribute if it exists
-    comma_separated_class_label = ",".join(str(label) for label in np.unique(y))
-    file.write(f"@attribute target {{{comma_separated_class_label}}}\n")
+        # class attribute if it exists
+        comma_separated_class_label = ",".join(str(label) for label in np.unique(y))
+        file.write(f"@attribute target {{{comma_separated_class_label}}}\n")

-    # write data
-    file.write("@data\n")
-    for case, target in zip(X, y):
-        # turn attributes into comma-separated row
-        atts = ",".join([str(num) if not np.isnan(num) else "?" for num in case[0]])
-        file.write(str(atts))
-        file.write(f",{target}")
-        file.write("\n")  # start a new line
+        # write data
+        file.write("@data\n")
+        for case, target in zip(X, y):
+            # turn attributes into comma-separated row
+            atts = ",".join([str(num) if not np.isnan(num) else "?" for num in case[0]])
+            file.write(str(atts))
+            file.write(f",{target}")
+            file.write("\n")  # start a new line

-    file.close()
+
+def write_regression_dataset(
+    series, full_file_path, dataset_name, difference_series=False, difference_y=False
+):
+    """Write a regression dataset to file."""
+    train_series, test_series = TrainTestTransformer().fit_transform(series)
+    if difference_series:
+        train_series = DifferencingSeriesTransformer().fit_transform(train_series)
+        test_series = DifferencingSeriesTransformer().fit_transform(test_series)
+    x_train, y_train, _train_indices = SlidingWindowTransformer().fit_transform(
+        train_series
+    )
+    x_test, y_test, _test_indices = SlidingWindowTransformer().fit_transform(
+        test_series
+    )
+    if difference_y:
+        # series[99] is the last value of the first window, assuming the
+        # transformer's default window length of 100.
+        y_train = np.concatenate(
+            (
+                [y_train[0] - train_series[99]],
+                DifferencingSeriesTransformer().fit_transform(y_train),
+            )
+        )
+        y_test = np.concatenate(
+            (
+                [y_test[0] - test_series[99]],
+                DifferencingSeriesTransformer().fit_transform(y_test),
+            )
+        )
+    write_to_ts_file(
+        [[item] for item in x_train],
+        full_file_path,
+        y_train,
+        f"{dataset_name}_TRAIN",
+        None,
+        True,
+    )
+    write_to_ts_file(
+        [[item] for item in x_test],
+        full_file_path,
+        y_test,
+        f"{dataset_name}_TEST",
+        None,
+        True,
+    )
+
+
+def write_forecasting_dataset(
+    series, full_file_path, dataset_name, difference_series=False
+):
+    """Write a forecasting dataset to file."""
+    train_series, test_series = TrainTestTransformer().fit_transform(series)
+    if difference_series:
+        train_series = DifferencingSeriesTransformer().fit_transform(train_series)
+        test_series = DifferencingSeriesTransformer().fit_transform(test_series)
+    train_df = pd.DataFrame(train_series)
+    train_df.to_csv(
+        f"{full_file_path}/{dataset_name}_TRAIN.csv", index=False, header=False
+    )
+    test_df = pd.DataFrame(test_series)
+    test_df.to_csv(
+ f"{full_file_path}/{dataset_name}_TEST.csv", index=False, header=False + ) diff --git a/aeon/datasets/dataset_collections.py b/aeon/datasets/dataset_collections.py index f47dac5cc4..9e163b5229 100644 --- a/aeon/datasets/dataset_collections.py +++ b/aeon/datasets/dataset_collections.py @@ -38,7 +38,7 @@ import aeon from aeon.datasets.tsc_datasets import multivariate, univariate from aeon.datasets.tser_datasets import tser_monash, tser_soton -from aeon.datasets.tsf_datasets import tsf_all +from aeon.datasets.tsf_datasets import tsf_monash MODULE = os.path.join(os.path.dirname(aeon.__file__), "datasets") @@ -75,8 +75,8 @@ def get_available_tser_datasets(name="tser_soton", return_list=True): def get_available_tsf_datasets(name=None): """List available tsf data.""" if name is None: # List them all - return sorted(list(tsf_all)) - return name in tsf_all + return sorted(list(tsf_monash)) + return name in tsf_monash def get_available_tsc_datasets(name=None): diff --git a/aeon/datasets/dataset_generation.py b/aeon/datasets/dataset_generation.py new file mode 100644 index 0000000000..86a3f9efef --- /dev/null +++ b/aeon/datasets/dataset_generation.py @@ -0,0 +1,226 @@ +"""Code to select datasets for regression-based forecasting experiments.""" + +import gc +import os +import tempfile +import time + +import pandas as pd + +from aeon.datasets import load_forecasting +from aeon.datasets._data_writers import ( + write_forecasting_dataset, + write_regression_dataset, +) + +filtered_datasets = [ + "nn5_daily_dataset_without_missing_values", + "nn5_weekly_dataset", + "m1_yearly_dataset", + "m1_quarterly_dataset", + "m1_monthly_dataset", + "m3_yearly_dataset", + "m3_quarterly_dataset", + "m3_monthly_dataset", + "m3_other_dataset", + "m4_yearly_dataset", + "m4_quarterly_dataset", + "m4_monthly_dataset", + "m4_weekly_dataset", + "m4_daily_dataset", + "m4_hourly_dataset", + "tourism_yearly_dataset", + "tourism_quarterly_dataset", + "tourism_monthly_dataset", + "car_parts_dataset_without_missing_values", + "hospital_dataset", + "weather_dataset", + "dominick_dataset", + "fred_md_dataset", + "solar_10_minutes_dataset", + "solar_weekly_dataset", + "solar_4_seconds_dataset", + "wind_4_seconds_dataset", + "sunspot_dataset_without_missing_values", + "wind_farms_minutely_dataset_without_missing_values", + "elecdemand_dataset", + "us_births_dataset", + "saugeenday_dataset", + "covid_deaths_dataset", + "cif_2016_dataset", + "london_smart_meters_dataset_without_missing_values", + "kaggle_web_traffic_dataset_without_missing_values", + "kaggle_web_traffic_weekly_dataset", + "traffic_hourly_dataset", + "traffic_weekly_dataset", + "electricity_hourly_dataset", + "electricity_weekly_dataset", + "pedestrian_counts_dataset", + "kdd_cup_2018_dataset_without_missing_values", + "australian_electricity_demand_dataset", + "covid_mobility_dataset_without_missing_values", + "rideshare_dataset_without_missing_values", + "vehicle_trips_dataset_without_missing_values", + "temperature_rain_dataset_without_missing_values", + "oikolab_weather_dataset", +] + + +def filter_datasets(): + """ + Filter datasets to identify and print time series with more than 1000 data points. + + This function iterates over a list of datasets, loads each dataset, + and checks each time series within it. If a series contains more than 1000 + data points, it is counted as a "hit." The function prints up to 10 matches + per dataset in the format: `,`. 
+ + Returns + ------- + None + The function does not return anything but prints matching dataset + and series names to the console. + + Notes + ----- + - The function introduces a 1-second delay (`time.sleep(1)`) between processing + datasets to control HTTP request frequency. + - Uses `gc.collect()` to explicitly trigger garbage collection, to avoid + running out of memory + """ + num_hits = 0 + for dataset_name in filtered_datasets: + # print(f"{dataset_name}") + time.sleep(1) + dataset_counter = 0 + dataset = load_forecasting(dataset_name) + for index, row in enumerate(dataset["series_value"]): + if len(row) > 1000: + num_hits += 1 + dataset_counter += 1 + if dataset_counter <= 10: + print(f"{dataset_name},{dataset['series_name'][index]}") # noqa + # if dataset_counter > 0: + # print(f"{dataset_name}: Hits: {dataset_counter}") + del dataset + gc.collect() + # print(f"Num hits in datasets: {num_hits}") + + +# filter_datasets() + + +def filter_and_categorise_m4(frequency_type): + """ + Filter and categorize M4 dataset time series. + + Parameters + ---------- + frequency_type : str + The frequency type of the M4 dataset to process. + Accepted values: 'yearly', 'quarterly', 'monthly', 'weekly', 'daily', 'hourly'. + + Returns + ------- + None + The function does not return any values but prints categorized series + information. + + Notes + ----- + - The function constructs an appropriate prefix ('Y', 'Q', 'M', 'W', 'D', 'H') + based on the dataset type to match metadata identifiers. + - Limits printed results to 10 per category. + """ + metadata = pd.read_csv("C:/Users/alexb/Downloads/M4-info.csv") + m4daily = load_forecasting(f"m4_{frequency_type}_dataset") + categories = {} + prefix = "" + if frequency_type == "yearly": + prefix = "Y" + elif frequency_type == "quarterly": + prefix = "Q" + elif frequency_type == "monthly": + prefix = "M" + elif frequency_type == "weekly": + prefix = "W" + elif frequency_type == "daily": + prefix = "D" + elif frequency_type == "hourly": + prefix = "H" + for index, row in enumerate(m4daily["series_value"]): + if len(row) > 1000: + category = metadata.loc[ + metadata["M4id"] == f"{prefix}{m4daily['series_name'][index][1:]}", + "category", + ].values[0] + if category not in categories: + categories[category] = 1 + else: + categories[category] += 1 + if categories[category] <= 10: + print( # noqa + f"m4_{frequency_type}_dataset,\ + {m4daily['series_name'][index]},{category}" + ) + + +# filter_and_categorise_m4('monthly') +# filter_and_categorise_m4('weekly') +# filter_and_categorise_m4('daily') +# filter_and_categorise_m4('hourly') + + +def gen_datasets(problem_type, dataset_folder=None): + """ + Generate windowed train/test split of datasets. + + Returns + ------- + None + The function does not return anything but writes out the train and test + files to the specified directory. + + Notes + ----- + - Requires a CSV file containing a list of the series to process. 
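+
+    Examples
+    --------
+    >>> # A minimal sketch; writes output under ./aeon/datasets/local_data/.
+    >>> gen_datasets("regression")  # doctest: +SKIP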
+ """ + final_series_selection = pd.read_csv("./aeon/datasets/Final Dataset Selection.csv") + current_dataset = "" + dataset = pd.DataFrame() + tmpdir = tempfile.mkdtemp() + folder = problem_type if dataset_folder is None else dataset_folder + location_of_datasets = f"./aeon/datasets/local_data/{folder}" + if not os.path.exists(location_of_datasets): + os.makedirs(location_of_datasets) + with open(f"{location_of_datasets}/windowed_series.txt", "w") as f: + for item in final_series_selection.to_records(index=False): + if current_dataset != item[0]: + dataset = load_forecasting(item[0], tmpdir) + current_dataset = item[0] + print(f"Current Dataset: {current_dataset}") # noqa + f.write(f"{item[0]}_{item[1]}\n") + series = ( + dataset[dataset["series_name"] == item[1]]["series_value"] + .iloc[0] + .to_numpy() + ) + dataset_name = f"{item[0]}_{item[1]}" + full_file_path = f"{location_of_datasets}/{dataset_name}" + if not os.path.exists(full_file_path): + os.makedirs(full_file_path) + if problem_type == "regression": + write_regression_dataset( + series, + full_file_path, + dataset_name, + difference_series=False, + difference_y=True, + ) + elif problem_type == "forecasting": + write_forecasting_dataset( + series, full_file_path, dataset_name, difference_series=False + ) + + +gen_datasets("regression", "part_diff_regression") diff --git a/aeon/datasets/tests/test_data_writers.py b/aeon/datasets/tests/test_data_writers.py index d31700ac2b..e7428a39fc 100644 --- a/aeon/datasets/tests/test_data_writers.py +++ b/aeon/datasets/tests/test_data_writers.py @@ -128,7 +128,6 @@ def test_write_header(): _write_header( tmp, problem_name, - extension=".csv", comment="Hello", regression=True, ) diff --git a/aeon/datasets/tests/test_dataset_collections.py b/aeon/datasets/tests/test_dataset_collections.py index 624870ab5e..bb185fac14 100644 --- a/aeon/datasets/tests/test_dataset_collections.py +++ b/aeon/datasets/tests/test_dataset_collections.py @@ -69,7 +69,7 @@ def test_list_available_tser_datasets(): def test_list_available_tsf_datasets(): """Test recovering lists of available data sets.""" res = get_available_tsf_datasets() - assert len(res) == 53 + assert len(res) == 62 res = get_available_tsf_datasets("FOO") assert not res res = get_available_tsf_datasets("m1_monthly_dataset") diff --git a/aeon/datasets/tsad_datasets.py b/aeon/datasets/tsad_datasets.py index 4372772dc5..8f10af3eaf 100644 --- a/aeon/datasets/tsad_datasets.py +++ b/aeon/datasets/tsad_datasets.py @@ -67,7 +67,7 @@ def tsad_collections() -> dict[str, list[str]]: df = _load_indexfile() return ( df.groupby("collection_name") - .apply(lambda x: x["dataset_name"].to_list(), include_groups=False) + .apply(lambda x: x["dataset_name"].to_list()) .to_dict() ) diff --git a/aeon/datasets/tsf_datasets.py b/aeon/datasets/tsf_datasets.py index b5c008c3dd..ce3248e90e 100644 --- a/aeon/datasets/tsf_datasets.py +++ b/aeon/datasets/tsf_datasets.py @@ -1,6 +1,6 @@ """Datasets in the Monash tser data archives.""" -tsf_all = { +tsf_monash = { "nn5_daily_dataset_with_missing_values": 4656110, "nn5_daily_dataset_without_missing_values": 4656117, "nn5_weekly_dataset": 4656125, @@ -54,4 +54,17 @@ "australian_electricity_demand_dataset": 4659727, "covid_mobility_dataset_with_missing_values": 4663762, "covid_mobility_dataset_without_missing_values": 4663809, + "bitcoin_dataset_with_missing_values": 5121965, + "bitcoin_dataset_without_missing_values": 5122101, + "rideshare_dataset_with_missing_values": 5122114, + "rideshare_dataset_without_missing_values": 5122232, + 
"vehicle_trips_dataset_with_missing_values": 5122535, + "vehicle_trips_dataset_without_missing_values": 5122537, + "temperature_rain_dataset_with_missing_values": 5129073, + "temperature_rain_dataset_without_missing_values": 5129091, + "oikolab_weather_dataset": 5184708, + # These datasets generate HTTP Error 404: NOT FOUND errors + # "extended_wikipedia_web_traffic_daily_dataset_with_missing_values": 7370977, + # "extended_wikipedia_web_traffic_daily_dataset_without_missing_values": 7371038, + # "residential_power_and_battery_data": 8219786, } diff --git a/aeon/forecasting/__init__.py b/aeon/forecasting/__init__.py index 7a331f69e6..937e952291 100644 --- a/aeon/forecasting/__init__.py +++ b/aeon/forecasting/__init__.py @@ -5,9 +5,19 @@ "BaseForecaster", "RegressionForecaster", "ETSForecaster", + "AutoETSForecaster", + "ARIMAForecaster", + "SARIMAForecaster", + "AutoARIMAForecaster", + "AutoSARIMAForecaster", ] +from aeon.forecasting._arima import ARIMAForecaster +from aeon.forecasting._auto_arima import AutoARIMAForecaster +from aeon.forecasting._auto_sarima import AutoSARIMAForecaster +from aeon.forecasting._autoets import AutoETSForecaster from aeon.forecasting._ets import ETSForecaster from aeon.forecasting._naive import NaiveForecaster from aeon.forecasting._regression import RegressionForecaster +from aeon.forecasting._sarima import SARIMAForecaster from aeon.forecasting.base import BaseForecaster diff --git a/aeon/forecasting/_arima.py b/aeon/forecasting/_arima.py new file mode 100644 index 0000000000..57af81c212 --- /dev/null +++ b/aeon/forecasting/_arima.py @@ -0,0 +1,273 @@ +"""ARIMAForecaster. + +An implementation of the ARIMA forecasting algorithm. +""" + +__maintainer__ = ["alexbanwell1", "TonyBagnall"] +__all__ = ["ARIMAForecaster"] + +import numpy as np +from numba import njit + +from aeon.forecasting.base import BaseForecaster +from aeon.utils.optimisation._nelder_mead import nelder_mead + + +class ARIMAForecaster(BaseForecaster): + """AutoRegressive Integrated Moving Average (ARIMA) forecaster. + + The model automatically selects the parameters of the model based + on information criteria, such as AIC. + + Parameters + ---------- + p : int, default=1, + Autoregressive (p) order of the ARIMA model + d : int, default=0, + Differencing (d) order of the ARIMA model + q : int, default=1, + Moving average (q) order of the ARIMA model + constant_term: bool = False, + Presence of a constant/intercept term in the model. + horizon : int, default=1 + The forecasting horizon, i.e., the number of steps ahead to predict. + + Attributes + ---------- + data_ : np.ndarray + Original training series values. + differenced_data_ : np.ndarray + Differenced version of the training data used for stationarity. + residuals_ : np.ndarray + Residual errors from the fitted model. + aic_ : float + Akaike Information Criterion for the selected model. + p, d, q : int + Parameters passed to the forecaster see p_, d_, q_. + p_, d_, q_ : int + Orders of the ARIMA model: autoregressive (p), differencing (d), + and moving average (q) terms. + constant_term : bool + Parameters passed to the forecaster see constant_term_. + constant_term_ : bool + Whether to include a constant/intercept term in the model. + c_ : float + Estimated constant term (internal use). + phi_ : np.ndarray + Coefficients for the non-seasonal autoregressive terms. + theta_ : np.ndarray + Coefficients for the non-seasonal moving average terms. + + References + ---------- + .. [1] R. J. Hyndman and G. 
Athanasopoulos,
+       Forecasting: Principles and Practice. OTexts, 2014.
+       https://otexts.com/fpp3/
+
+    Examples
+    --------
+    >>> from aeon.forecasting import ARIMAForecaster
+    >>> from aeon.datasets import load_airline
+    >>> y = load_airline()
+    >>> forecaster = ARIMAForecaster(p=2,d=1)
+    >>> forecaster.fit(y)
+    ARIMAForecaster(d=1, p=2)
+    >>> forecaster.predict()
+    474.49449...
+    """
+
+    def __init__(
+        self,
+        p: int = 1,
+        d: int = 0,
+        q: int = 1,
+        constant_term: bool = False,
+    ):
+        super().__init__(horizon=1, axis=1)
+        self.data_ = []
+        self.differenced_data_ = []
+        self.residuals_ = []
+        self.fitted_values_ = []
+        self.aic_ = 0
+        self.p = p
+        self.d = d
+        self.q = q
+        self.constant_term = constant_term
+        self.p_ = 0
+        self.d_ = 0
+        self.q_ = 0
+        self.constant_term_ = False
+        self.model_ = []
+        self.c_ = 0
+        self.phi_ = 0
+        self.theta_ = 0
+        self.parameters_ = []
+
+    def _fit(self, y, exog=None):
+        """Fit ARIMA forecaster to series y.
+
+        Fit a forecaster to predict self.horizon steps ahead using y.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            A time series on which to learn a forecaster to predict horizon ahead
+        exog : np.ndarray, default =None
+            Optional exogenous time series data assumed to be aligned with y
+
+        Returns
+        -------
+        self
+            Fitted ARIMAForecaster.
+        """
+        self.p_ = self.p
+        self.d_ = self.d
+        self.q_ = self.q
+        self.constant_term_ = self.constant_term
+        self.data_ = np.array(y.squeeze(), dtype=np.float64)
+        self.model_ = np.array(
+            (1 if self.constant_term else 0, self.p, self.q), dtype=np.int32
+        )
+        self.differenced_data_ = np.diff(self.data_, n=self.d)
+        (self.parameters_, self.aic_) = nelder_mead(
+            _arima_model_wrapper,
+            np.sum(self.model_[:3]),
+            self.differenced_data_,
+            self.model_,
+        )
+        (self.c_, self.phi_, self.theta_) = _extract_params(
+            self.parameters_, self.model_
+        )
+        (self.aic_, self.residuals_, self.fitted_values_) = _arima_model(
+            self.parameters_,
+            _calc_arima,
+            self.differenced_data_,
+            self.model_,
+            np.empty(0),
+        )
+        return self
+
+    def _predict(self, y=None, exog=None):
+        """
+        Predict the next horizon steps ahead.
+
+        Parameters
+        ----------
+        y : np.ndarray, default = None
+            A time series to predict the next horizon value for. If None,
+            predict the next horizon value after series seen in fit.
+        exog : np.ndarray, default =None
+            Optional exogenous time series data assumed to be aligned with y
+
+        Returns
+        -------
+        array[float]
+            Predictions len(y) steps ahead of the data seen in fit.
+            If y is None, then predict 1 step ahead of the data seen in fit.
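+
+        Notes
+        -----
+        Forecasts are made on the differenced series; the ``d_`` levels of
+        differencing are then inverted by repeated cumulative summation.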
+ """ + if y is not None: + combined_data = np.concatenate((self.data_, y.flatten())) + else: + combined_data = self.data_ + n = len(self.data_) + differenced_data = np.diff(combined_data, n=self.d_) + _aic, _residuals, predicted_values = _arima_model( + self.parameters_, + _calc_arima, + differenced_data, + self.model_, + self.residuals_, + ) + init = combined_data[n - self.d_ : n] + x = np.concatenate((init, predicted_values)) + for _ in range(self.d_): + x = np.cumsum(x) + return x[self.d_ :] + + +@njit(cache=True, fastmath=True) +def _aic(residuals, num_params): + """Calculate the log-likelihood of a model.""" + variance = np.mean(residuals**2) + liklihood = len(residuals) * (np.log(2 * np.pi) + np.log(variance) + 1) + return liklihood + 2 * num_params + + +@njit(cache=False, fastmath=True) +def _arima_model_wrapper(params, data, model): + return _arima_model(params, _calc_arima, data, model, np.empty(0))[0] + + +# Define the ARIMA(p, d, q) likelihood function +@njit(cache=True, fastmath=True) +def _arima_model(params, base_function, data, model, residuals): + """Calculate the log-likelihood of an ARIMA model given the parameters.""" + formatted_params = _extract_params(params, model) # Extract parameters + + # Initialize residuals + n = len(data) + m = len(residuals) + num_predictions = n - m + 1 + residuals = np.concatenate((residuals, np.zeros(num_predictions - 1))) + expect_full_history = m > 0 # I.e. we've been provided with some residuals + fitted_values = np.zeros(num_predictions) + for t in range(num_predictions): + fitted_values[t] = base_function( + data, + model, + m + t, + formatted_params, + residuals, + expect_full_history, + ) + if t != num_predictions - 1: + # Only calculate residuals for the predictions we have data for + residuals[m + t] = data[m + t] - fitted_values[t] + return _aic(residuals, len(params)), residuals, fitted_values + + +@njit(cache=True, fastmath=True) +def _extract_params(params, model): + """Extract ARIMA parameters from the parameter vector.""" + if len(params) != np.sum(model): + previous_length = np.sum(model) + model = model[:-1] # Remove the seasonal period + if len(params) != np.sum(model): + raise ValueError( + f"Expected {previous_length} parameters for a non-seasonal model or \ + {np.sum(model)} parameters for a seasonal model, got {len(params)}" + ) + starts = np.cumsum(np.concatenate((np.zeros(1, dtype=np.int32), model[:-1]))) + n = len(starts) + max_len = np.max(model) + result = np.full((n, max_len), np.nan, dtype=params.dtype) + for i in range(n): + length = model[i] + start = starts[i] + result[i, :length] = params[start : start + length] + return result + + +@njit(cache=True, fastmath=True) +def _calc_arima(data, model, t, formatted_params, residuals, expect_full_history=False): + """Calculate the ARIMA forecast for time t.""" + if len(model) != 3: + raise ValueError("Model must be of the form (c, p, q)") + p = model[1] + q = model[2] + if expect_full_history and (t - p < 0 or t - q < 0): + raise ValueError( + f"Insufficient data for ARIMA model at time {t}. " + f"Expected at least {p} past values for AR and {q} for MA." 
+        )
+    # AR part
+    phi = formatted_params[1, :p]
+    ar_term = 0 if (t - p) < 0 else np.dot(phi, data[t - p : t][::-1])
+
+    # MA part
+    theta = formatted_params[2, :q]
+    ma_term = 0 if (t - q) < 0 else np.dot(theta, residuals[t - q : t][::-1])
+
+    c = formatted_params[0, 0] if model[0] else 0
+    y_hat = c + ar_term + ma_term
+    return y_hat
diff --git a/aeon/forecasting/_auto_arima.py b/aeon/forecasting/_auto_arima.py
new file mode 100644
index 0000000000..44f0ee80f9
--- /dev/null
+++ b/aeon/forecasting/_auto_arima.py
@@ -0,0 +1,170 @@
+"""AutoARIMAForecaster.
+
+An implementation of the AutoARIMA forecasting algorithm.
+"""
+
+__maintainer__ = ["alexbanwell1", "TonyBagnall"]
+__all__ = ["AutoARIMAForecaster"]
+
+import numpy as np
+from numba import njit
+
+from aeon.forecasting import ARIMAForecaster
+from aeon.forecasting._arima import (
+    _arima_model,
+    _arima_model_wrapper,
+    _calc_arima,
+    _extract_params,
+)
+from aeon.utils.forecasting._hypo_tests import kpss_test
+from aeon.utils.optimisation._nelder_mead import nelder_mead
+
+
+class AutoARIMAForecaster(ARIMAForecaster):
+    """Automatic AutoRegressive Integrated Moving Average (AutoARIMA) forecaster.
+
+    Implements the Hyndman-Khandakar automatic ARIMA algorithm for time series
+    forecasting. The model automatically selects the orders of the (p, d, q)
+    components based on information criteria, such as AIC.
+
+    References
+    ----------
+    .. [1] R. J. Hyndman and G. Athanasopoulos,
+       Forecasting: Principles and Practice. OTexts, 2014.
+       https://otexts.com/fpp3/
+
+    Examples
+    --------
+    >>> from aeon.forecasting import AutoARIMAForecaster
+    >>> from aeon.datasets import load_airline
+    >>> y = load_airline()
+    >>> forecaster = AutoARIMAForecaster()
+    >>> forecaster.fit(y)
+    AutoARIMAForecaster()
+    >>> forecaster.predict()
+    476.5824781648738
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def _fit(self, y, exog=None):
+        """Fit AutoARIMA forecaster to series y.
+
+        Fit a forecaster to predict self.horizon steps ahead using y.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            A time series on which to learn a forecaster to predict horizon ahead
+        exog : np.ndarray, default =None
+            Optional exogenous time series data assumed to be aligned with y
+
+        Returns
+        -------
+        self
+            Fitted AutoARIMAForecaster.
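+
+        Notes
+        -----
+        The differencing order ``d_`` is selected by repeated KPSS tests;
+        ``p`` and ``q`` are then chosen by the stepwise search in
+        ``_auto_arima``, starting from four standard candidate models.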
+ """ + self.data_ = np.array(y.squeeze(), dtype=np.float64) + self.differenced_data_ = self.data_.copy() + self.d_ = 0 + while not kpss_test(self.differenced_data_)[1]: + self.differenced_data_ = np.diff(self.differenced_data_, n=1) + self.d_ += 1 + include_c = 1 if self.d_ == 0 else 0 + model_parameters = np.array( + [ + [include_c, 2, 2], + [include_c, 0, 0], + [include_c, 1, 0], + [include_c, 0, 1], + ] + ) + ( + self.model_, + self.parameters_, + self.aic_, + ) = _auto_arima( + self.differenced_data_, _arima_model_wrapper, model_parameters, 3 + ) + ( + constant_term_int, + self.p_, + self.q_, + ) = self.model_ + self.constant_term_ = constant_term_int == 1 + (self.c_, self.phi_, self.theta_) = _extract_params( + self.parameters_, self.model_ + ) + ( + self.aic_, + self.residuals_, + self.fitted_values_, + ) = _arima_model( + self.parameters_, + _calc_arima, + self.differenced_data_, + self.model_, + np.empty(0), + ) + return self + + +@njit(cache=True, fastmath=True) +def _auto_arima( + differenced_data, model_function, inital_model_parameters, num_model_params=3 +): + """ + Implement the Hyndman-Khandakar algorithm. + + For automatic ARIMA model selection. + """ + best_score = -1 + best_model = inital_model_parameters[0] + best_points = None + for model in inital_model_parameters: + points, aic = nelder_mead( + model_function, + np.sum(model[:num_model_params]), + differenced_data, + model, + ) + if (aic < best_score) or (best_score == -1): + best_score = aic + best_model = model + best_points = points + + while True: + better_model = False + for param_no in range(1, num_model_params): + for adjustment in [-1, 1]: + if (best_model[param_no] + adjustment) < 0: + continue + model = best_model.copy() + model[param_no] += adjustment + for constant_term in [0, 1]: + model[0] = constant_term + points, aic = nelder_mead( + model_function, + np.sum(model[:num_model_params]), + differenced_data, + model, + ) + if aic < best_score: + best_model = model.copy() + best_points = points + best_score = aic + better_model = True + if not better_model: + break + return ( + best_model, + best_points, + best_score, + ) diff --git a/aeon/forecasting/_auto_sarima.py b/aeon/forecasting/_auto_sarima.py new file mode 100644 index 0000000000..fbf6a5d3c0 --- /dev/null +++ b/aeon/forecasting/_auto_sarima.py @@ -0,0 +1,124 @@ +"""AutoSARIMAForecaster. + +An implementation of the Auto SARIMA forecasting algorithm. +""" + +__maintainer__ = ["alexbanwell1", "TonyBagnall"] +__all__ = ["AutoSARIMAForecaster"] + +import numpy as np + +from aeon.forecasting._arima import _arima_model, _extract_params +from aeon.forecasting._auto_arima import _auto_arima +from aeon.forecasting._sarima import ( + SARIMAForecaster, + _calc_sarima, + _sarima_model_wrapper, +) +from aeon.utils.forecasting._hypo_tests import kpss_test +from aeon.utils.forecasting._seasonality import calc_seasonal_period + +NOGIL = False +CACHE = True + + +class AutoSARIMAForecaster(SARIMAForecaster): + """Seasonal AutoRegressive Integrated Moving Average (SARIMA) forecaster. + + Implements the Hyndman-Khandakar automatic ARIMA algorithm for time series + forecasting with optional seasonal components. The model automatically selects + the orders of the non-seasonal (p, d, q) and seasonal (P, D, Q, m) components + based on information criteria, such as AIC. + + Parameters + ---------- + horizon : int, default=1 + The forecasting horizon, i.e., the number of steps ahead to predict. + + References + ---------- + .. [1] R. J. Hyndman and G. 
Athanasopoulos,
+       Forecasting: Principles and Practice. OTexts, 2014.
+       https://otexts.com/fpp3/
+
+    Examples
+    --------
+    >>> from aeon.forecasting import AutoSARIMAForecaster
+    >>> from aeon.datasets import load_airline
+    >>> y = load_airline()
+    >>> forecaster = AutoSARIMAForecaster()
+    >>> forecaster.fit(y)
+    AutoSARIMAForecaster()
+    >>> forecaster.predict()
+    450.748...
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def _fit(self, y, exog=None):
+        """Fit AutoSARIMA forecaster to series y.
+
+        Fit a forecaster to predict self.horizon steps ahead using y.
+
+        Parameters
+        ----------
+        y : np.ndarray
+            A time series on which to learn a forecaster to predict horizon ahead
+        exog : np.ndarray, default =None
+            Optional exogenous time series data assumed to be aligned with y
+
+        Returns
+        -------
+        self
+            Fitted AutoSARIMAForecaster.
+        """
+        self.data_ = np.array(y.squeeze(), dtype=np.float64)
+        self.seasonal_period_ = calc_seasonal_period(self.data_)
+        self.differenced_data_ = self.data_.copy()
+        self.d_ = 0
+        while not kpss_test(self.differenced_data_)[1]:
+            self.differenced_data_ = np.diff(self.differenced_data_, n=1)
+            self.d_ += 1
+        self.ds_ = 1 if self.seasonal_period_ > 1 else 0
+        if self.ds_:
+            self.differenced_data_ = (
+                self.differenced_data_[self.seasonal_period_ :]
+                - self.differenced_data_[: -self.seasonal_period_]
+            )
+        include_c = 1 if self.d_ == 0 else 0
+        model_parameters = np.array(
+            [
+                [include_c, 2, 2, 0, 0, self.seasonal_period_],
+                [include_c, 0, 0, 0, 0, self.seasonal_period_],
+                [include_c, 1, 0, 0, 0, self.seasonal_period_],
+                [include_c, 0, 1, 0, 0, self.seasonal_period_],
+            ]
+        )
+        (
+            self.model_,
+            self.parameters_,
+            self.aic_,
+        ) = _auto_arima(
+            self.differenced_data_, _sarima_model_wrapper, model_parameters, 5
+        )
+        (
+            constant_term_int,
+            self.p_,
+            self.q_,
+            self.ps_,
+            self.qs_,
+            self.seasonal_period_,
+        ) = self.model_
+        self.constant_term_ = constant_term_int == 1
+        (self.c_, self.phi_, self.theta_, self.phi_s_, self.theta_s_) = _extract_params(
+            self.parameters_, self.model_
+        )
+        (self.aic_, self.residuals_, self.fitted_values_) = _arima_model(
+            self.parameters_,
+            _calc_sarima,
+            self.differenced_data_,
+            self.model_,
+            np.empty(0),
+        )
+        return self
diff --git a/aeon/forecasting/_autoets.py b/aeon/forecasting/_autoets.py
new file mode 100644
index 0000000000..4f2410cb5e
--- /dev/null
+++ b/aeon/forecasting/_autoets.py
@@ -0,0 +1,478 @@
+"""AutoETS class.
+
+Extends the ETSForecaster to automatically calculate the smoothing parameters.
+"""
+
+__maintainer__ = []
+__all__ = ["AutoETSForecaster"]
+import numpy as np
+from numba import njit
+from scipy.optimize import minimize
+
+from aeon.forecasting._autoets_gradient_params import _calc_model_liklihood
+from aeon.forecasting._ets import _numba_fit, _predict
+from aeon.forecasting._utils import calc_seasonal_period
+from aeon.forecasting.base import BaseForecaster
+
+NOGIL = False
+CACHE = True
+
+
+class AutoETSForecaster(BaseForecaster):
+    """Automatic Exponential Smoothing forecaster.
+
+    An implementation of the exponential smoothing statistical forecasting algorithm.
+    Chooses between additive and multiplicative error models; none, additive,
+    and multiplicative (including damped) trend; and none, additive, and
+    multiplicative seasonality [1]_.
+
+    Parameters
+    ----------
+    method : str, default="internal_nelder_mead"
+        Optimisation method used to select the smoothing parameters. One of
+        "internal_nelder_mead", "internal_gradient", or a method name
+        accepted by scipy.optimize.minimize.
+    horizon : int, default = 1
+        The horizon to forecast to.
+
+    References
+    ----------
+    .. [1] R. J. Hyndman and G. Athanasopoulos,
+       Forecasting: Principles and Practice. Melbourne, Australia: OTexts, 2014.
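+       https://otexts.com/fpp3/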
+ + Examples + -------- + >>> from aeon.forecasting import AutoETSForecaster + >>> from aeon.datasets import load_airline + >>> y = load_airline() + >>> forecaster = AutoETSForecaster() + >>> forecaster.fit(y) + AutoETSForecaster() + >>> forecaster.predict() + array([407.74740434]) + """ + + def __init__( + self, + method="internal_nelder_mead", + horizon=1, + ): + self.method = method + self.forecast_val_ = 0.0 + self.level_ = 0.0 + self.trend_ = 0.0 + self.seasonality_ = None + self.alpha_ = 0 + self.beta_ = 0 + self.gamma_ = 0 + self.phi_ = 0 + self.error_type_ = 0 + self.trend_type_ = 0 + self.seasonality_type_ = 0 + self.seasonal_period_ = 0 + self.n_timepoints_ = 0 + self.avg_mean_sq_err_ = 0 + self.liklihood_ = 0 + self.k_ = 0 + self.aic_ = 0 + self.residuals_ = [] + self.fitted_values_ = [] + super().__init__(horizon=horizon, axis=1) + + def _fit(self, y, exog=None): + """Fit Auto Exponential Smoothing forecaster to series y. + + Fit a forecaster to predict self.horizon steps ahead using y. + + Parameters + ---------- + y : np.ndarray + A time series on which to learn a forecaster to predict horizon ahead + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + self + Fitted AutoETSForecaster. + """ + data = y.squeeze() + ( + self.error_type_, + self.trend_type_, + self.seasonality_type_, + self.seasonal_period_, + self.alpha_, + self.beta_, + self.gamma_, + self.phi_, + ) = auto_ets(data, self.method) + ( + self.level_, + self.trend_, + self.seasonality_, + self.n_timepoints_, + self.residuals_, + self.fitted_values_, + self.avg_mean_sq_err_, + self.liklihood_, + self.k_, + self.aic_, + ) = _numba_fit( + data, + self.error_type_, + self.trend_type_, + self.seasonality_type_, + self.seasonal_period_, + self.alpha_, + self.beta_, + self.gamma_, + self.phi_, + ) + return self + + def _predict(self, y=None, exog=None): + """ + Predict the next horizon steps ahead. + + Parameters + ---------- + y : np.ndarray, default = None + A time series to predict the next horizon value for. If None, + predict the next horizon value after series seen in fit. + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + float + single prediction self.horizon steps ahead of y. 
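+
+        Notes
+        -----
+        Uses the model form and smoothing parameters selected by
+        ``auto_ets`` during ``fit``.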
+ """ + if y is not None: + y = y.squeeze() + fitted_values = _predict( + self.error_type_, + self.trend_type_, + self.seasonality_type_, + self.alpha_, + self.beta_, + self.gamma_, + self.phi_, + self.level_, + self.trend_, + self.seasonality_, + self.horizon, + self.n_timepoints_, + self.seasonal_period_, + y, + ) + return fitted_values + + +def auto_ets(data, method="internal_nelder_mead"): + """Return the best ETS model based on the supplied data, and optimisation method.""" + if method == "internal_nelder_mead": + return auto_ets_nelder_mead(data) + elif method == "internal_gradient": + return auto_ets_gradient(data) + else: + return auto_ets_scipy(data, method) + + +def auto_ets_scipy(data, method): + """Calculate ETS model parameters based on scipy optimisation functions.""" + seasonal_period = calc_seasonal_period(data) + lowest_liklihood = -1 + best_model = None + for error_type in range(1, 3): + for trend_type in range(0, 3): + for seasonality_type in range(0, 2 * (seasonal_period != 1) + 1): + optimise_result = optimise_params_scipy( + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + method, + ) + alpha, beta, gamma = optimise_result.x + liklihood_ = optimise_result.fun + phi = 0.98 + if lowest_liklihood == -1 or lowest_liklihood > liklihood_: + lowest_liklihood = liklihood_ + best_model = ( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) + return best_model + + +def auto_ets_gradient(data): + """ + Calc model params using pytorch. + + Calculate ETS model parameters based on the + internal gradient-based approach using pytorch. + """ + seasonal_period = calc_seasonal_period(data) + lowest_liklihood = -1 + best_model = None + for error_type in range(1, 3): + for trend_type in range(0, 3): + for seasonality_type in range(0, 2 * (seasonal_period != 1) + 1): + (alpha, beta, gamma, phi, _residuals, liklihood_) = ( + _calc_model_liklihood( + data, error_type, trend_type, seasonality_type, seasonal_period + ) + ) + if lowest_liklihood == -1 or lowest_liklihood > liklihood_: + lowest_liklihood = liklihood_ + best_model = ( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) + return best_model + + +@njit(nogil=NOGIL, cache=CACHE) +def auto_ets_nelder_mead(data): + """Calculate model parameters based on the internal nelder-mead implementation.""" + seasonal_period = calc_seasonal_period(data) + lowest_aic = -1 + best_model = None + for error_type in range(1, 3): + for trend_type in range(0, 3): + for seasonality_type in range(0, 2 * (seasonal_period != 1) + 1): + if trend_type == 0: + phi = 1 + model_seasonal_period = seasonal_period + if seasonal_period < 1 or seasonality_type == 0: + model_seasonal_period = 1 + ([alpha, beta, gamma, phi], aic) = nelder_mead( + data, + error_type, + trend_type, + seasonality_type, + model_seasonal_period, + ) + if lowest_aic == -1 or lowest_aic > aic: + lowest_aic = aic + best_model = ( + error_type, + trend_type, + seasonality_type, + model_seasonal_period, + alpha, + beta, + gamma, + phi, + ) + return best_model + + +def optimise_params_scipy( + data, error_type, trend_type, seasonality_type, seasonal_period, method +): + """Optimise the ETS model parameters using the scipy algorithms.""" + + def run_ets_scipy(parameters): + alpha, beta, gamma, phi = parameters + if not ( + 0 <= alpha <= 1 and 0 <= beta <= 1 and 0 <= gamma <= 1 and 0 <= phi <= 1 + ): + return float("inf") + ( + _level, + _trend, + _seasonality, + 
_n_timepoints, + _residuals, + _fitted_values, + _avg_mean_sq_err, + _liklihood, + _k, + aic_, + ) = _numba_fit( + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) + return aic_ + + initial_points = [0.5, 0.5, 0.5, 0.5] + return minimize( + run_ets_scipy, initial_points, bounds=[[0, 1] for i in range(3)], method=method + ) + + +@njit(nogil=NOGIL, cache=CACHE) +def run_ets( + parameters, data, error_type, trend_type, seasonality_type, seasonal_period +): + """Create and fit an ETS model and return the liklihood.""" + alpha, beta, gamma, phi = parameters + if not ( + 0 <= alpha <= 1 + and 0 <= beta <= 1 + and 0 <= gamma <= 1 + and 0.8 <= phi <= 1 + and ( + data.min() > 0 + or (error_type != 2 and trend_type != 2 and seasonality_type != 2) + ) + ): + return np.finfo(np.float64).max + ( + _level, + _trend, + _seasonality, + _n_timepoints, + _residuals, + _fitted_values, + _avg_mean_sq_err, + _liklihood, + _k, + aic_, + ) = _numba_fit( + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) + return aic_ + + +@njit(nogil=NOGIL, cache=CACHE) +def nelder_mead( + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + tol=1e-6, + max_iter=500, +): + """Implement the nelder-mead optimisation algorithm.""" + points = np.array( + [ + [0.5, 0.5, 0.5, 0.9], + [0.6, 0.5, 0.5, 0.9], + [0.5, 0.6, 0.5, 0.9], + [0.5, 0.5, 0.6, 0.9], + [0.5, 0.5, 0.5, 0.95], + ] + ) + values = np.array( + [ + run_ets(v, data, error_type, trend_type, seasonality_type, seasonal_period) + for v in points + ] + ) + for _iteration in range(max_iter): + # Order simplex by function values + order = np.argsort(values) + points = points[order] + values = values[order] + + # Centroid of the best n points + centre_point = points[:-1].sum(axis=0) / len(points[:-1]) + + # Reflection + # centre + distance between centre and largest value + reflected_point = centre_point + (centre_point - points[-1]) + reflected_value = run_ets( + reflected_point, + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + ) + # if between best and second best, use reflected value + if values[0] <= reflected_value < values[-2]: + points[-1] = reflected_point + values[-1] = reflected_value + continue + # Expansion + # Otherwise if it is better than the best value + if reflected_value < values[0]: + expanded_point = centre_point + 2 * (reflected_point - centre_point) + expanded_value = run_ets( + expanded_point, + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + ) + # if less than reflected value use expanded, otherwise go back to reflected + if expanded_value < reflected_value: + points[-1] = expanded_point + values[-1] = expanded_value + else: + points[-1] = reflected_point + values[-1] = reflected_value + continue + # Contraction + # Otherwise if reflection is worse than all current values + contracted_point = centre_point - 0.5 * (centre_point - points[-1]) + contracted_value = run_ets( + contracted_point, + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + ) + # If contraction is better use that otherwise move to shrinkage + if contracted_value < values[-1]: + points[-1] = contracted_point + values[-1] = contracted_value + continue + + # Shrinkage + for i in range(1, len(points)): + points[i] = points[0] - 0.5 * (points[0] - points[i]) + values[i] = run_ets( + points[i], + data, + error_type, + trend_type, + seasonality_type, + seasonal_period, + ) + + 
# Convergence check + if np.max(np.abs(values - values[0])) < tol: + break + return points[0], values[0] diff --git a/aeon/forecasting/_autoets_gradient_params.py b/aeon/forecasting/_autoets_gradient_params.py new file mode 100644 index 0000000000..78b7109f7e --- /dev/null +++ b/aeon/forecasting/_autoets_gradient_params.py @@ -0,0 +1,301 @@ +"""AutoETSForecaster class. + +Extends the ETSForecaster to automatically calculate the smoothing parameters + +aeon enhancement proposal +https://github.com/aeon-toolkit/aeon/pull/2244/ + +""" + +__maintainer__ = [] +__all__ = [] + +import torch + +from aeon.forecasting._ets import ETSForecaster + +NONE = 0 +ADDITIVE = 1 +MULTIPLICATIVE = 2 + + +def _calc_model_liklihood( + data, error_type, trend_type, seasonality_type, seasonal_period +): + alpha, beta, gamma, phi = _optimise_parameters( + data, error_type, trend_type, seasonality_type, seasonal_period + ) + forecaster = ETSForecaster( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + 1, + ) + forecaster.fit(data) + return alpha, beta, gamma, phi, forecaster.residuals_, forecaster.liklihood_ + + +def _optimise_parameters( + data, error_type, trend_type, seasonality_type, seasonal_period +): + torch.autograd.set_detect_anomaly(True) + data = torch.tensor(data) + n_timepoints = len(data) + if seasonality_type == 0: + seasonal_period = 1 + level, trend, seasonality = _initialise( + trend_type, seasonality_type, seasonal_period, data + ) + alpha = torch.tensor(0.1, requires_grad=True) # Level smoothing + parameters = [alpha] + if trend_type == NONE: + beta = torch.tensor(0) # Trend smoothing + else: + beta = torch.tensor(0.05, requires_grad=True) # Trend smoothing + parameters.append(beta) + if seasonality_type == NONE: + gamma = torch.tensor(0) # Trend smoothing + else: + gamma = torch.tensor(0.05, requires_grad=True) # Seasonality smoothing + parameters.append(gamma) + phi = torch.tensor(0.98, requires_grad=True) # Damping factor + batch_size = len(data) # seasonal_period * 2 + num_batches = len(data) // batch_size + # residuals_ = torch.zeros(n_timepoints) # 1 Less residual than data points + optimizer = torch.optim.SGD([alpha, beta, gamma, phi], lr=0.01) + for _epoch in range(10): # number of epochs + for i in range(0, num_batches): + batch_of_data = data[i * batch_size : (i + 1) * batch_size] + liklihood_ = torch.tensor(0, dtype=torch.float64) + mul_liklihood_pt2 = torch.tensor(0, dtype=torch.float64) + for t, data_item in enumerate(batch_of_data): + # Calculate level, trend, and seasonal components + fitted_value, error, level, trend, seasonality[t % seasonal_period] = ( + _update_states( + error_type, + trend_type, + seasonality_type, + level, + trend, + seasonality[t % seasonal_period], + data_item, + alpha, + beta, + gamma, + phi, + ) + ) + liklihood_ += error * error + mul_liklihood_pt2 += torch.log(torch.abs(fitted_value)) + liklihood_ = (n_timepoints - seasonal_period) * torch.log(liklihood_) + if error_type == MULTIPLICATIVE: + liklihood_ += 2 * mul_liklihood_pt2 + liklihood_.backward() + optimizer.step() + optimizer.zero_grad() + # Impose sensible parameter limits + alpha = alpha.clone().detach().requires_grad_().clamp(0, 1) + if trend_type != NONE: + # Impose sensible parameter limits + beta = beta.clone().detach().requires_grad_().clamp(0, 1) + if seasonality_type != NONE: + # Impose sensible parameter limits + gamma = gamma.clone().detach().requires_grad_().clamp(0, 1) + # Impose sensible parameter limits + phi = 
phi.clone().detach().requires_grad_().clamp(0.1, 0.98) + level = level.clone().detach() + trend = trend.clone().detach() + seasonality = seasonality.clone().detach() + return alpha.item(), beta.item(), gamma.item(), phi.item() + + +def _predict( + trend_type, + seasonality_type, + level, + trend, + seasonality, + phi, + horizon, + n_timepoints, + seasonal_period, +): + # Generate forecasts based on the final values of level, trend, and seasonals + if phi == 1: # No damping case + phi_h = float(horizon) + else: + # Geometric series formula for calculating phi + phi^2 + ... + phi^h + phi_h = phi * (1 - phi**horizon) / (1 - phi) + seasonal_index = (n_timepoints + horizon) % seasonal_period + return _predict_value( + trend_type, seasonality_type, level, trend, seasonality[seasonal_index], phi_h + )[0] + + +def _initialise(trend_type, seasonality_type, seasonal_period, data): + """ + Initialize level, trend, and seasonality values for the ETS model. + + Parameters + ---------- + data : array-like + The time series data + (should contain at least two full seasons if seasonality is specified) + """ + # Initial Level: Mean of the first season + level = torch.mean(data[:seasonal_period]) + # Initial Trend + if trend_type == ADDITIVE: + # Average difference between corresponding points in the first two seasons + trend = torch.mean( + data[seasonal_period : 2 * seasonal_period] - data[:seasonal_period] + ) + elif trend_type == MULTIPLICATIVE: + # Average ratio between corresponding points in the first two seasons + trend = torch.mean( + data[seasonal_period : 2 * seasonal_period] / data[:seasonal_period] + ) + else: + # No trend + trend = torch.tensor(0) + # Initial Seasonality + if seasonality_type == ADDITIVE: + # Seasonal component is the difference + # from the initial level for each point in the first season + seasonality = data[:seasonal_period] - level + elif seasonality_type == MULTIPLICATIVE: + # Seasonal component is the ratio of each point in the first season + # to the initial level + seasonality = data[:seasonal_period] / level + else: + # No seasonality + seasonality = torch.zeros(1) + return level, trend, seasonality + + +def _update_states( + error_type, + trend_type, + seasonality_type, + curr_level, + curr_trend, + curr_seasonality, + data_item: int, + alpha, + beta, + gamma, + phi, +): + """ + Update level, trend, and seasonality components. + + Using state space equations for an ETS model. + + Parameters + ---------- + data_item: float + The current value of the time series. + seasonal_index: int + The index to update the seasonal component. 
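# [Editorial sketch, not part of the patch] A quick numeric check of the
# geometric-series identity used for phi_h above: for phi < 1,
# phi * (1 - phi**h) / (1 - phi) is the closed form of
# phi + phi**2 + ... + phi**h, the cumulative damped-trend weight over an
# h-step horizon. Toy values only.
phi, h = 0.9, 5
closed_form = phi * (1 - phi**h) / (1 - phi)
explicit_sum = sum(phi**k for k in range(1, h + 1))
assert abs(closed_form - explicit_sum) < 1e-12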
+ """ + # Retrieve the current state values + fitted_value, damped_trend, trend_level_combination = _predict_value( + trend_type, seasonality_type, curr_level, curr_trend, curr_seasonality, phi + ) + # Calculate the error term (observed value - fitted value) + if error_type == MULTIPLICATIVE: + error = data_item / fitted_value - 1 # Multiplicative error + else: + error = data_item - fitted_value # Additive error + # Update level + if error_type == MULTIPLICATIVE: + level = trend_level_combination.clone() * (1 + alpha.clone() * error.clone()) + trend = damped_trend.clone() * (1 + beta.clone() * error.clone()) + seasonality = curr_seasonality.clone() * (1 + gamma.clone() * error.clone()) + if seasonality_type == ADDITIVE: + # Add seasonality correction + level += alpha.clone() * error.clone() * curr_seasonality.clone() + seasonality += ( + gamma.clone() * error.clone() * trend_level_combination.clone() + ) + if trend_type == ADDITIVE: + trend += ( + (curr_level.clone() + curr_seasonality.clone()) + * beta.clone() + * error.clone() + ) + else: + trend += ( + (curr_seasonality.clone() / curr_level.clone()) + * beta.clone() + * error.clone() + ) + elif trend_type == ADDITIVE: + trend += curr_level.clone() * beta.clone() * error.clone() + else: + level_correction = 1 + trend_correction = 1 + seasonality_correction = 1 + if seasonality_type == MULTIPLICATIVE: + # Add seasonality correction + level_correction *= curr_seasonality.clone() + trend_correction *= curr_seasonality.clone() + seasonality_correction *= trend_level_combination.clone() + if trend_type == MULTIPLICATIVE: + trend_correction *= curr_level.clone() + level = ( + trend_level_combination.clone() + + alpha.clone() * error.clone() / level_correction + ) + trend = damped_trend.clone() + beta.clone() * error.clone() / trend_correction + seasonality = ( + curr_seasonality.clone() + + gamma.clone() * error.clone() / seasonality_correction + ) + return (fitted_value, error, level, trend, seasonality) + + +def _predict_value(trend_type, seasonality_type, level, trend, seasonality, phi): + """ + + Generate various useful values, including the next fitted value. + + Parameters + ---------- + trend : float + The current trend value for the model + level : float + The current level value for the model + seasonality : float + The current seasonality value for the model + phi : float + The damping parameter for the model + + Returns + ------- + fitted_value : float + single prediction based on the current state variables. + damped_trend : float + The damping parameter combined with the trend dependant on the model type + trend_level_combination : float + Combination of the trend and level based on the model type. 
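# [Editorial sketch, not part of the patch] One worked update step of the
# purely additive model, where every correction factor in _update_states
# above equals 1. Mirrors the additive branch with toy numbers only.
level, trend, season = 100.0, 2.0, -5.0        # current state
alpha, beta, gamma, phi = 0.4, 0.2, 0.3, 0.98  # smoothing parameters
y_t = 99.0                                     # new observation

damped_trend = trend * phi                     # 1.96
trend_level = level + damped_trend             # 101.96
fitted = trend_level + season                  # 96.96, one-step-ahead forecast
error = y_t - fitted                           # 2.04, additive error
new_level = trend_level + alpha * error        # 102.776
new_trend = damped_trend + beta * error        # 2.368
new_season = season + gamma * error            # -4.388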
+    """
+    # Apply damping parameter and
+    # calculate commonly used combination of trend and level components
+    if trend_type == MULTIPLICATIVE:
+        damped_trend = trend.clone() ** phi.clone()
+        trend_level_combination = level.clone() * damped_trend.clone()
+    else:  # Additive trend, if no trend, then trend = 0
+        damped_trend = trend.clone() * phi.clone()
+        trend_level_combination = level.clone() + damped_trend.clone()
+    # Calculate forecast (fitted value) based on the current components
+    if seasonality_type == MULTIPLICATIVE:
+        fitted_value = trend_level_combination.clone() * seasonality.clone()
+    else:  # Additive seasonality, if no seasonality, then seasonality = 0
+        fitted_value = trend_level_combination.clone() + seasonality.clone()
+    return fitted_value, damped_trend, trend_level_combination
diff --git a/aeon/forecasting/_compare_external_autoets.py b/aeon/forecasting/_compare_external_autoets.py
new file mode 100644
index 0000000000..feecab1a8f
--- /dev/null
+++ b/aeon/forecasting/_compare_external_autoets.py
@@ -0,0 +1,207 @@
+"""Compare AutoETS implementations from other packages."""
+
+# __maintainer__ = []
+# __all__ = []
+
+import math
+import time
+
+import matplotlib.pyplot as plt
+from sktime.forecasting.ets import AutoETS as sktime_AutoETS
+from statsforecast.models import AutoETS as sf_AutoETS
+from statsforecast.utils import AirPassengers as ap
+from statsforecast.utils import AirPassengersDF
+from statsmodels.tsa.exponential_smoothing.ets import ETSModel
+
+from aeon.forecasting._autoets import auto_ets
+from aeon.forecasting._ets import ETSForecaster
+
+plt.rcParams["figure.figsize"] = (12, 8)
+
+
+def test_other_forecasters():
+    """Compare external AutoETS implementations on AirPassengers."""
+    plt.plot(AirPassengersDF.ds, AirPassengersDF.y, label="Actual Values", color="blue")
+    # Statsmodels
+    start = time.perf_counter()
+    statsmodels_model = ETSModel(
+        ap,
+        error="mul",
+        trend=None,
+        damped_trend=False,
+        seasonal="mul",
+        seasonal_periods=12,
+    )
+    statsmodels_fit = statsmodels_model.fit(maxiter=10000)
+    end = time.perf_counter()
+    statsmodels_time = end - start
+    print(  # noqa
+        f"Statsmodels: Alpha: {statsmodels_fit.alpha}, "
+        f"Beta: {getattr(statsmodels_fit, 'beta', None)}, "
+        f"gamma: {statsmodels_fit.gamma}, "
+        f"phi: {getattr(statsmodels_fit, 'phi', None)}"
+    )
+    print(f"Statsmodels AIC: {statsmodels_fit.aic}")  # noqa
+    sm_internal_model = ETSForecaster(
+        2, 0, 2, 12, statsmodels_fit.alpha, 0, statsmodels_fit.gamma, 1
+    )
+    sm_internal_model.fit(ap)
+    print(f"Statsmodels AIC: {sm_internal_model.aic_}")  # noqa
+    plt.plot(
+        AirPassengersDF.ds,
+        statsmodels_fit.fittedvalues,
+        label="statsmodels fit",
+        color="green",
+    )
+    # Sktime
+    start = time.perf_counter()
+    sktime_model = sktime_AutoETS(auto=True, sp=12)
+    sktime_model.fit(ap)
+    end = time.perf_counter()
+    sktime_time = end - start
+    # pylint: disable=W0212
+    print(  # noqa
+        f"Sktime: Alpha: {sktime_model._fitted_forecaster.alpha}, "
+        f"Beta: {sktime_model._fitted_forecaster.beta}, "
+        f"gamma: {sktime_model._fitted_forecaster.gamma}, "
+        f"phi: {getattr(sktime_model._fitted_forecaster, 'phi', None)}"
+    )
+
+    if sktime_model._fitted_forecaster.error == "add":
+        sk_error = 1
+    elif sktime_model._fitted_forecaster.error == "mul":
+        sk_error = 2
+    else:
+        sk_error = 0
+    if sktime_model._fitted_forecaster.trend == "add":
+        sk_trend = 1
+    elif sktime_model._fitted_forecaster.trend == "mul":
+        sk_trend = 2
+    else:
+        sk_trend = 0
+    if sktime_model._fitted_forecaster.seasonal == "add":
+        sk_seasonal = 1
+    elif sktime_model._fitted_forecaster.seasonal == "mul":
+        sk_seasonal = 2
+    else:
+        sk_seasonal = 0
+    print(  # noqa
+        f"Error Type: 
{sk_error}, Trend Type: {sk_trend}, \ + Seasonality Type: {sk_seasonal}, Seasonal Period: {12}" + ) + print(f"Sktime AIC: {sktime_model._fitted_forecaster.aic}") # noqa + sk_internal_model = ETSForecaster( + sk_error, + sk_trend, + sk_seasonal, + 12, + sktime_model._fitted_forecaster.alpha, + sktime_model._fitted_forecaster.beta, + sktime_model._fitted_forecaster.gamma, + 1, + ) + sk_internal_model.fit(ap) + print(f"Sktime AIC: {sk_internal_model.aic_}") # noqa + plt.plot( + AirPassengersDF.ds, + sktime_model._fitted_forecaster.fittedvalues, + label="sktime fitted values", + color="red", + ) + # pylint: enable=W0212 + # internal + start = time.perf_counter() + ( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) = auto_ets(ap) + internal_model = ETSForecaster( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) + internal_model.fit(ap) + end = time.perf_counter() + internal_time = end - start + print( # noqa + f"Internal: Alpha: {internal_model.alpha}, Beta: {internal_model.beta}, \ + gamma: {internal_model.gamma}, phi: {internal_model.phi}" + ) + print( # noqa + f"Error Type: {internal_model.error_type}, \ + Trend Type: {internal_model.trend_type}, \ + Seasonality Type: {internal_model.seasonality_type}, \ + Seasonal Period: {internal_model.seasonal_period}" + ) + print(f"Internal AIC: {internal_model.aic_}") # noqa + plt.plot( + AirPassengersDF.ds[seasonal_period:], + internal_model.fitted_values_, + label="Internal fitted values", + color="black", + ) + # statsforecast + start = time.perf_counter() + sf_model = sf_AutoETS(season_length=12) + sf_model.fit(ap) + end = time.perf_counter() + statsforecast_time = end - start + print( # noqa + f"Statsforecast: Alpha: {sf_model.model_['par'][0]}, \ + Beta: {sf_model.model_['par'][1]}, gamma: {sf_model.model_['par'][2]}, \ + phi: {sf_model.model_['par'][3]}" + ) + print( # noqa + f"Statsforecast Model Type: {sf_model.model_['method']}, \ + AIC: {sf_model.model_['aic']}" + ) + sf_internal_model = ETSForecaster( + 2 if sf_model.model_["components"][0] == "M" else 1, + ( + 2 + if sf_model.model_["components"][1] == "M" + else 1 if sf_model.model_["components"][1] == "A" else 0 + ), + ( + 2 + if sf_model.model_["components"][2] == "M" + else 1 if sf_model.model_["components"][2] == "A" else 0 + ), + 12, + 0 if math.isnan(sf_model.model_["par"][0]) else sf_model.model_["par"][0], + 0 if math.isnan(sf_model.model_["par"][1]) else sf_model.model_["par"][1], + 0 if math.isnan(sf_model.model_["par"][2]) else sf_model.model_["par"][2], + 0 if math.isnan(sf_model.model_["par"][3]) else sf_model.model_["par"][3], + ) + sf_internal_model.fit(ap) + print(f"Statsforecast AIC: {sf_internal_model.aic_}") # noqa + plt.plot( + AirPassengersDF.ds, + sf_model.model_["fitted"], + label="statsforecast fitted values", + color="orange", + ) + print( # noqa + f"Statsmodels Time: {statsmodels_time}\ + Sktime Time: {sktime_time}\ + Internal Time: {internal_time}\ + Statsforecast Time: {statsforecast_time}" + ) # noqa + plt.ylabel("Air Passenger Numbers") + plt.grid() + plt.legend() + plt.show() + + +if __name__ == "__main__": + test_other_forecasters() diff --git a/aeon/forecasting/_ets.py b/aeon/forecasting/_ets.py index 2d3eac898b..5006e5f46e 100644 --- a/aeon/forecasting/_ets.py +++ b/aeon/forecasting/_ets.py @@ -44,6 +44,8 @@ class ETSForecaster(BaseForecaster): Seasonal smoothing parameter. 
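# [Editorial sketch, not part of the patch] The if/elif ladders above all
# encode the same mapping from component strings to the integer codes
# ETSForecaster uses (0 = none, 1 = additive, 2 = multiplicative).
# A hypothetical helper with identical behaviour:
def _component_code(component):
    return {"add": 1, "mul": 2}.get(component, 0)

assert _component_code("add") == 1
assert _component_code("mul") == 2
assert _component_code(None) == 0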
phi : float, default=0.99 Trend damping parameter (used only for damped trend models). + horizon : int, default=1 + Forecasting horizon (number of time steps ahead to predict). Attributes ---------- @@ -80,12 +82,14 @@ class ETSForecaster(BaseForecaster): >>> from aeon.datasets import load_airline >>> y = load_airline() >>> forecaster = ETSForecaster( - ... alpha=0.4, beta=0.2, gamma=0.5, phi=0.8, + ... alpha=0.4, beta=0.2, gamma=0.5, phi=0.8, horizon=1, ... error_type='additive', trend_type='multiplicative', ... seasonality_type='multiplicative', seasonal_period=4 ... ) - >>> forecaster.forecast(y) - 366.90200486015596 + >>> forecaster.fit(y) + ETSForecaster(...) + >>> forecaster.predict() + array([366.9020...) """ _tags = { @@ -102,6 +106,7 @@ def __init__( beta: float = 0.01, gamma: float = 0.01, phi: float = 0.99, + horizon: int = 1, ): self.alpha = alpha self.beta = beta @@ -125,7 +130,7 @@ def __init__( self.aic_ = 0 self.residuals_ = [] self.fitted_values_ = [] - super().__init__(horizon=1, axis=1) + super().__init__(horizon=horizon, axis=1) def _fit(self, y, exog=None): """Fit Exponential Smoothing forecaster to series y. @@ -193,24 +198,17 @@ def _get_int(x): self._gamma, self.phi, ) - self.forecast_ = _predict( - self._trend_type, - self._seasonality_type, - self.level_, - self.trend_, - self.seasonality_, - self.phi, - self.horizon, - self.n_timepoints_, - self._seasonal_period, - ) - return self - def _predict(self, y, exog=None): + def _predict(self, y=None, exog=None): """ Predict the next horizon steps ahead. + If given y, use y to update the model and continue to predict horizon + steps ahead. Assumes no gap in time between the y passed in fit and the + y passed here. Assumes that y is of the same frequency as y passed in fit. + If y is None, predict horizon steps ahead of the series seen in fit. + Parameters ---------- y : np.ndarray, default = None @@ -224,7 +222,25 @@ def _predict(self, y, exog=None): float single prediction self.horizon steps ahead of y. """ - return self.forecast_ + if y is not None: + y = y.squeeze() + fitted_values = _predict( + self._error_type, + self._trend_type, + self._seasonality_type, + self.alpha, + self._beta, + self._gamma, + self.phi, + self.level_, + self.trend_, + self.seasonality_, + self.horizon, + self.n_timepoints_, + self._seasonal_period, + y, + ) + return fitted_values def _initialise(self, data): """ @@ -315,31 +331,71 @@ def _numba_fit( @njit(fastmath=True, cache=True) def _predict( + error_type, trend_type, seasonality_type, + alpha, + beta, + gamma, + phi, level, trend, seasonality, - phi, horizon, n_timepoints, seasonal_period, + y=None, ): - # Generate forecasts based on the final values of level, trend, and seasonals + # Predict horizon steps ahead for each time point in y + if y is None: + y = np.zeros(shape=0, dtype=np.float64) + fitted_values_ = np.zeros(shape=(len(y) + 1), dtype=np.float64) if phi == 1: # No damping case phi_h = 1 else: # Geometric series formula for calculating phi + phi^2 + ... 
+ phi^h
     phi_h = phi * (1 - phi**horizon) / (1 - phi)
-    seasonal_index = (n_timepoints + horizon) % seasonal_period
-    return _predict_value(
+    for t, time_point in enumerate(y):
+        s_index = (n_timepoints + t) % seasonal_period
+        # Calculate level, trend, and seasonal components
+        (fitted_value, _error, level, trend, seasonality[s_index]) = _update_states(
+            error_type,
+            trend_type,
+            seasonality_type,
+            level,
+            trend,
+            seasonality[s_index],
+            time_point,
+            alpha,
+            beta,
+            gamma,
+            phi,
+        )
+        if horizon > 1:
+            forecast_s_index = (n_timepoints + t + horizon) % seasonal_period
+            # Generate forecasts for the horizon ahead
+            fitted_values_[t] = _predict_value(
+                trend_type,
+                seasonality_type,
+                level,
+                trend,
+                seasonality[forecast_s_index],
+                phi_h,
+            )[0]
+        else:
+            fitted_values_[t] = fitted_value
+    # Handle the final forecast value after the last time point in y
+    forecast_s_index = (n_timepoints + len(y) + horizon) % seasonal_period
+    # Generate forecasts based on the final values of level, trend, and seasonals
+    fitted_values_[-1] = _predict_value(
         trend_type,
         seasonality_type,
         level,
         trend,
-        seasonality[seasonal_index],
+        seasonality[forecast_s_index],
         phi_h,
     )[0]
+    return fitted_values_
 
 
 @njit(fastmath=True, cache=True)
@@ -418,6 +474,8 @@ def _update_states(
     )
     # Calculate the error term (observed value - fitted value)
     if error_type == 2:
+        if fitted_value == 0:
+            fitted_value = 1e-10  # Avoid division by zero
         error = data_item / fitted_value - 1  # Multiplicative error
     else:
         error = data_item - fitted_value  # Additive error
@@ -432,6 +490,8 @@
         if trend_type == 1:
             trend += (curr_level + curr_seasonality) * beta * error
         else:
+            if curr_level == 0:
+                curr_level = 1e-10  # Avoid division by zero
             trend += curr_seasonality / curr_level * beta * error
     elif trend_type == 1:
         trend += curr_level * beta * error
@@ -446,8 +506,14 @@
             seasonality_correction *= trend_level_combination
     if trend_type == 2:
         trend_correction *= curr_level
+    if level_correction == 0:
+        level_correction = 1e-10  # Avoid division by zero
     level = trend_level_combination + alpha * error / level_correction
+    if trend_correction == 0:
+        trend_correction = 1e-10  # Avoid division by zero
     trend = damped_trend + beta * error / trend_correction
+    if seasonality_correction == 0:
+        seasonality_correction = 1e-10  # Avoid division by zero
     seasonality = curr_seasonality + gamma * error / seasonality_correction
     return (fitted_value, error, level, trend, seasonality)
@@ -481,7 +547,11 @@ def _predict_value(trend_type, seasonality_type, level, trend, seasonality, phi)
     # Apply damping parameter and
     # calculate commonly used combination of trend and level components
     if trend_type == 2:  # Multiplicative
-        damped_trend = trend**phi
+        if trend <= 0 and phi < 1:
+            # Avoid NaNs from fractional powers of negative trends
+            damped_trend = -(np.abs(trend) ** phi)
+        else:
+            damped_trend = trend**phi
         trend_level_combination = level * damped_trend
     else:  # Additive trend, if no trend, then trend = 0
         damped_trend = trend * phi
diff --git a/aeon/forecasting/_plot_autoets_gradient_method.py b/aeon/forecasting/_plot_autoets_gradient_method.py
new file mode 100644
index 0000000000..4d405f8bc2
--- /dev/null
+++ b/aeon/forecasting/_plot_autoets_gradient_method.py
@@ -0,0 +1,66 @@
+"""Plot the AutoETS gradient-method fit against the actual values."""
+
+# __maintainer__ = []
+# __all__ = []
+
+import matplotlib.pyplot as plt
+from statsforecast.utils import AirPassengers as ap
+from statsforecast.utils import AirPassengersDF
+
+from aeon.forecasting._autoets import auto_ets
+from 
aeon.forecasting._ets import ETSForecaster + +plt.rcParams["figure.figsize"] = (12, 8) + + +def test_autoets_forecaster(): + """TestETSForecaster.""" + ( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + ) = auto_ets(ap, method="internal_gradient") + print( # noqa + f"Error Type: {error_type}, Trend Type: {trend_type}, \ + Seasonality Type: {seasonality_type}, Seasonal Period: {seasonal_period}, \ + Alpha: {alpha}, Beta: {beta}, Gamma: {gamma}, Phi: {phi}" + ) # noqa + etsforecaster = ETSForecaster( + error_type, + trend_type, + seasonality_type, + seasonal_period, + alpha, + beta, + gamma, + phi, + 1, + ) + etsforecaster.fit(ap) + print(f"liklihood: {etsforecaster.liklihood_}") # noqa + + # assert np.allclose([parameter.item() for parameter in parameters], + # [0.1,0.05,0.05,0.98]) + plt.plot(AirPassengersDF.ds, AirPassengersDF.y, label="Actual Values", color="blue") + plt.plot( + AirPassengersDF.ds, + etsforecaster.fitted_values_, + label="Predicted Values", + color="green", + ) + plt.plot( + AirPassengersDF.ds, etsforecaster.residuals_, label="Residuals", color="red" + ) + plt.ylabel("Air Passenger Numbers") + plt.grid() + plt.legend() + plt.show() + + +if __name__ == "__main__": + test_autoets_forecaster() diff --git a/aeon/forecasting/_sarima.py b/aeon/forecasting/_sarima.py new file mode 100644 index 0000000000..1729407127 --- /dev/null +++ b/aeon/forecasting/_sarima.py @@ -0,0 +1,238 @@ +"""SARIMAForecaster. + +An implementation of the Seasonal ARIMA forecasting algorithm. +""" + +__maintainer__ = ["alexbanwell1", "TonyBagnall"] +__all__ = ["SARIMAForecaster"] + +import numpy as np +from numba import njit + +from aeon.forecasting import ARIMAForecaster +from aeon.forecasting._arima import _arima_model, _calc_arima, _extract_params +from aeon.utils.optimisation._nelder_mead import nelder_mead + +NOGIL = False +CACHE = True + + +class SARIMAForecaster(ARIMAForecaster): + """Seasonal AutoRegressive Integrated Moving Average (SARIMA) forecaster. + + Parameters + ---------- + horizon : int, default=1 + The forecasting horizon, i.e., the number of steps ahead to predict. + + Attributes + ---------- + ps_, ds_, qs_ : int + Orders of the seasonal ARIMA model: seasonal AR (P), seasonal differencing (D), + and seasonal MA (Q) terms. + seasonal_period_ : int + Length of the seasonal cycle. + phi_s_ : array-like + Coefficients for the seasonal autoregressive terms. + theta_s_ : array-like + Coefficients for the seasonal moving average terms. + + References + ---------- + .. [1] R. J. Hyndman and G. Athanasopoulos, + Forecasting: Principles and Practice. OTexts, 2014. + https://otexts.com/fpp3/ + + Examples + -------- + >>> from aeon.forecasting import SARIMAForecaster + >>> from aeon.datasets import load_airline + >>> y = load_airline() + >>> forecaster = SARIMAForecaster(1,1,2,0,1,0,12,False) + >>> forecaster.fit(y) + SARIMAForecaster(d=1, ds=1, q=2) + >>> forecaster.predict() + 450.74876... + """ + + def __init__( + self, + p: int = 1, + d: int = 0, + q: int = 1, + ps: int = 0, + ds: int = 0, + qs: int = 0, + seasonal_period: int = 12, + constant_term: bool = False, + ): + super().__init__(p=p, d=d, q=q, constant_term=constant_term) + self.ps = ps + self.ds = ds + self.qs = qs + self.seasonal_period = seasonal_period + self.ps_ = 0 + self.ds_ = 0 + self.qs_ = 0 + self.seasonal_period_ = 0 + self.phi_s_ = 0 + self.theta_s_ = 0 + + def _fit(self, y, exog=None): + """Fit AutoARIMA forecaster to series y. 
+ + Fit a forecaster to predict self.horizon steps ahead using y. + + Parameters + ---------- + y : np.ndarray + A time series on which to learn a forecaster to predict horizon ahead + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + self + Fitted ARIMAForecaster. + """ + self.p_ = self.p + self.d_ = self.d + self.q_ = self.q + self.ps_ = self.ps + self.ds_ = self.ds + self.qs_ = self.qs + self.seasonal_period_ = self.seasonal_period + if self.seasonal_period_ == 1: + raise ValueError("Seasonal period must be greater than 1.") + self.constant_term_ = self.constant_term + self.data_ = np.array(y.squeeze(), dtype=np.float64) + self.model_ = np.array( + ( + 1 if self.constant_term else 0, + self.p, + self.q, + self.ps, + self.qs, + self.seasonal_period, + ), + dtype=np.int32, + ) + self.differenced_data_ = np.diff(self.data_, n=self.d_) + for _ds in range(self.ds_): + self.differenced_data_ = ( + self.differenced_data_[self.seasonal_period_ :] + - self.differenced_data_[: -self.seasonal_period_] + ) + (self.parameters_, self.aic_) = nelder_mead( + _sarima_model_wrapper, + np.sum(self.model_[:5]), + self.differenced_data_, + self.model_, + ) + (self.c_, self.phi_, self.theta_, self.phi_s_, self.theta_s_) = _extract_params( + self.parameters_, self.model_ + ) + (self.aic_, self.residuals_, self.fitted_values_) = _arima_model( + self.parameters_, + _calc_sarima, + self.differenced_data_, + self.model_, + np.empty(0), + ) + return self + + def _predict(self, y=None, exog=None): + """ + Predict the next horizon steps ahead. + + Parameters + ---------- + y : np.ndarray, default = None + A time series to predict the next horizon value for. If None, + predict the next horizon value after series seen in fit. + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + float + single prediction self.horizon steps ahead of y. 
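# [Editorial sketch, not part of the patch] How the model_ array above packs
# the SARIMA specification: (constant flag, p, q, P, Q, seasonal period).
# The nelder_mead call is handed np.sum(model_[:5]) free parameters, one per
# constant/AR/MA/seasonal-AR/seasonal-MA coefficient; the seasonal period
# itself is not optimised. Toy specification only.
import numpy as np

model_ = np.array((0, 1, 2, 0, 1, 12), dtype=np.int32)
n_params = int(np.sum(model_[:5]))  # 0 + 1 + 2 + 0 + 1 = 4 coefficients to fit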
+ """ + if y is not None: + combined_data = np.concatenate((self.data_, y.flatten())) + else: + combined_data = self.data_ + n = len(self.data_) + differenced_data = np.diff(combined_data, n=self.d_) + m = n - self.d_ + seasonal_differenced_data = differenced_data + for _ds in range(self.ds_): + seasonal_differenced_data = ( + seasonal_differenced_data[self.seasonal_period_ :] + - seasonal_differenced_data[: -self.seasonal_period_] + ) + _aic, _residuals, predicted_values = _arima_model( + self.parameters_, + _calc_sarima, + seasonal_differenced_data, + self.model_, + self.residuals_, + ) + # Undo seasonal differencing + last_season = differenced_data[m - self.seasonal_period_ * self.ds_ : m] + values = np.concatenate((last_season, predicted_values)) + for _ in range(self.ds_): + for i in range(self.seasonal_period_, len(values)): + values[i] += values[i - self.seasonal_period_] + values = values[self.seasonal_period_ * self.ds_ :] + # Undo ordinary differencing + init = self.data_[n - self.d_ : n] + values = np.concatenate((init, values)) + for _ in range(self.d_): + values = np.cumsum(values) + return values[self.d_ :] + + +@njit(cache=False, fastmath=True) +def _sarima_model_wrapper(params, data, model): + return _arima_model(params, _calc_sarima, data, model, np.empty(0))[0] + + +@njit(cache=True, fastmath=True) +def _calc_sarima(data, model, t, formatted_params, residuals, expect_full_history): + """Calculate the SARIMA forecast for time t.""" + if len(model) != 6: + raise ValueError("Model must be of the form (c, p, q, ps, qs, seasonal_period)") + ps = model[3] + qs = model[4] + seasonal_period = model[5] + if expect_full_history and ( + (t - seasonal_period * ps) < 0 or (t - seasonal_period * qs) < 0 + ): + raise ValueError( + f"Insufficient data for SARIMA model at time {t}. \ + Seasonal period is {seasonal_period}." + f"Expected at least {seasonal_period * ps} past \ + values for AR and {seasonal_period * qs} for MA." + ) + + arima_forecast = _calc_arima( + data, model[:3], t, formatted_params, residuals, expect_full_history + ) + # Seasonal AR part + phi_s = formatted_params[3, :ps] + ars_term = ( + 0 + if (t - seasonal_period * ps) < 0 + else np.dot(phi_s, data[t - seasonal_period * ps : t : seasonal_period][::-1]) + ) + # Seasonal MA part + theta_s = formatted_params[4, :qs] + mas_term = ( + 0 + if (t - seasonal_period * qs) < 0 + else np.dot( + theta_s, residuals[t - seasonal_period * qs : t : seasonal_period][::-1] + ) + ) + return arima_forecast + ars_term + mas_term diff --git a/aeon/forecasting/_sktime_autoets.py b/aeon/forecasting/_sktime_autoets.py new file mode 100644 index 0000000000..8852f9a7a9 --- /dev/null +++ b/aeon/forecasting/_sktime_autoets.py @@ -0,0 +1,72 @@ +"""SktimeAutoETS class. + +Wraps sktime AutoETS model for forecasting. + +""" + +__maintainer__ = [] +__all__ = ["SktimeAutoETSForecaster"] + +from sktime.forecasting.ets import AutoETS + +from aeon.forecasting._utils import calc_seasonal_period +from aeon.forecasting.base import BaseForecaster + + +class SktimeAutoETSForecaster(BaseForecaster): + """Automatic Exponential Smoothing forecaster from sktime. + + Parameters + ---------- + horizon : int, default = 1 + The horizon to forecast to. + """ + + def __init__( + self, + horizon=1, + ): + self.model_ = None + super().__init__(horizon=horizon, axis=1) + + def _fit(self, y, exog=None): + """Fit Auto Exponential Smoothing forecaster to series y. + + Fit a forecaster to predict self.horizon steps ahead using y. 
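# [Editorial sketch, not part of the patch] The inversion logic in the SARIMA
# _predict above, reduced to a round trip on a toy series: seasonal
# differencing is undone by adding back the value one season earlier, and
# ordinary differencing by cumulative summing from retained initial values.
import numpy as np

y = np.arange(1.0, 25.0)  # toy series
d, ds, m = 1, 1, 4        # ordinary order, seasonal order, seasonal period

diffed = np.diff(y, n=d)
for _ in range(ds):
    diffed = diffed[m:] - diffed[:-m]

# Undo seasonal differencing
restored = np.concatenate((np.diff(y, n=d)[:m], diffed))
for i in range(m, len(restored)):
    restored[i] += restored[i - m]
# Undo ordinary differencing
restored = np.concatenate((y[:d], restored))
for _ in range(d):
    restored = np.cumsum(restored)
assert np.allclose(restored, y)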
+ + Parameters + ---------- + y : np.ndarray + A time series on which to learn a forecaster to predict horizon ahead + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + self + Fitted AutoETSForecaster. + """ + data = y.squeeze() + season_length = calc_seasonal_period(data) + self.model_ = AutoETS(auto=True, sp=season_length) + self.model_.fit(data) + return self + + def _predict(self, y=None, exog=None): + """ + Predict the next horizon steps ahead. + + Parameters + ---------- + y : np.ndarray, default = None + A time series to predict the next horizon value for. If None, + predict the next horizon value after series seen in fit. + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + float + single prediction self.horizon steps ahead of y. + """ + return self.model_.predict(self.horizon, exog)[0][0] diff --git a/aeon/forecasting/_statsforecast_autoets.py b/aeon/forecasting/_statsforecast_autoets.py new file mode 100644 index 0000000000..d294315d44 --- /dev/null +++ b/aeon/forecasting/_statsforecast_autoets.py @@ -0,0 +1,72 @@ +"""StatsforecastAutoETS class. + +Wraps statsforecast AutoETS model for forecasting. + +""" + +__maintainer__ = [] +__all__ = ["StatsForecastAutoETSForecaster"] + +from statsforecast.models import AutoETS + +from aeon.forecasting._utils import calc_seasonal_period +from aeon.forecasting.base import BaseForecaster + + +class StatsForecastAutoETSForecaster(BaseForecaster): + """Automatic Exponential Smoothing forecaster from statsforecast. + + Parameters + ---------- + horizon : int, default = 1 + The horizon to forecast to. + """ + + def __init__( + self, + horizon=1, + ): + self.model_ = None + super().__init__(horizon=horizon, axis=1) + + def _fit(self, y, exog=None): + """Fit Auto Exponential Smoothing forecaster to series y. + + Fit a forecaster to predict self.horizon steps ahead using y. + + Parameters + ---------- + y : np.ndarray + A time series on which to learn a forecaster to predict horizon ahead + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + self + Fitted AutoETSForecaster. + """ + data = y.squeeze() + season_length = calc_seasonal_period(data) + self.model_ = AutoETS(season_length=season_length) + self.model_.fit(data) + return self + + def _predict(self, y=None, exog=None): + """ + Predict the next horizon steps ahead. + + Parameters + ---------- + y : np.ndarray, default = None + A time series to predict the next horizon value for. If None, + predict the next horizon value after series seen in fit. + exog : np.ndarray, default =None + Optional exogenous time series data assumed to be aligned with y + + Returns + ------- + float + single prediction self.horizon steps ahead of y. 
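# [Editorial sketch, not part of the patch] Hypothetical usage of this thin
# wrapper, assuming sktime is installed: the seasonal period is estimated
# from the data before the sktime AutoETS search runs.
import numpy as np
from aeon.forecasting._sktime_autoets import SktimeAutoETSForecaster

y = 10 + np.sin(np.arange(120) * 2 * np.pi / 12)
forecaster = SktimeAutoETSForecaster(horizon=1)
forecaster.fit(y)
prediction = forecaster.predict()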
+ """ + return self.model_.predict(self.horizon, exog)["mean"][0] diff --git a/aeon/forecasting/_time_autoets.py b/aeon/forecasting/_time_autoets.py new file mode 100644 index 0000000000..3d9e263e15 --- /dev/null +++ b/aeon/forecasting/_time_autoets.py @@ -0,0 +1,37 @@ +"""Test AutoETS.""" + +# __maintainer__ = [] +# __all__ = [] + +import timeit + +from statsforecast.utils import AirPassengers as ap + +from aeon.forecasting._autoets import nelder_mead, optimise_params_scipy + + +def test_optimise_params(): + nelder_mead(ap, 2, 2, 2, 12) + + +def test_optimise_params_scipy(): + optimise_params_scipy(ap, 2, 2, 2, 12, method="L-BFGS-B") + + +def test_autoets_forecaster(): + """TestETSForecaster.""" + for _i in range(20): + test_optimise_params() + test_optimise_params_scipy() + optim_ets_time = timeit.timeit(test_optimise_params, globals={}, number=1000) + print(f"Execution time Optimise params: {optim_ets_time} seconds") # noqa + optim_ets_scipy_time = timeit.timeit( + test_optimise_params_scipy, globals={}, number=1000 + ) + print( # noqa + f"Execution time Optimise params Scipy: {optim_ets_scipy_time} seconds" + ) + + +if __name__ == "__main__": + test_autoets_forecaster() diff --git a/aeon/forecasting/_utils.py b/aeon/forecasting/_utils.py new file mode 100644 index 0000000000..aeee0db3ae --- /dev/null +++ b/aeon/forecasting/_utils.py @@ -0,0 +1,115 @@ +""" +Forecasting utilities class. + +Contains useful utility methods for forecasting time series data. + +""" + +import numpy as np +from numba import njit + + +@njit(cache=True, fastmath=True) +def calc_seasonal_period(data): + """Estimate the seasonal period based on the autocorrelation of the series.""" + lags = _acf(data, 24) + lags = np.concatenate((np.array([1.0]), lags)) + peaks = [] + mean_lags = np.mean(lags) + for i in range(1, len(lags) - 1): # Skip the first (lag 0) and last elements + if lags[i] >= lags[i - 1] and lags[i] >= lags[i + 1] and lags[i] > mean_lags: + peaks.append(i) + if not peaks: + return 1 + else: + return peaks[0] + + +@njit(cache=True, fastmath=True) +def _acf(X, max_lag): + length = len(X) + X_t = np.zeros(max_lag, dtype=float) + for lag in range(1, max_lag + 1): + lag_length = length - lag + x1 = X[:-lag] + x2 = X[lag:] + s1 = np.sum(x1) + s2 = np.sum(x2) + m1 = s1 / lag_length + m2 = s2 / lag_length + ss1 = np.sum(x1 * x1) + ss2 = np.sum(x2 * x2) + v1 = ss1 - s1 * m1 + v2 = ss2 - s2 * m2 + v1_is_zero, v2_is_zero = v1 <= 1e-9, v2 <= 1e-9 + if v1_is_zero and v2_is_zero: # Both zero variance, + # so must be 100% correlated + X_t[lag - 1] = 1 + elif v1_is_zero or v2_is_zero: # One zero variance + # the other not + X_t[lag - 1] = 0 + else: + X_t[lag - 1] = np.sum((x1 - m1) * (x2 - m2)) / np.sqrt(v1 * v2) + return X_t + + +def kpss_test(y, regression="c", lags=None): # Test if time series is stationary + """ + Implement the KPSS test for stationarity. 
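# [Editorial sketch, not part of the patch] What calc_seasonal_period above
# does, restated with np.corrcoef standing in for the patch's _acf: compute
# autocorrelations up to lag 24, then return the first local peak above the
# mean. On a clean period-12 sine the first qualifying peak is lag 12.
import numpy as np

t = np.arange(120)
y = np.sin(2 * np.pi * t / 12)
acf = np.array([np.corrcoef(y[:-lag], y[lag:])[0, 1] for lag in range(1, 25)])
lags = np.concatenate((np.array([1.0]), acf))
peaks = [
    i
    for i in range(1, len(lags) - 1)
    if lags[i] >= lags[i - 1] and lags[i] >= lags[i + 1] and lags[i] > lags.mean()
]
period = peaks[0] if peaks else 1  # 12 for this series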
+ + Parameters + ---------- + y (array-like): Time series data + regression (str): 'c' for constant, 'ct' for constant + trend + lags (int): Number of lags for HAC variance estimation (default: sqrt(n)) + + Returns + ------- + kpss_stat (float): KPSS test statistic + stationary (bool): Whether the series is stationary according to the test + """ + y = np.asarray(y) + n = len(y) + + # Step 1: Fit regression model to estimate residuals + if regression == "c": # Constant + X = np.ones((n, 1)) + elif regression == "ct": # Constant + Trend + X = np.column_stack((np.ones(n), np.arange(1, n + 1))) + else: + raise ValueError("regression must be 'c' or 'ct'") + + beta = np.linalg.lstsq(X, y, rcond=None)[0] # Estimate regression coefficients + residuals = y - X @ beta # Get residuals (u_t) + + # Step 2: Compute cumulative sum of residuals (S_t) + S_t = np.cumsum(residuals) + + # Step 3: Estimate long-run variance (HAC variance) + if lags is None: + # lags = int(12 * (n / 100)**(1/4)) # Default statsmodels lag length + lags = int(np.sqrt(n)) # Default lag length + + gamma_0 = np.sum(residuals**2) / (n - X.shape[1]) # Lag-0 autocovariance + gamma = [np.sum(residuals[k:] * residuals[:-k]) / n for k in range(1, lags + 1)] + + # Bartlett weights + weights = [1 - (k / (lags + 1)) for k in range(1, lags + 1)] + + # Long-run variance + sigma_squared = gamma_0 + 2 * np.sum([w * g for w, g in zip(weights, gamma)]) + + # Step 4: Calculate the KPSS statistic + kpss_stat = np.sum(S_t**2) / (n**2 * sigma_squared) + + if regression == "ct": + # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t, + # where beta is the trend, r_t a random walk and e_t a stationary + # error term. + crit = 0.146 + else: # hypo == "c" + # special case of the model above, where beta = 0 (so the null + # hypothesis is that the data is stationary around r_0). 
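# [Editorial sketch, not part of the patch] Expected behaviour of kpss_test
# once this patch is applied: the null hypothesis is stationarity, so a level
# series should come back True and a strongly trending one False (statistic
# above the 0.463 critical value for the constant-only case). Outcomes on
# random data are probabilistic.
import numpy as np
from aeon.forecasting._utils import kpss_test

rng = np.random.default_rng(0)
stat_noise, noise_is_stationary = kpss_test(rng.normal(size=500))
stat_trend, trend_is_stationary = kpss_test(np.arange(500.0))
# Expect noise_is_stationary to be True and trend_is_stationary to be False.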
+        crit = 0.463
+
+    return kpss_stat, kpss_stat < crit
diff --git a/aeon/forecasting/_verify_arima.py b/aeon/forecasting/_verify_arima.py
new file mode 100644
index 0000000000..34758eb6eb
--- /dev/null
+++ b/aeon/forecasting/_verify_arima.py
@@ -0,0 +1,31 @@
+from pmdarima import auto_arima as pmd_auto_arima
+from statsforecast.utils import AirPassengers as ap
+from statsmodels.tsa.stattools import kpss
+
+from aeon.forecasting._arima import ARIMAForecaster, auto_arima, nelder_mead
+from aeon.forecasting._utils import kpss_test
+
+
+def test_arima():
+    model = pmd_auto_arima(
+        ap,
+        seasonal=True,
+        m=12,
+        trace=True,
+        error_action="ignore",
+        suppress_warnings=True,
+    )
+    print(model.summary())  # noqa
+    print(f"Optimal Model: {nelder_mead(ap, 2, 1, 1, 0, 1, 0, 12, True)}")  # noqa
+    print(model.predict(n_periods=1))  # noqa
+    print(kpss_test(ap))  # noqa
+    print(kpss(ap, regression="c", nlags=12))  # noqa
+    print(auto_arima(ap))  # noqa
+    forecaster = ARIMAForecaster()
+    forecaster.fit(ap)
+    print(forecaster.predict())  # noqa
+
+
+if __name__ == "__main__":
+    test_arima()
diff --git a/aeon/forecasting/_verify_ets.py b/aeon/forecasting/_verify_ets.py
new file mode 100644
index 0000000000..1e060143c3
--- /dev/null
+++ b/aeon/forecasting/_verify_ets.py
@@ -0,0 +1,299 @@
+"""Script to test ETS implementation against ETS implementations from other modules."""
+
+import random
+import time
+import timeit
+
+import numpy as np
+from statsforecast.utils import AirPassengers as ap
+
+import aeon.forecasting._ets as ets
+from aeon.forecasting import ETSForecaster
+
+NA = -99999.0
+MAX_NMSE = 30
+MAX_SEASONAL_PERIOD = 24
+
+
+def setup():
+    """Generate parameters required for ETS algorithms."""
+    y = ap
+    m = random.randint(2, 24)
+    error = random.randint(1, 2)
+    trend = random.randint(0, 2)
+    season = random.randint(0, 2)
+    alpha = round(random.random(), 4)
+    if alpha == 0:
+        alpha = round(random.random(), 4)
+    beta = round(random.random() * alpha, 4)  # 0 < beta < alpha
+    if beta == 0:
+        beta = round(random.random() * alpha, 4)
+    gamma = round(random.random() * (1 - alpha), 4)  # 0 < gamma < 1 - alpha
+    if gamma == 0:
+        gamma = round(random.random() * (1 - alpha), 4)
+    phi = round(
+        random.random() * 0.18 + 0.8, 4
+    )  # Common constraint for phi is 0.8 < phi < 0.98
+    return (y, m, error, trend, season, alpha, beta, gamma, phi)
+
+
+def statsmodels_version(
+    y: np.ndarray,
+    m: int,
+    f1: ETSForecaster,
+    errortype: int,
+    trendtype: int,
+    seasontype: int,
+    alpha: float,
+    beta: float,
+    gamma: float,
+    phi: float,
+):
+    """Hide the differences between different statsmodels versions."""
+    from statsmodels.tsa.holtwinters import ExponentialSmoothing
+
+    ets_model = ExponentialSmoothing(
+        y[m:],
+        trend="add" if trendtype == 1 else "mul" if trendtype == 2 else None,
+        damped_trend=(phi != 1 and trendtype != 0),
+        seasonal="add" if seasontype == 1 else "mul" if seasontype == 2 else None,
+        seasonal_periods=m if seasontype != 0 else None,
+        initialization_method="known",
+        initial_level=f1.level_,
+        initial_trend=f1.trend_ if trendtype != 0 else None,
+        initial_seasonal=f1.seasonality_ if seasontype != 0 else None,
+    )
+    results = ets_model.fit(
+        smoothing_level=alpha,
+        smoothing_trend=(
+            beta / alpha if trendtype != 0 else None
+        ),  # statsmodels uses beta*=beta/alpha
+        smoothing_seasonal=gamma if seasontype != 0 else None,
+        damping_trend=phi if trendtype != 0 else None,
+        optimized=False,
+    )
+    avg_mean_sq_err = results.sse / (len(y) - m)
+    # Back-calculate our log-likelihood 
proxy from AIC + if errortype == 1: + residuals = y[m:] - results.fittedvalues + assert np.allclose(residuals, results.resid) + else: + residuals = y[m:] / results.fittedvalues - 1 + return ( + (np.array([avg_mean_sq_err])), + residuals, + (results.aic - 2 * results.k + (len(y) - m) * np.log(len(y) - m)), + ) + + +def obscure_statsforecast_version( + y: np.ndarray, + m: int, + f1: ETSForecaster, + errortype: int, + trendtype: int, + seasontype: int, + alpha: float, + beta: float, + gamma: float, + phi: float, +): + """Hide the differences between different statsforecast versions.""" + init_state = np.zeros(len(y) * (1 + (trendtype > 0) + m * (seasontype > 0) + 1)) + init_state[0] = f1.level_ + init_state[1] = f1.trend_ + init_state[1 + (trendtype != 0) : m + 1 + (trendtype != 0)] = f1.seasonality_[::-1] + # from statsforecast.ets import pegelsresid_C + # amse, e, _x, lik = pegelsresid_C( + # y[m:], + # m, + # init_state, + # "A" if errortype == 1 else "M", + # "A" if trendtype == 1 else "M" if trendtype == 2 else "N", + # "A" if seasontype == 1 else "M" if seasontype == 2 else "N", + # phi != 1, + # alpha, + # beta, + # gamma, + # phi, + # nmse, + # ) + from statsforecast.ets import etscalc + + e = np.zeros(len(y)) + amse = np.zeros(MAX_NMSE) + lik = etscalc( + y[m:], + len(y) - m, + init_state, + m, + errortype, + trendtype, + seasontype, + alpha, + beta, + gamma, + phi, + e, + amse, + 1, + ) + return amse, e[:-m], lik + + +def test_ets_comparison(setup_func, random_seed, catch_errors): + """Run both our statsforecast and our implementation and crosschecks.""" + random.seed(random_seed) + ( + y, + m, + error, + trend, + season, + alpha, + beta, + gamma, + phi, + ) = setup_func() + # tsml-eval implementation + start = time.perf_counter() + f1 = ETSForecaster( + error, + trend, + season, + m, + alpha, + beta, + gamma, + phi, + 1, + ) + f1.fit(y) + end = time.perf_counter() + time_fitets = end - start + e_fitets = f1.residuals_ + amse_fitets = f1.avg_mean_sq_err_ + lik_fitets = f1.liklihood_ + f1 = ETSForecaster(error, trend, season, m, alpha, beta, gamma, phi, 1) + # pylint: disable=W0212 + f1._fit(y)._initialise(y) + # pylint: enable=W0212 + if season == 0: + m = 1 + # Nixtla/statsforcast implementation + start = time.perf_counter() + amse_etscalc, e_etscalc, lik_etscalc = statsmodels_version( + y, m, f1, error, trend, season, alpha, beta, gamma, phi + ) + end = time.perf_counter() + time_etscalc = end - start + amse_etscalc = amse_etscalc[0] + + if catch_errors: + try: + # Comparing outputs and runtime + assert np.allclose(e_fitets, e_etscalc), "Residuals Compare failed" + assert np.allclose(amse_fitets, amse_etscalc), "AMSE Compare failed" + assert np.isclose(lik_fitets, lik_etscalc), "Liklihood Compare failed" + return True + except AssertionError as e: + print(e) # noqa + print( # noqa + f"Seed: {random_seed}, Model: Error={error}, Trend={trend},\ + Seasonality={season}, seasonal period={m},\ + alpha={alpha}, beta={beta}, gamma={gamma}, phi={phi}" + ) + return False + else: + print( # noqa + f"Seed: {random_seed}, Model: Error={error}, Trend={trend},\ + Seasonality={season}, seasonal period={m}, alpha={alpha},\ + beta={beta}, gamma={gamma}, phi={phi}" + ) + diff_indices = np.where( + np.abs(e_fitets - e_etscalc) > 1e-3 * np.abs(e_etscalc) + 1e-2 + )[0] + for index in diff_indices: + print( # noqa + f"Index {index}: e_fitets = {e_fitets[index]},\ + e_etscalc = {e_etscalc[index]}" + ) + print(amse_fitets) # noqa + print(amse_etscalc) # noqa + print(lik_fitets) # noqa + print(lik_etscalc) 
# noqa + assert np.allclose(e_fitets, e_etscalc) + assert np.allclose(amse_fitets, amse_etscalc) + # assert np.isclose(lik_fitets, lik_etscalc) + print(f"Time for ETS: {time_fitets:0.20f}") # noqa + print(f"Time for statsforecast ETS: {time_etscalc}") # noqa + return True + + +def time_ets(): + """Test function for optimised numba ets algorithm.""" + ETSForecaster(2, 2, 2, 4).fit(ap).predict() + + +def time_sf(): + """Test function for statsforecast ets algorithm.""" + x = np.zeros(144 * 7) + x[0:6] = [122.75, 1.123230970596215, 0.91242363, 0.96130346, 1.07535642, 1.0509165] + obscure_statsforecast_version( + ap[4:], + 4, + x, + 2, + 2, + 2, + 0.1, + 0.01, + 0.01, + 0.99, + ) + + +def time_compare(random_seed): + """Compare timings of different ets algorithms.""" + random.seed(random_seed) + (y, m, error, trend, season, alpha, beta, gamma, phi) = setup() + # etsnoopt_time = timeit.timeit(time_etsnoopt, globals={}, number=10000) + # print (f"Execution time ETS No-opt: {etsnoopt_time} seconds") + # Do a few iterations to remove background/overheads. Makes comparison more reliable + for _i in range(10): + time_ets() + time_sf() + ets_time = timeit.timeit(time_ets, globals={}, number=1000) + print(f"Execution time ETS: {ets_time} seconds") # noqa + statsforecast_time = timeit.timeit(time_sf, globals={}, number=1000) + print(f"Execution time StatsForecast: {statsforecast_time} seconds") # noqa + ets_time = timeit.timeit(time_ets, globals={}, number=1000) + print(f"Execution time ETS: {ets_time} seconds") # noqa + statsforecast_time = timeit.timeit(time_sf, globals={}, number=1000) + print(f"Execution time StatsForecast: {statsforecast_time} seconds") # noqa + # _ets implementation + start = time.perf_counter() + f1 = ets.ETSForecaster(error, trend, season, m, alpha, beta, gamma, phi, 1) + f1.fit(y) + end = time.perf_counter() + ets_time = end - start + print(f"ETS Time: {ets_time}") # noqa + return ets_time + + +if __name__ == "__main__": + np.set_printoptions(threshold=np.inf) + test_ets_comparison(setup, 300, False) + SUCCESSES = True + for i in range(0, 300): + SUCCESSES &= test_ets_comparison(setup, i, True) + if SUCCESSES: + print("Test Completed Successfully with no errors") # noqa + # time_compare(300) + # avg_ets = 0 + # iterations = 100 + # for i in range (iterations): + # time_ets = time_compare(300) + # avg_ets += time_ets + # avg_ets/= iterations + # print(f"Avg ETS Time: {avg_ets},\ diff --git a/aeon/testing/testing_data.py b/aeon/testing/testing_data.py index 7d2186398a..8304fbdf34 100644 --- a/aeon/testing/testing_data.py +++ b/aeon/testing/testing_data.py @@ -23,6 +23,7 @@ make_example_multi_index_dataframe, ) from aeon.transformations.collection import BaseCollectionTransformer +from aeon.transformations.format import BaseFormatTransformer from aeon.transformations.series import BaseSeriesTransformer from aeon.utils.conversion import convert_collection @@ -876,6 +877,7 @@ def _get_task_for_estimator(estimator): or isinstance(estimator, BaseSeriesTransformer) or isinstance(estimator, BaseForecaster) or isinstance(estimator, BaseSeriesSimilaritySearch) + or isinstance(estimator, BaseFormatTransformer) ): data_label = "None" else: diff --git a/aeon/transformations/format/__init__.py b/aeon/transformations/format/__init__.py new file mode 100644 index 0000000000..9409e0c3a4 --- /dev/null +++ b/aeon/transformations/format/__init__.py @@ -0,0 +1,11 @@ +"""Format transformations.""" + +__all__ = [ + "SlidingWindowTransformer", + "TrainTestTransformer", + "BaseFormatTransformer", +] + 
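# [Editorial sketch, not part of the patch] The warm-up pattern used in
# time_compare above, isolated: call a numba-jitted function a few times
# before timing so JIT compilation and caching are excluded from the
# measurement. toy_fn is a stand-in function for this sketch.
import timeit

import numpy as np
from numba import njit


@njit(cache=True)
def toy_fn(x):
    return (x * x).sum()


data = np.arange(1000.0)
for _ in range(10):  # warm up: trigger JIT compilation before timing
    toy_fn(data)
timing = timeit.timeit(lambda: toy_fn(data), number=1000)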
+from aeon.transformations.format._sliding_window import SlidingWindowTransformer +from aeon.transformations.format._train_test import TrainTestTransformer +from aeon.transformations.format.base import BaseFormatTransformer diff --git a/aeon/transformations/format/_sliding_window.py b/aeon/transformations/format/_sliding_window.py new file mode 100644 index 0000000000..b173cb9ad2 --- /dev/null +++ b/aeon/transformations/format/_sliding_window.py @@ -0,0 +1,93 @@ +"""Sliding Window transformation.""" + +__maintainer__ = [] +__all__ = ["SlidingWindowTransformer"] + +import numpy as np + +from aeon.transformations.format.base import BaseFormatTransformer + + +class SlidingWindowTransformer(BaseFormatTransformer): + """ + Create windowed views of a series by extracting fixed-length overlapping segments. + + This transformer generates multiple subsequences (windows) of a specified width from + the input time series. Each window represents a shifted view of the series, moving + forward by one time step. + + Parameters + ---------- + window_size : int, optional (default=100) + The number of consecutive time points in each window. + + Notes + ----- + - The function assumes that `window_width` is smaller than the length of `series`. + + Examples + -------- + >>> import numpy as np + >>> from aeon.transformations.format import SlidingWindowTransformer + >>> X = np.array([1, 2, 3, 4, 5, 6]) + >>> transformer = SlidingWindowTransformer(3) + >>> Xt = transformer.fit_transform(X) + >>> print(Xt) + (array([[1., 2.], [2., 3.], [3., 4.], [4., 5.]]), + array([3., 4., 5., 6.]), array([0., 1., 2., 3.])) + + + Returns + ------- + X : np.ndarray (2D) + A numpy array where each element is a window (subsequence) of length + `window_width - 1` from the original series. + Y : np.ndarray (1D) + A numpy array containing the next value in the series for each window. + indices : list of int + A list of starting indices corresponding to each extracted window. + + """ + + _tags = { + "capability:multivariate": True, + "X_inner_type": "np.ndarray", + "fit_is_empty": True, + "output_data_type": "Tuple", + } + + def __init__(self, window_size: int = 100): + super().__init__(axis=1) + if window_size <= 1: + raise ValueError(f"window_size must be > 1, got {window_size}") + self.window_size = window_size + + def _transform(self, X, y=None): + """Transform X and return a transformed version. + + private _transform containing core logic, called from transform + + Parameters + ---------- + X : np.ndarray + The input time series from which windows will be created. 
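# [Editorial sketch, not part of the patch] The same windowing as the Examples
# output above, using numpy's sliding_window_view (numpy >= 1.20) instead of
# the explicit loop in _transform: each row pairs window_size - 1 past values
# with the next value as the regression target.
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
windows = sliding_window_view(X, 3)  # shape (4, 3)
X_t, Y_t = windows[:, :-1], windows[:, -1]
# X_t == [[1, 2], [2, 3], [3, 4], [4, 5]]; Y_t == [3, 4, 5, 6]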
+        y : ignored argument for interface compatibility
+            Additional data, e.g., labels for transformation
+
+        Returns
+        -------
+        Xt: 2D np.ndarray
+            transformed version of X
+        """
+        X = X[0]
+        # Generate windowed views of the input series
+        X_t = np.zeros((len(X) - self.window_size + 1, self.window_size - 1))
+        Y_t = np.zeros(len(X) - self.window_size + 1)
+        indices = np.zeros(len(X) - self.window_size + 1)
+        for i in range(len(X) - self.window_size + 1):
+            X_t[i] = X[
+                i : i + self.window_size - 1
+            ]  # Create a view from current index onward
+            Y_t[i] = X[i + self.window_size - 1]  # Next value
+            indices[i] = i
+        return X_t, Y_t, indices
diff --git a/aeon/transformations/format/_train_test.py b/aeon/transformations/format/_train_test.py
new file mode 100644
index 0000000000..0d31d48aa9
--- /dev/null
+++ b/aeon/transformations/format/_train_test.py
@@ -0,0 +1,93 @@
+"""Train/test split transformation."""
+
+__maintainer__ = []
+__all__ = ["TrainTestTransformer"]
+
+import math
+
+from aeon.transformations.format.base import BaseFormatTransformer
+
+
+class TrainTestTransformer(BaseFormatTransformer):
+    """
+    Convert a single time series into train/test sets.
+
+    This transformer assumes that the input contains only one time series.
+    It splits the series into training and testing sets based on
+    the specified proportion.
+
+    Parameters
+    ----------
+    train_proportion : float, optional (default=0.7)
+        The proportion of the time series to use for training,
+        with the remainder used for testing.
+    max_series_length : int, optional (default=10000)
+        The maximum length of the series to consider. If the series is longer
+        than this value, it will be truncated.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from aeon.transformations.format import TrainTestTransformer
+    >>> X = np.array([-3, -2, -1, 0, 1, 2, 3, 4])
+    >>> transformer = TrainTestTransformer(0.75)
+    >>> Xt = transformer.fit_transform(X)
+    >>> print(Xt)
+    (array([-3, -2, -1, 0, 1, 2]), array([3, 4]))
+
+    Returns
+    -------
+    tuple of np.ndarray
+        A tuple containing the training and testing sets.
+
+    """
+
+    _tags = {
+        "capability:multivariate": True,
+        "X_inner_type": "np.ndarray",
+        "fit_is_empty": True,
+        "output_data_type": "Tuple",
+    }
+
+    def __init__(
+        self, train_proportion: float = 0.7, max_series_length: int = 10000
+    ) -> None:
+        super().__init__(axis=1)
+        if train_proportion <= 0 or train_proportion >= 1:
+            raise ValueError(
+                f"train_proportion must be between 0 and 1, got {train_proportion}"
+            )
+        self.train_proportion = train_proportion
+        self.max_series_length = max_series_length
+
+    def _transform(self, X, y=None):
+        """Transform X and return a transformed version. 
+ + private _transform containing core logic, called from transform + + Parameters + ---------- + X : np.ndarray + Data to be transformed + y : ignored argument for interface compatibility + Additional data, e.g., labels for transformation + + Returns + ------- + Xt: 2D np.ndarray + transformed version of X + """ + X = X[0] + # Compute split index + if len(X) < self.max_series_length or self.max_series_length == -1: + end_location = len(X) + else: + end_location = self.max_series_length + train_test_split_location = math.ceil(end_location * self.train_proportion) + + # Split into train and test sets + train_series = X[:train_test_split_location] + test_series = X[train_test_split_location:end_location] + + # Generate windowed versions of train and test sets + return train_series, test_series diff --git a/aeon/transformations/format/base.py b/aeon/transformations/format/base.py new file mode 100644 index 0000000000..9047c667e1 --- /dev/null +++ b/aeon/transformations/format/base.py @@ -0,0 +1,301 @@ +"""Base class for Series transformers. + +class name: BaseSeriesTransformer + +Defining methods: +fitting - fit(self, X, y=None) +transform - transform(self, X, y=None) +fit & transform - fit_transform(self, X, y=None) +""" + +from abc import abstractmethod +from typing import final + +import numpy as np +import pandas as pd + +from aeon.base import BaseSeriesEstimator +from aeon.transformations.base import BaseTransformer + + +class BaseFormatTransformer(BaseSeriesEstimator, BaseTransformer): + """Transformer base class for collections.""" + + # tag values specific to SeriesTransformers + _tags = { + "input_data_type": "Series", + "output_data_type": "Tuple", + } + + @abstractmethod + def __init__(self, axis): + super().__init__(axis=axis) + + @final + def fit(self, X, y=None, axis=1): + """Fit transformer to X, optionally using y if supervised. + + State change: + Changes state to "fitted". + + Parameters + ---------- + X : Input data + Time series to fit transform to, of type ``np.ndarray``, ``pd.Series`` + ``pd.DataFrame``. + y : Target variable, default=None + Additional data, e.g., labels for transformation + axis : int, default = 1 + Axis of time in the input series. + If ``axis == 0``, it is assumed each column is a time series and each row is + a time point. i.e. the shape of the data is ``(n_timepoints, + n_channels)``. + ``axis == 1`` indicates the time series are in rows, i.e. the shape of + the data is ``(n_channels, n_timepoints)`.``axis is None`` indicates + that the axis of X is the same as ``self.axis``. + + Returns + ------- + self : a fitted instance of the estimator + """ + # skip the rest if fit_is_empty is True + if self.get_tag("fit_is_empty"): + self.is_fitted = True + return self + if self.get_tag("requires_y"): + if y is None: + raise ValueError("Tag requires_y is true, but fit called with y=None") + # reset estimator at the start of fit + self.reset() + X = self._preprocess_series(X, axis=axis, store_metadata=True) + if y is not None: + self._check_y(y) + self._fit(X=X, y=y) + self.is_fitted = True + return self + + @final + def transform(self, X, y=None, axis=1): + """Transform X and return a transformed version. + + State required: + Requires state to be "fitted". + + Parameters + ---------- + X : Input data + Data to fit transform to, of valid collection type. + y : Target variable, default=None + Additional data, e.g., labels for transformation + axis : int, default = 1 + Axis of time in the input series. 
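# [Editorial sketch, not part of the patch] Round-trip check for the split
# above: with train_proportion=0.75 and 8 points the split index is
# ceil(8 * 0.75) = 6, and concatenating the two pieces recovers the series.
import numpy as np
from aeon.transformations.format import TrainTestTransformer

X = np.array([-3, -2, -1, 0, 1, 2, 3, 4])
train, test = TrainTestTransformer(0.75).fit_transform(X)
assert np.array_equal(np.concatenate((train, test)), X)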
+ If ``axis == 0``, it is assumed each column is a time series and each row is + a time point. i.e. the shape of the data is ``(n_timepoints, + n_channels)``. + ``axis == 1`` indicates the time series are in rows, i.e. the shape of + the data is ``(n_channels, n_timepoints)`.``axis is None`` indicates + that the axis of X is the same as ``self.axis``. + + Returns + ------- + transformed version of X with the same axis as passed by the user, if axis + not None. + """ + # check whether is fitted + self._check_is_fitted() + X = self._preprocess_series(X, axis=axis, store_metadata=False) + Xt = self._transform(X, y) + return Xt + + @final + def fit_transform(self, X, y=None, axis=1): + """ + Fit to data, then transform it. + + Fits the transformer to X and y and returns a transformed version of X. + + Changes state to "fitted". Model attributes (ending in "_") : dependent on + estimator. + + Parameters + ---------- + X : Input data + Data to fit transform to, of valid collection type. + y : Target variable, default=None + Additional data, e.g., labels for transformation + axis : int, default = 1 + Axis of time in the input series. + If ``axis == 0``, it is assumed each column is a time series and each row is + a time point. i.e. the shape of the data is ``(n_timepoints, + n_channels)``. + ``axis == 1`` indicates the time series are in rows, i.e. the shape of + the data is ``(n_channels, n_timepoints)`.``axis is None`` indicates + that the axis of X is the same as ``self.axis``. + + Returns + ------- + transformed version of X with the same axis as passed by the user, if axis + not None. + """ + # input checks and datatype conversion, to avoid doing in both fit and transform + self.reset() + X = self._preprocess_series(X, axis=axis, store_metadata=True) + Xt = self._fit_transform(X=X, y=y) + self.is_fitted = True + return Xt + + @final + def inverse_transform(self, X, y=None, axis=1): + """Inverse transform X and return an inverse transformed version. + + State required: + Requires state to be "fitted". + + Parameters + ---------- + X : Input data + Data to fit transform to, of valid collection type. + y : Target variable, default=None + Additional data, e.g., labels for transformation + axis : int, default = 1 + Axis of time in the input series. + If ``axis == 0``, it is assumed each column is a time series and each row is + a time point. i.e. the shape of the data is ``(n_timepoints, + n_channels)``. + ``axis == 1`` indicates the time series are in rows, i.e. the shape of + the data is ``(n_channels, n_timepoints)`.``axis is None`` indicates + that the axis of X is the same as ``self.axis``. + + Returns + ------- + inverse transformed version of X + of the same type as X + """ + if not self.get_tag("capability:inverse_transform"): + raise NotImplementedError( + f"{type(self)} does not implement inverse_transform" + ) + + # check whether is fitted + self._check_is_fitted() + X = self._preprocess_series(X, axis=axis, store_metadata=False) + Xt = self._inverse_transform(X=X, y=y) + return Xt + + @final + def update(self, X, y=None, update_params=True, axis=1): + """Update transformer with X, optionally y. + + Parameters + ---------- + X : data to update of valid series type. + y : Target variable, default=None + Additional data, e.g., labels for transformation + update_params : bool, default=True + whether the model is updated. Yes if true, if false, simply skips call. + argument exists for compatibility with forecasting module. + axis : int, default=None + axis along which to update. 
+            Axis along which to update. If None, uses ``self.axis``.
+
+        Returns
+        -------
+        self : a fitted instance of the estimator
+        """
+        # check whether is fitted
+        self._check_is_fitted()
+        X = self._preprocess_series(X, axis, False)
+        return self._update(X=X, y=y, update_params=update_params)
+
+    def _fit(self, X, y=None):
+        """Fit transformer to X and y.
+
+        private _fit containing the core logic, called from fit
+
+        Parameters
+        ----------
+        X : Input data
+            Data to fit transform to, of valid series type.
+        y : Target variable, default=None
+            Additional data, e.g., labels for transformation
+
+        Returns
+        -------
+        self : a fitted instance of the estimator
+        """
+        # default fit is "no fitting happens"
+        return self
+
+    @abstractmethod
+    def _transform(self, X, y=None):
+        """Transform X and return a transformed version.
+
+        private _transform containing the core logic, called from transform
+
+        Parameters
+        ----------
+        X : Input data
+            Data to be transformed, of valid series type.
+        y : Target variable, default=None
+            Additional data, e.g., labels for transformation
+
+        Returns
+        -------
+        transformed version of X
+        """
+
+    def _fit_transform(self, X, y=None):
+        """Fit to data, then transform it.
+
+        Fits the transformer to X and y and returns a transformed version of X.
+
+        private _fit_transform containing the core logic, called from fit_transform.
+
+        Parameters
+        ----------
+        X : Input data
+            Data to fit transform to, of valid series type.
+        y : Target variable, default=None
+            Additional data, e.g., labels for transformation.
+
+        Returns
+        -------
+        transformed version of X.
+        """
+        # Non-optimized default implementation; override when a better
+        # method is possible for a given algorithm.
+        self._fit(X, y)
+        return self._transform(X, y)
+
+    def _inverse_transform(self, X, y=None):
+        """Inverse transform X and return an inverse transformed version.
+
+        private _inverse_transform containing core logic, called from
+        inverse_transform.
+
+        Parameters
+        ----------
+        X : Input data
+            Time series to be inverse transformed, of valid series type.
+        y : Target variable, default=None
+            Additional data, e.g., labels for transformation
+
+        Returns
+        -------
+        inverse transformed version of X, of the same type as X.
+ """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support inverse_transform" + ) + + def _update(self, X, y=None, update_params=True): + # standard behaviour: no update takes place, new data is ignored + return self + + def _check_y(self, y): + # Check y valid input for supervised transform + if not isinstance(y, (pd.Series, np.ndarray)): + raise TypeError( + f"y must be a np.array or a pd.Series, but found type: {type(y)}" + ) + if isinstance(y, np.ndarray) and y.ndim > 1: + raise TypeError(f"y must be 1-dimensional, found {y.ndim} dimensions") diff --git a/aeon/transformations/series/__init__.py b/aeon/transformations/series/__init__.py index d97d45c7ad..3f5da4acb8 100644 --- a/aeon/transformations/series/__init__.py +++ b/aeon/transformations/series/__init__.py @@ -5,6 +5,7 @@ "BaseSeriesTransformer", "ClaSPTransformer", "DFTSeriesTransformer", + "DifferencingSeriesTransformer", "Dobin", "ExpSmoothingSeriesTransformer", "GaussSeriesTransformer", diff --git a/aeon/transformations/series/_difference.py b/aeon/transformations/series/_difference.py new file mode 100644 index 0000000000..42addd377b --- /dev/null +++ b/aeon/transformations/series/_difference.py @@ -0,0 +1,52 @@ +"""Differencing transformations.""" + +__maintainer__ = ["TonyBagnall"] +__all__ = ["DifferencingSeriesTransformer"] + +from aeon.transformations.series.base import BaseSeriesTransformer + + +class DifferencingSeriesTransformer(BaseSeriesTransformer): + """Differencing transformations. + + This transformer returns the differenced series of the input time series. + The differenced series is obtained by subtracting the previous value + from the current value. + + Examples + -------- + >>> from aeon.transformations.series import DifferencingSeriesTransformer + >>> from aeon.datasets import load_airline + >>> y = load_airline() + >>> transformer = DifferencingSeriesTransformer() + >>> y_hat = transformer.fit_transform(y) + """ + + _tags = { + "X_inner_type": "np.ndarray", + "fit_is_empty": True, + } + + def __init__( + self, + ): + super().__init__(axis=1) + + def _transform(self, X, y=None): + """Transform X and return a transformed version. 
+
+        private _transform containing the core logic, called from transform
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Data to be transformed, shape (n_channels, n_timepoints)
+        y : ignored argument for interface compatibility
+            Additional data, e.g., labels for transformation
+
+        Returns
+        -------
+        np.ndarray
+            differenced version of the first channel of X, of length
+            n_timepoints - 1
+        """
+        X = X[0]  # univariate transformer: difference the first channel only
+        return X[1:] - X[:-1]
diff --git a/aeon/utils/base/_register.py b/aeon/utils/base/_register.py
index 5076bd79ee..7f322edf97 100644
--- a/aeon/utils/base/_register.py
+++ b/aeon/utils/base/_register.py
@@ -30,6 +30,7 @@
 from aeon.similarity_search.series import BaseSeriesSimilaritySearch
 from aeon.transformations.base import BaseTransformer
 from aeon.transformations.collection import BaseCollectionTransformer
+from aeon.transformations.format import BaseFormatTransformer
 from aeon.transformations.series import BaseSeriesTransformer
 
 # all base classes
@@ -51,6 +52,7 @@
     "forecaster": BaseForecaster,
     "regressor": BaseRegressor,
     "segmenter": BaseSegmenter,
+    "format-transformer": BaseFormatTransformer,
     "series-anomaly-detector": BaseSeriesAnomalyDetector,
     "series-similarity-search": BaseSeriesSimilaritySearch,
     "series-transformer": BaseSeriesTransformer,
diff --git a/aeon/utils/forecasting/__init__.py b/aeon/utils/forecasting/__init__.py
new file mode 100644
index 0000000000..a168fa0f11
--- /dev/null
+++ b/aeon/utils/forecasting/__init__.py
@@ -0,0 +1 @@
+"""Forecasting utils."""
diff --git a/aeon/utils/forecasting/_hypo_tests.py b/aeon/utils/forecasting/_hypo_tests.py
new file mode 100644
index 0000000000..2d581e971e
--- /dev/null
+++ b/aeon/utils/forecasting/_hypo_tests.py
@@ -0,0 +1,102 @@
+import numpy as np
+
+
+def kpss_test(y, regression="c", lags=None):
+    """
+    Perform the KPSS (Kwiatkowski-Phillips-Schmidt-Shin) test for stationarity.
+
+    The KPSS test evaluates the null hypothesis that a time series is
+    (trend or level) stationary against the alternative of a unit root
+    (non-stationarity). It can test for either stationarity around a
+    constant (level stationarity) or around a deterministic trend
+    (trend stationarity).
+
+    Parameters
+    ----------
+    y : array-like
+        Time series data to test for stationarity.
+    regression : str, default="c"
+        Indicates the null hypothesis for stationarity:
+        - "c" : Stationary around a constant (level stationarity)
+        - "ct" : Stationary around a constant and linear trend (trend stationarity)
+    lags : int or None, optional
+        Number of lags to use for the
+        HAC (heteroskedasticity and autocorrelation consistent) variance estimator.
+        If None, defaults to sqrt(n), where n is the sample size.
+
+    Returns
+    -------
+    kpss_stat : float
+        The KPSS test statistic.
+    stationary : bool
+        True if the series is judged stationary at the 5% significance level
+        (i.e., test statistic is below the critical value); False otherwise.
+
+    Notes
+    -----
+    - Uses asymptotic 5% critical values from Kwiatkowski et al. (1992): 0.463 for
+      level stationarity, 0.146 for trend stationarity.
+    - Returns True for stationary if the test statistic is below the 5% critical
+      value.
+
+    References
+    ----------
+    Kwiatkowski, D., Phillips, P.C.B., Schmidt, P., & Shin, Y. (1992).
+    "Testing the null hypothesis of stationarity against the alternative
+    of a unit root."
+    Journal of Econometrics, 54(1–3), 159–178.
+    https://doi.org/10.1016/0304-4076(92)90104-Y
+
+    Examples
+    --------
+    >>> from aeon.utils.forecasting._hypo_tests import kpss_test
+    >>> from aeon.datasets import load_airline
+    >>> y = load_airline()
+    >>> kpss_test(y)
+    (np.float64(1.1966313813...), np.False_)
+    """
+    y = np.asarray(y)
+    n = len(y)
+
+    # Step 1: Fit regression model to estimate residuals
+    if regression == "c":  # Constant
+        X = np.ones((n, 1))
+    elif regression == "ct":  # Constant + Trend
+        X = np.column_stack((np.ones(n), np.arange(1, n + 1)))
+    else:
+        raise ValueError("regression must be 'c' or 'ct'")
+
+    beta = np.linalg.lstsq(X, y, rcond=None)[0]  # Estimate regression coefficients
+    residuals = y - X @ beta  # Get residuals (u_t)
+
+    # Step 2: Compute cumulative sum of residuals (S_t)
+    S_t = np.cumsum(residuals)
+
+    # Step 3: Estimate long-run variance (HAC variance)
+    if lags is None:
+        # lags = int(12 * (n / 100)**(1/4))  # Default statsmodels lag length
+        lags = int(np.sqrt(n))  # Default lag length: sqrt(n)
+
+    gamma_0 = np.sum(residuals**2) / (n - X.shape[1])  # Lag-0 autocovariance
+    gamma = [np.sum(residuals[k:] * residuals[:-k]) / n for k in range(1, lags + 1)]
+
+    # Bartlett weights
+    weights = [1 - (k / (lags + 1)) for k in range(1, lags + 1)]
+
+    # Long-run variance
+    sigma_squared = gamma_0 + 2 * np.sum([w * g for w, g in zip(weights, gamma)])
+
+    # Step 4: Calculate the KPSS statistic
+    kpss_stat = np.sum(S_t**2) / (n**2 * sigma_squared)
+
+    # 5% critical values for KPSS test
+    if regression == "ct":
+        # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t,
+        # where beta is the trend, r_t a random walk and e_t a stationary
+        # error term.
+        crit = 0.146
+    else:  # regression == "c"
+        # special case of the model above, where beta = 0 (so the null
+        # hypothesis is that the data is stationary around r_0).
+        crit = 0.463
+
+    return kpss_stat, kpss_stat < crit
diff --git a/aeon/utils/forecasting/_seasonality.py b/aeon/utils/forecasting/_seasonality.py
new file mode 100644
index 0000000000..356b1a40d2
--- /dev/null
+++ b/aeon/utils/forecasting/_seasonality.py
@@ -0,0 +1,101 @@
+"""Seasonality Tools.
+
+Includes autocorrelation function (ACF) and seasonal period estimation.
+"""
+
+import numpy as np
+from numba import njit
+
+
+@njit(cache=True, fastmath=True)
+def acf(X, max_lag):
+    """
+    Compute the sample autocorrelation function (ACF) of a time series.
+
+    The ACF is computed for lags 1 through ``max_lag``.
+
+    The autocorrelation at lag k is defined as the Pearson correlation
+    coefficient between the series and a lagged version of itself.
+    If both segments at a given lag have zero variance, the function
+    returns 1 for that lag. If only one segment has zero variance,
+    the function returns 0.
+
+    Parameters
+    ----------
+    X : np.ndarray, shape (n_samples,)
+        The input time series data.
+    max_lag : int
+        The maximum lag (number of steps) for which to
+        compute the autocorrelation.
+
+    Returns
+    -------
+    acf_values : np.ndarray, shape (max_lag,)
+        The autocorrelation values for lags 1 through `max_lag`.
+
+    Notes
+    -----
+    The function handles cases where the lagged segments have zero
+    variance to avoid division by zero.
+    The returned values correspond to
+    lags 1, 2, ..., `max_lag` (not including lag 0).
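+
+    Examples
+    --------
+    A minimal sanity check: a noise-free sinusoid with period 12 is (up to
+    floating point error) perfectly correlated with itself at lag 12.
+
+    >>> import numpy as np
+    >>> from aeon.utils.forecasting._seasonality import acf
+    >>> X = np.sin(np.arange(48) * 2 * np.pi / 12)
+    >>> bool(acf(X, 12)[11] > 0.99)
+    True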
+ """ + length = len(X) + X_t = np.zeros(max_lag, dtype=float) + for lag in range(1, max_lag + 1): + lag_length = length - lag + x1 = X[:-lag] + x2 = X[lag:] + s1 = np.sum(x1) + s2 = np.sum(x2) + m1 = s1 / lag_length + m2 = s2 / lag_length + ss1 = np.sum(x1 * x1) + ss2 = np.sum(x2 * x2) + v1 = ss1 - s1 * m1 + v2 = ss2 - s2 * m2 + v1_is_zero, v2_is_zero = v1 <= 1e-9, v2 <= 1e-9 + if v1_is_zero and v2_is_zero: # Both zero variance, + # so must be 100% correlated + X_t[lag - 1] = 1 + elif v1_is_zero or v2_is_zero: # One zero variance + # the other not + X_t[lag - 1] = 0 + else: + X_t[lag - 1] = np.sum((x1 - m1) * (x2 - m2)) / np.sqrt(v1 * v2) + return X_t + + +@njit(cache=True, fastmath=True) +def calc_seasonal_period(data): + """ + Estimate the seasonal period of a time series using autocorrelation analysis. + + This function computes the autocorrelation function (ACF) of + the input series up to lag 24. It then identifies peaks in the + ACF above the mean value, treating the first such peak + as the estimated seasonal period. If no peak is found, + a period of 1 is returned. + + Parameters + ---------- + data : array-like, shape (n_samples,) + The input time series data. + + Returns + ------- + period : int + The estimated seasonal period (lag) of the series. Returns 1 if no significant + peak is detected in the autocorrelation. + """ + lags = acf(data, 24) + lags = np.concatenate((np.array([1.0]), lags)) + peaks = [] + mean_lags = np.mean(lags) + for i in range(1, len(lags) - 1): # Skip the first (lag 0) and last elements + if lags[i] >= lags[i - 1] and lags[i] >= lags[i + 1] and lags[i] > mean_lags: + peaks.append(i) + if not peaks: + return 1 + else: + return peaks[0] diff --git a/aeon/utils/optimisation/__init__.py b/aeon/utils/optimisation/__init__.py new file mode 100644 index 0000000000..11eddea791 --- /dev/null +++ b/aeon/utils/optimisation/__init__.py @@ -0,0 +1 @@ +"""Optimisation utils.""" diff --git a/aeon/utils/optimisation/_nelder_mead.py b/aeon/utils/optimisation/_nelder_mead.py new file mode 100644 index 0000000000..e59a70c5dd --- /dev/null +++ b/aeon/utils/optimisation/_nelder_mead.py @@ -0,0 +1,119 @@ +"""Optimisation algorithms for automatic parameter tuning.""" + +import numpy as np +from numba import njit + + +@njit(fastmath=True) +def nelder_mead( + loss_function, + num_params, + data, + model, + tol=1e-6, + max_iter=500, +): + """ + Perform optimisation using the Nelder–Mead simplex algorithm. + + This function minimises a given loss (objective) function using the Nelder–Mead + algorithm, a derivative-free method that iteratively refines a simplex of candidate + solutions. The implementation supports unconstrained minimisation of functions + with a fixed number of parameters. + + Parameters + ---------- + loss_function : callable + The objective function to minimise. Should accept a 1D NumPy array of length + `num_params` and return a scalar value. + num_params : int + The number of parameters (dimensions) in the optimisation problem. + data : np.ndarray + The input data used by the loss function. The shape and content depend on the + specific loss function being minimised. + model : np.ndarray + The model or context in which the loss function operates. This could be any + other object that the `loss_function` requires to compute its value. + The exact type and structure of `model` should be compatible with the + `loss_function`. + tol : float, optional (default=1e-6) + Tolerance for convergence. 
+        Tolerance for convergence. The algorithm stops when the maximum difference
+        between function values at simplex vertices is less than `tol`.
+    max_iter : int, optional (default=500)
+        Maximum number of iterations to perform.
+
+    Returns
+    -------
+    best_params : np.ndarray, shape (`num_params`,)
+        The parameter vector that minimises the loss function.
+    best_value : float
+        The value of the loss function at the optimal parameter vector.
+
+    Notes
+    -----
+    - The initial simplex is constructed by setting each parameter to 0.5,
+      with one additional point per dimension at 0.6 for that dimension.
+    - This implementation does not support constraints or bounds on the parameters.
+    - The algorithm does not guarantee finding a global minimum.
+
+    References
+    ----------
+    .. [1] Nelder, J. A. and Mead, R. (1965).
+       A Simplex Method for Function Minimization.
+       The Computer Journal, 7(4), 308–313.
+       https://doi.org/10.1093/comjnl/7.4.308
+    """
+    points = np.full((num_params + 1, num_params), 0.5)
+    for i in range(num_params):
+        points[i + 1][i] = 0.6
+    values = np.array([loss_function(v, data, model) for v in points])
+    for _iteration in range(max_iter):
+        # Order simplex by function values
+        order = np.argsort(values)
+        points = points[order]
+        values = values[order]
+
+        # Centroid of the best n points
+        centre_point = points[:-1].sum(axis=0) / len(points[:-1])
+
+        # Reflection
+        # centre + distance between centre and largest value
+        reflected_point = centre_point + (centre_point - points[-1])
+        reflected_value = loss_function(reflected_point, data, model)
+        # if no better than the best but better than the second worst,
+        # accept the reflected point
+        if len(values) > 1 and values[0] <= reflected_value < values[-2]:
+            points[-1] = reflected_point
+            values[-1] = reflected_value
+            continue
+        # Expansion
+        # Otherwise if it is better than the best value
+        if reflected_value < values[0]:
+            expanded_point = centre_point + 2 * (reflected_point - centre_point)
+            expanded_value = loss_function(expanded_point, data, model)
+            # if less than reflected value use expanded, otherwise go back to
+            # reflected
+            if expanded_value < reflected_value:
+                points[-1] = expanded_point
+                values[-1] = expanded_value
+            else:
+                points[-1] = reflected_point
+                values[-1] = reflected_value
+            continue
+        # Contraction
+        # Otherwise, if the reflected point is no better than the second worst
+        contracted_point = centre_point - 0.5 * (centre_point - points[-1])
+        contracted_value = loss_function(contracted_point, data, model)
+        # If contraction is better use that otherwise move to shrinkage
+        if contracted_value < values[-1]:
+            points[-1] = contracted_point
+            values[-1] = contracted_value
+            continue
+
+        # Shrinkage
+        for i in range(1, len(points)):
+            points[i] = points[0] - 0.5 * (points[0] - points[i])
+            values[i] = loss_function(points[i], data, model)
+
+        # Convergence check
+        if np.max(np.abs(values - values[0])) < tol:
+            break
+    return points[0], values[0]
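+
+
+# A minimal usage sketch (illustrative only; `_quadratic_loss` is a
+# hypothetical helper, not part of this module). Because `nelder_mead` is
+# numba-compiled, the loss passed in must itself be @njit-compiled and take
+# the (params, data, model) signature, even if some arguments go unused:
+#
+#     @njit(fastmath=True)
+#     def _quadratic_loss(params, data, model):
+#         return np.sum((params - data) ** 2)
+#
+#     target = np.array([0.25, 0.75])
+#     best_params, best_value = nelder_mead(_quadratic_loss, 2, target, target)
+#     # best_params is close to target and best_value is close to 0.0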