#!/usr/bin/env python

# Copyright (c) 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
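
"""Smoke tests for the ADS forecast operator, driven by darts' example datasets.

Each case dumps a darts dataset to CSVs in a temporary directory, renders an
operator spec (forecast.yaml), executes it via the ``ads operator run`` CLI,
and reads back the metrics CSVs the operator writes to its output directory.
Running this file directly invokes the ad-hoc benchmark driver at the bottom.
"""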
from darts import datasets as d_datasets
import yaml
import tempfile
import subprocess
import pandas as pd
import pytest
from time import sleep, time
from copy import deepcopy
import random  # only referenced by the commented-out sampling options below
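
# Darts example datasets to exercise. The ETT and Electricity datasets are
# left commented out; they are considerably larger than the others.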
DATASETS_LIST = [
    "AirPassengersDataset",
    "AusBeerDataset",
    "AustralianTourismDataset",
    "ETTh1Dataset",
    # "ETTh2Dataset",
    # "ETTm1Dataset",
    # "ETTm2Dataset",
    # "ElectricityDataset",
    "EnergyDataset",
    "ExchangeRateDataset",
    "GasRateCO2Dataset",
    "HeartRateDataset",
    "ILINetDataset",
    "IceCreamHeaterDataset",
    "MonthlyMilkDataset",
    "MonthlyMilkIncompleteDataset",
    "SunspotsDataset",
    "TaylorDataset",
    "TemperatureDataset",
    "TrafficDataset",
    "USGasolineDataset",
    "UberTLCDataset",
    "WeatherDataset",
    "WineDataset",
    "WoolyDataset",
]
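
# Base operator spec; each test deep-copies it and fills in the None fields.
# The rendered forecast.yaml ends up looking roughly like this (illustrative
# values, not taken from a real run):
#
#   kind: operator
#   type: forecast
#   version: v1
#   spec:
#     historical_data:
#       url: /tmp/<tmpdir>/primary_data.csv
#     model: prophet
#     target_column: <first component name>
#     datetime_column:
#       name: <time index name>
#     horizon: 5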
TEMPLATE_YAML = {
    "kind": "operator",
    "type": "forecast",
    "version": "v1",
    "spec": {
        "historical_data": {
            "url": None,
        },
        "output_directory": {
            "url": "results",
        },
        "model": None,
        "target_column": None,
        "datetime_column": {
            "name": None,
        },
        "target_category_columns": [],
        "horizon": None,
        "generate_explanations": False,
    },
}

PERIODS = 5  # forecast horizon (also the size of the held-out test split)
MAX_ADDITIONAL_COLS = 3  # cap on additional (covariate) columns per dataset
SAMPLE_FRACTION = 1  # used only by the commented-out subsampling below

parameters_short = []
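
# Build the (model, dataset) parameter grid. Only one dataset is currently
# enabled via the [2:3] slice; widen the slice to broaden coverage.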
for dataset_name in DATASETS_LIST[2:3]:  # + [DATASETS_LIST[-2]]
    for model in [
        "arima",
        "automlx",
        "prophet",
        "neuralprophet",
        "autots",
        "auto",
    ]:
        parameters_short.append((model, dataset_name))
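
# One end-to-end operator run per (model, dataset) pair; the return value is
# ignored by pytest and only consumed by the __main__ benchmark driver below.
# To run a single case directly, something like:
#   python -m pytest <path to this file> -k automlx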
@pytest.mark.parametrize("model, dataset_name", parameters_short)
def test_load_datasets(model, dataset_name):
    dataset_i = getattr(d_datasets, dataset_name)().load()
    datetime_col = dataset_i.time_index.name

    columns = dataset_i.components
    target = dataset_i[columns[0]][:-PERIODS]  # history used for training
    test = dataset_i[columns[0]][-PERIODS:]  # held-out horizon

    print(dataset_name, len(columns), len(target))
    with tempfile.TemporaryDirectory() as tmpdirname:
        historical_data_path = f"{tmpdirname}/primary_data.csv"
        additional_data_path = f"{tmpdirname}/add_data.csv"
        test_data_path = f"{tmpdirname}/test_data.csv"
        output_data_path = f"{tmpdirname}/results"
        yaml_i = deepcopy(TEMPLATE_YAML)
        generate_train_metrics = True  # bool(random.getrandbits(1))

        # TODO: Open bug ticket so that series is not required
        df_i = target.pd_dataframe().reset_index()
        df_i["Series"] = "A"
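        # AustralianTourismDataset carries a plain integer index; automlx
        # needs real timestamps, so map each integer to a day-of-year date.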
        if model == "automlx" and dataset_name == "AustralianTourismDataset":
            df_i[datetime_col] = pd.to_datetime(
                [f"{x+1:03d}" for x in df_i[datetime_col]], format="%j"
            )

        df_i.to_csv(historical_data_path, index=False)
        # .sample(frac=SAMPLE_FRACTION).sort_values(by=datetime_col)

        test_df = test.pd_dataframe().reset_index()
        test_df["Series"] = "A"
        if model == "automlx" and dataset_name == "AustralianTourismDataset":
            test_df[datetime_col] = pd.to_datetime(
                [f"{x+1:03d}" for x in test_df[datetime_col]], format="%j"
            )
        test_df.to_csv(test_data_path, index=False)
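
        # For multivariate datasets, pass a few of the remaining components
        # along as additional (covariate) data spanning history plus horizon.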
        if len(columns) > 1:
            # up to MAX_ADDITIONAL_COLS covariate columns
            additional_cols = columns[1 : min(len(columns), MAX_ADDITIONAL_COLS + 1)]
            additional_data = dataset_i[list(additional_cols)]
            df_additional = additional_data.pd_dataframe().reset_index()
            df_additional["Series"] = "A"
            if model == "automlx" and dataset_name == "AustralianTourismDataset":
                df_additional[datetime_col] = pd.to_datetime(
                    [f"{x+1:03d}" for x in df_additional[datetime_col]], format="%j"
                )
            df_additional.to_csv(additional_data_path, index=False)
            yaml_i["spec"]["additional_data"] = {"url": additional_data_path}
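
        # Fill in the remainder of the operator spec.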
        yaml_i["spec"]["historical_data"]["url"] = historical_data_path
        yaml_i["spec"]["test_data"] = {"url": test_data_path}
        yaml_i["spec"]["output_directory"]["url"] = output_data_path
        yaml_i["spec"]["model"] = model
        yaml_i["spec"]["target_column"] = columns[0]
        yaml_i["spec"]["datetime_column"]["name"] = datetime_col
        yaml_i["spec"]["target_category_columns"] = ["Series"]
        yaml_i["spec"]["horizon"] = PERIODS
        # Request explanations whenever covariates exist (not for neuralprophet).
        if (
            yaml_i["spec"].get("additional_data") is not None
            and model != "neuralprophet"
        ):
            yaml_i["spec"]["generate_explanations"] = True
        if generate_train_metrics:
            yaml_i["spec"]["generate_metrics"] = generate_train_metrics
        # Constrain the slower frameworks so each run stays quick.
        if model == "autots":
            yaml_i["spec"]["model_kwargs"] = {"model_list": "superfast"}
        if model == "automlx":
            yaml_i["spec"]["model_kwargs"] = {"time_budget": 1}

        # Render the spec to YAML and invoke the operator through the ads CLI.
        forecast_yaml_filename = f"{tmpdirname}/forecast.yaml"
        with open(forecast_yaml_filename, "w") as f:
            f.write(yaml.dump(yaml_i))
        sleep(0.5)
        subprocess.run(
            f"ads operator run -f {forecast_yaml_filename} --debug",
            shell=True,
            check=True,  # surface operator failures instead of continuing silently
        )
        sleep(0.1)
        subprocess.run(f"ls -a {output_data_path}", shell=True)
        # if yaml_i["spec"]["generate_explanations"]:
        #     glb_expl = pd.read_csv(f"{output_data_path}/global_explanation.csv")
        #     print(glb_expl)
        #     loc_expl = pd.read_csv(f"{output_data_path}/local_explanation.csv")
        #     print(loc_expl)

        test_metrics = pd.read_csv(f"{output_data_path}/test_metrics.csv")
        print(test_metrics)
        train_metrics = pd.read_csv(f"{output_data_path}/metrics.csv")
        print(train_metrics)
        # "<target>_A" is the metric column for the single series "A".
        return test_metrics.iloc[0][f"{columns[0]}_A"]
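

# Ad-hoc benchmark driver: run a subset of (dataset, model) pairs outside of
# pytest, collect test metrics and wall-clock timings, and dump the metrics
# to a CSV.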
if __name__ == "__main__":
    failed_runs = []
    results = dict()
    timings = dict()
    for dataset_name in DATASETS_LIST[2:3]:  # random.sample(DATASETS_LIST, 2):
        results[dataset_name] = dict()
        timings[dataset_name] = dict()
        # ["arima", "automlx", "prophet", "neuralprophet", "autots", "auto"]
        for m in ["automlx"]:
            start_time = time()
            try:
                results[dataset_name][m] = test_load_datasets(
                    model=m, dataset_name=dataset_name
                )
            except Exception as e:
                print(f"Failed with the following error! {e}")
                failed_runs.append((dataset_name, m))
            elapsed = time() - start_time
            timings[dataset_name][m] = elapsed
    print(f"Failed Runs: {failed_runs}")
    print(f"results: {pd.DataFrame(results)}")
    print(f"timings: {timings}")
    pd.DataFrame(results).to_csv("~/Desktop/AUTO_benchmark_darts.csv")