test case added for disabling outlier treatment

prasankh · prasankh · commit 02267d52c013 · 2024-03-26T09:18:54.000+05:30
diff --git a/ads/opctl/operator/lowcode/forecast/operator_config.py b/ads/opctl/operator/lowcode/forecast/operator_config.py
@@ -31,7 +31,7 @@ class DateTimeColumn(DataClassSerializable):
 
 @dataclass(repr=True)
 class PreprocessingSteps(DataClassSerializable):
-    """Class representing operator specification preprocessing details."""
+    """Class representing preprocessing steps for operator."""
 
     missing_value_imputation: bool = True
     outlier_treatment: bool = True
diff --git a/ads/opctl/operator/lowcode/forecast/schema.yaml b/ads/opctl/operator/lowcode/forecast/schema.yaml
@@ -295,14 +295,18 @@ spec:
           default: true
           meta:
             description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
-        missing_value_imputation:
-          type: boolean
-          required: false
-          default: true
-        outlier_detection:
-          type: boolean
+        steps:
+          type: dict
           required: false
-          default: true
+          schema:
+            missing_value_imputation:
+              type: boolean
+              required: false
+              default: true
+            outlier_treatment:
+              type: boolean
+              required: false
+              default: true
 
     generate_explanations:
       type: boolean
diff --git a/tests/operators/forecast/test_errors.py b/tests/operators/forecast/test_errors.py
@@ -26,7 +26,7 @@
     ForecastInputDataError,
 )
 from ads.opctl.operator.cmd import run
-
+import math
 
 NUM_ROWS = 1000
 NUM_SERIES = 10
@@ -172,8 +172,9 @@ def run_yaml(tmpdirname, yaml_i, output_data_path):
     run(yaml_i, backend="operator.local", debug=True)
     subprocess.run(f"ls -a {output_data_path}", shell=True)
 
-    test_metrics = pd.read_csv(f"{tmpdirname}/results/test_metrics.csv")
-    print(test_metrics)
+    if 'test_data' in yaml_i['spec']:
+        test_metrics = pd.read_csv(f"{tmpdirname}/results/test_metrics.csv")
+        print(test_metrics)
     train_metrics = pd.read_csv(f"{tmpdirname}/results/metrics.csv")
     print(train_metrics)
 
@@ -205,7 +206,8 @@ def populate_yaml(
     yaml_i["spec"]["datetime_column"]["name"] = "Date"
     yaml_i["spec"]["target_category_columns"] = ["Store"]
     yaml_i["spec"]["horizon"] = HORIZON
-    yaml_i["spec"]["preprocessing"] = preprocessing
+    if preprocessing:
+        yaml_i["spec"]["preprocessing"] = preprocessing
     if generate_train_metrics:
         yaml_i["spec"]["generate_metrics"] = generate_train_metrics
     if model == "autots":
@@ -431,6 +433,49 @@ def test_invalid_dates(operator_setup, model):
         )
 
 
+def test_disabling_outlier_treatment(operator_setup):
+    tmpdirname = operator_setup
+    NUM_ROWS = 100
+    hist_data_0 = pd.concat(
+        [
+            HISTORICAL_DATETIME_COL[: NUM_ROWS - HORIZON],
+            TARGET_COL[: NUM_ROWS - HORIZON],
+        ],
+        axis=1,
+    )
+    outliers = [1000, -800]
+    hist_data_0.at[40, 'Sales'] = outliers[0]
+    hist_data_0.at[75, 'Sales'] = outliers[1]
+    historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
+        tmpdirname, hist_data_0
+    )
+
+    yaml_i, output_data_path = populate_yaml(
+        tmpdirname=tmpdirname,
+        model="arima",
+        historical_data_path=historical_data_path
+    )
+    yaml_i["spec"].pop("target_category_columns")
+    yaml_i["spec"].pop("additional_data")
+
+    # running default pipeline where outlier will be treated
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_without_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")
+    input_vals_without_outlier = set(forecast_without_outlier['input_value'])
+    assert all(
+        item not in input_vals_without_outlier for item in outliers), "forecast file should not contain any outliers"
+
+    # switching off outlier_treatment
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
+    preprocessing = {"enabled": True, "steps": preprocessing_steps}
+    yaml_i["spec"]["preprocessing"] = preprocessing
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_with_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")
+    input_vals_with_outlier = set(forecast_with_outlier['input_value'])
+    assert all(
+        item in input_vals_with_outlier for item in outliers), "forecast file should contain all the outliers"
+
+
 @pytest.mark.parametrize("model", MODELS)
 def test_2_series(operator_setup, model):
     # Test w and w/o add data
@@ -456,7 +501,7 @@ def split_df(df):
     historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
         tmpdirname, hist_data, add_data, test_data
     )
-    preprocessing_steps = {"missing_value_imputation": True, "outlier_detection": False}
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
     yaml_i, output_data_path = populate_yaml(
         tmpdirname=tmpdirname,
         model=model,