diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py
index 137c37a..a51d6ec 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/model.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py
@@ -15,6 +15,7 @@
 #
 from typing import List, Optional
 
+import category_encoders
 import gluonts
 import mlflow
 import pandas as pd
@@ -29,6 +30,7 @@
 
 DEEPAR_ADDITIONAL_PIP_DEPS = [
     f"gluonts[torch]=={gluonts.__version__}",
     f"pandas=={pd.__version__}",
+    f"category_encoders=={category_encoders.__version__}",
     f"databricks-automl-runtime=={version.__version__}"
 ]
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
index d1712c4..e54726d 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
@@ -17,6 +17,7 @@
 from abc import abstractmethod
 from typing import List, Dict, Tuple, Optional, Union
 
+import category_encoders
 import pandas as pd
 import mlflow
 import pmdarima
@@ -34,6 +35,7 @@
 
 ARIMA_ADDITIONAL_PIP_DEPS = [
     f"pmdarima=={pmdarima.__version__}",
     f"pandas=={pd.__version__}",
+    f"category_encoders=={category_encoders.__version__}",
     f"databricks-automl-runtime=={version.__version__}"
 ]
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py
index 67b3386..77b510a 100644
--- a/runtime/databricks/automl_runtime/forecast/prophet/model.py
+++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py
@@ -19,6 +19,7 @@
 import mlflow
 import pandas as pd
 import prophet
+import category_encoders
 
 from mlflow.models.signature import ModelSignature
 from mlflow.utils.environment import _mlflow_conda_env
@@ -32,6 +33,7 @@
 
 PROPHET_ADDITIONAL_PIP_DEPS = [
     f"prophet=={prophet.__version__}",
     f"cloudpickle=={cloudpickle.__version__}",
+    f"category_encoders=={category_encoders.__version__}",
     f"databricks-automl-runtime=={version.__version__}",
 ]
diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
index bee823d..cfe111e 100644
--- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
@@ -352,3 +352,223 @@ def test_model_prediction_with_multiple_minutes_frequency(self, frequency_quanti
         self.assertEqual(pred_df.columns.tolist(), [time_col, "yhat"])
         self.assertEqual(len(pred_df), self.prediction_length)
         self.assertGreater(pred_df[time_col].min(), sample_input[time_col].max())
+
+
+class TestDeepARModelCategoryEncoders(unittest.TestCase):
+    """Test category_encoders dependency inclusion"""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        # Use the same setup as the main test class
+        cls.context_length = 5
+        cls.prediction_length = 5
+
+        # Create a simple mock network for testing
+        class MockNetwork(nn.Module):
+            def __init__(self, prediction_length: int, context_length: int) -> None:
+                super().__init__()
+                self.prediction_length = prediction_length
+                self.context_length = context_length
+                self.net = nn.Linear(context_length, prediction_length)
+
+            def forward(self, past_target):
+                out = self.net(past_target.float())
+                return out.unsqueeze(1)
+
+        cls.pred_net = MockNetwork(
+            prediction_length=cls.prediction_length, context_length=cls.context_length
+        )
+
+        cls.transformation = InstanceSplitter(
+            target_field=FieldName.TARGET,
+            is_pad_field=FieldName.IS_PAD,
+            start_field=FieldName.START,
+            forecast_start_field=FieldName.FORECAST_START,
+            instance_sampler=TestSplitSampler(),
+            past_length=cls.context_length,
+            future_length=cls.prediction_length,
+        )
+
+        cls.model = PyTorchPredictor(
+            prediction_length=cls.prediction_length,
+            input_names=["past_target"],
+            prediction_net=cls.pred_net,
+            batch_size=16,
+            input_transform=cls.transformation,
+            device="cpu",
+        )
+
+    def test_category_encoders_in_requirements(self):
+        """Test that category_encoders is included in model requirements"""
+        target_col = "sales"
+        time_col = "date"
+
+        deepar_model = DeepARModel(
+            model=self.model,
+            horizon=self.prediction_length,
+            frequency_unit="d",
+            frequency_quantity=1,
+            num_samples=1,
+            target_col=target_col,
+            time_col=time_col,
+        )
+
+        num_rows = 10
+        sample_input = pd.concat(
+            [
+                pd.to_datetime(
+                    pd.Series(range(num_rows), name=time_col).apply(
+                        lambda i: f"2020-10-{3 * i + 1}"
+                    )
+                ),
+                pd.Series(range(num_rows), name=target_col),
+            ],
+            axis=1,
+        )
+
+        with mlflow.start_run() as run:
+            mlflow_deepar_log_model(deepar_model, sample_input)
+
+        run_id = run.info.run_id
+
+        # Read requirements.txt from the run
+        requirements_path = mlflow.artifacts.download_artifacts(f"runs:/{run_id}/model/requirements.txt")
+        with open(requirements_path, "r") as f:
+            requirements = f.read()
+
+        # Verify category_encoders is included in requirements
+        self.assertIn("category_encoders", requirements, "category_encoders should be included in model requirements")
+
+        # Verify the specific version is included (from DEEPAR_ADDITIONAL_PIP_DEPS)
+        import category_encoders
+        expected_dep = f"category_encoders=={category_encoders.__version__}"
+        self.assertIn(expected_dep, requirements, f"Specific category_encoders version {expected_dep} should be in requirements")
+
+    def test_model_with_category_encoding_preprocessing(self):
+        """Test that models work correctly with potential category encoding preprocessing"""
+        target_col = "sales"
+        time_col = "date"
+
+        deepar_model = DeepARModel(
+            model=self.model,
+            horizon=self.prediction_length,
+            frequency_unit="d",
+            frequency_quantity=1,
+            num_samples=1,
+            target_col=target_col,
+            time_col=time_col,
+        )
+
+        # Create test data that could potentially use category encoding
+        num_rows = 10
+        sample_input = pd.concat(
+            [
+                pd.to_datetime(
+                    pd.Series(range(num_rows), name=time_col).apply(
+                        lambda i: f"2020-10-{3 * i + 1}"
+                    )
+                ),
+                pd.Series(range(num_rows), name=target_col),
+                pd.Series([f"category_{i % 3}" for i in range(num_rows)], name="category_col"),
+            ],
+            axis=1,
+        )
+
+        # This should work without errors if category_encoders is properly available
+        # Note: DeepAR doesn't directly use preprocessing functions like Prophet/ARIMA,
+        # but category_encoders might be used in data preparation pipelines
+        try:
+            import category_encoders as ce
+            # Test that we can import and use category_encoders
+            encoder = ce.BinaryEncoder(cols=['category_col'])
+            encoded_data = encoder.fit_transform(sample_input[['category_col']])
+            self.assertIsNotNone(encoded_data)
+        except ImportError:
+            self.fail("category_encoders should be available for DeepAR models")
+
+    def test_multiseries_model_with_category_encoding(self):
+        """Test that multi-series models work with category encoding"""
+        target_col = "sales"
+        time_col = "date"
+        id_col = "store"
+
+        deepar_model = DeepARModel(
+            model=self.model,
+            horizon=self.prediction_length,
+            num_samples=1,
+            frequency_unit="d",
+            frequency_quantity=1,
+            target_col=target_col,
+            time_col=time_col,
+            id_cols=[id_col],
+        )
+
+        num_rows_per_ts = 10
+        sample_input_base = pd.concat(
+            [
+                pd.to_datetime(
+                    pd.Series(range(num_rows_per_ts), name=time_col).apply(
+                        lambda i: f"2020-10-{3 * i + 1}"
+                    )
+                ),
+                pd.Series(range(num_rows_per_ts), name=target_col),
+                pd.Series([f"cat_{i % 2}" for i in range(num_rows_per_ts)], name="category_col"),
+            ],
+            axis=1,
+        )
+        sample_input = pd.concat([sample_input_base.copy(), sample_input_base.copy()], ignore_index=True)
+        sample_input[id_col] = [1] * num_rows_per_ts + [2] * num_rows_per_ts
+
+        # Test that category_encoders can be used with multi-series data
+        try:
+            import category_encoders as ce
+            encoder = ce.TargetEncoder(cols=['category_col'])
+            # Just test that we can create the encoder - actual fitting would need target data
+            self.assertIsNotNone(encoder)
+        except ImportError:
+            self.fail("category_encoders should be available for multi-series DeepAR models")
+
+    def test_category_encoders_version_compatibility(self):
+        """Test that the correct version of category_encoders is specified in dependencies"""
+        # Verify that category_encoders is in DEEPAR_ADDITIONAL_PIP_DEPS
+        category_encoders_deps = [dep for dep in DEEPAR_ADDITIONAL_PIP_DEPS if "category_encoders" in dep]
+        self.assertEqual(len(category_encoders_deps), 1, "category_encoders should be in DEEPAR_ADDITIONAL_PIP_DEPS")
+
+        # Verify the format includes version specification
+        category_encoders_dep = category_encoders_deps[0]
+        self.assertIn("==", category_encoders_dep, "category_encoders dependency should specify exact version")
+
+        # Verify it matches the currently installed version
+        import category_encoders
+        expected_dep = f"category_encoders=={category_encoders.__version__}"
+        self.assertEqual(category_encoders_dep, expected_dep,
+                         f"Dependency should match installed version: {expected_dep}")
+
+    def test_model_environment_includes_category_encoders(self):
+        """Test that the model environment includes category_encoders"""
+        target_col = "sales"
+        time_col = "date"
+
+        deepar_model = DeepARModel(
+            model=self.model,
+            horizon=self.prediction_length,
+            frequency_unit="d",
+            frequency_quantity=1,
+            num_samples=1,
+            target_col=target_col,
+            time_col=time_col,
+        )
+
+        # Get the model environment
+        model_env = deepar_model.model_env
+
+        # Navigate to pip dependencies: dependencies list -> find dict with 'pip' key -> get pip list
+        dependencies = model_env.get('dependencies', [])
+        pip_deps = []
+        for dep in dependencies:
+            if isinstance(dep, dict) and 'pip' in dep:
+                pip_deps = dep['pip']
+                break
+
+        category_encoders_found = any("category_encoders" in dep for dep in pip_deps)
+        self.assertTrue(category_encoders_found, "category_encoders should be in model environment pip dependencies")
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
index 9fe0499..3571452 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
@@ -785,3 +785,163 @@ def test_predict_with_preprocess(self):
         # Get the actual return value from the first call
         actual_return = self.mock_preprocess.side_effect(first_call_arg)
         pd.testing.assert_frame_equal(actual_return, expected_return)
+
+class TestArimaModelCategoryEncoders(unittest.TestCase):
+    """Test category_encoders dependency inclusion"""
+
+    def setUp(self) -> None:
+        num_rows = 9
+        self.df = pd.concat([
+            pd.to_datetime(pd.Series(range(num_rows), name="date").apply(lambda i: f"2020-10-{i + 1}")),
f"2020-10-{i + 1}")), + pd.Series(range(num_rows), name="y") + ], axis=1) + model = ARIMA(order=(2, 0, 2), suppress_warnings=True) + model.fit(self.df.set_index("date")) + self.pickled_model = pickle.dumps(model) + + def test_category_encoders_in_requirements(self): + """Test that category_encoders is included in model requirements""" + arima_model = ArimaModel( + self.pickled_model, + horizon=1, + frequency_unit='d', + frequency_quantity=1, + start_ds=pd.to_datetime("2020-10-01"), + end_ds=pd.to_datetime("2020-10-09"), + time_col="date" + ) + + with mlflow.start_run() as run: + mlflow_arima_log_model(arima_model) + + run_id = run.info.run_id + + # Read requirements.txt from the run + requirements_path = mlflow.artifacts.download_artifacts(f"runs:/{run_id}/model/requirements.txt") + with open(requirements_path, "r") as f: + requirements = f.read() + + # Verify category_encoders is included in requirements + self.assertIn("category_encoders", requirements, "category_encoders should be included in model requirements") + + # Verify the specific version is included (from ARIMA_ADDITIONAL_PIP_DEPS) + import category_encoders + expected_dep = f"category_encoders=={category_encoders.__version__}" + self.assertIn(expected_dep, requirements, f"Specific category_encoders version {expected_dep} should be in requirements") + + def test_model_with_category_encoding_preprocessing(self): + """Test that models work correctly with category encoding preprocessing functions""" + import category_encoders as ce + + def preprocess_func_with_category_encoding(df): + """Preprocessing function that uses category_encoders""" + # Simulate categorical encoding preprocessing + if 'category_col' in df.columns: + encoder = ce.BinaryEncoder(cols=['category_col']) + df = encoder.fit_transform(df) + return df + + arima_model = ArimaModel( + self.pickled_model, + horizon=1, + frequency_unit='d', + frequency_quantity=1, + start_ds=pd.to_datetime("2020-10-01"), + end_ds=pd.to_datetime("2020-10-09"), + time_col="date", + split_col="split", + preprocess_func=preprocess_func_with_category_encoding + ) + + # Test data with categorical column + test_df = pd.DataFrame({ + "date": [pd.to_datetime("2020-10-08"), pd.to_datetime("2020-10-10")], + "category_col": ["A", "B"] + }) + + # This should work without errors if category_encoders is properly available + yhat = arima_model.predict(context=None, model_input=test_df) + self.assertEqual(2, len(yhat)) + + def test_multiseries_model_with_category_encoding_preprocessing(self): + """Test that multi-series models work with category encoding preprocessing""" + import category_encoders as ce + + def preprocess_func_with_category_encoding(df): + """Preprocessing function that uses category_encoders for multi-series""" + if 'category_col' in df.columns: + # Use target encoder which is commonly used in multi-series scenarios + encoder = ce.TargetEncoder(cols=['category_col']) + # For this test, we'll just transform without fitting since we don't have a real target + df = df.copy() + df['category_col'] = df['category_col'].astype('category').cat.codes + return df + + pickled_model_dict = {("1",): self.pickled_model, ("2",): self.pickled_model} + start_ds_dict = {("1",): pd.Timestamp("2020-10-01"), ("2",): pd.Timestamp("2020-10-01")} + end_ds_dict = {("1",): pd.Timestamp("2020-10-09"), ("2",): pd.Timestamp("2020-10-09")} + + multiseries_arima_model = MultiSeriesArimaModel( + pickled_model_dict, + horizon=1, + frequency_unit='d', + frequency_quantity=1, + start_ds_dict=start_ds_dict, + 
+            end_ds_dict=end_ds_dict,
+            time_col="date",
+            id_cols=["id"],
+            split_col="split",
+            preprocess_func=preprocess_func_with_category_encoding
+        )
+
+        test_df = pd.DataFrame({
+            "date": [pd.to_datetime("2020-10-08"), pd.to_datetime("2020-10-10")],
+            "id": ["1", "2"],
+            "category_col": ["X", "Y"]
+        })
+
+        # This should work without errors if category_encoders is properly available
+        yhat = multiseries_arima_model.predict(context=None, model_input=test_df)
+        self.assertEqual(2, len(yhat))
+
+    def test_category_encoders_version_compatibility(self):
+        """Test that the correct version of category_encoders is specified in dependencies"""
+        # Verify that category_encoders is in ARIMA_ADDITIONAL_PIP_DEPS
+        category_encoders_deps = [dep for dep in ARIMA_ADDITIONAL_PIP_DEPS if "category_encoders" in dep]
+        self.assertEqual(len(category_encoders_deps), 1, "category_encoders should be in ARIMA_ADDITIONAL_PIP_DEPS")
+
+        # Verify the format includes version specification
+        category_encoders_dep = category_encoders_deps[0]
+        self.assertIn("==", category_encoders_dep, "category_encoders dependency should specify exact version")
+
+        # Verify it matches the currently installed version
+        import category_encoders
+        expected_dep = f"category_encoders=={category_encoders.__version__}"
+        self.assertEqual(category_encoders_dep, expected_dep,
+                         f"Dependency should match installed version: {expected_dep}")
+
+    def test_model_environment_includes_category_encoders(self):
+        """Test that the model environment includes category_encoders"""
+        arima_model = ArimaModel(
+            self.pickled_model,
+            horizon=1,
+            frequency_unit='d',
+            frequency_quantity=1,
+            start_ds=pd.to_datetime("2020-10-01"),
+            end_ds=pd.to_datetime("2020-10-09"),
+            time_col="date"
+        )
+
+        # Get the model environment
+        model_env = arima_model.model_env
+
+        # Navigate to pip dependencies: dependencies list -> find dict with 'pip' key -> get pip list
+        dependencies = model_env.get('dependencies', [])
+        pip_deps = []
+        for dep in dependencies:
+            if isinstance(dep, dict) and 'pip' in dep:
+                pip_deps = dep['pip']
+                break
+
+        category_encoders_found = any("category_encoders" in dep for dep in pip_deps)
+        self.assertTrue(category_encoders_found, "category_encoders should be in model environment pip dependencies")
diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
index 5ef5297..0ff68cc 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
@@ -552,4 +552,144 @@ def preprocess_func(df):
         # Check the second call
         second_call_df = calls[1][0][0]  # Get the DataFrame passed in the second call
         self.assertTrue((second_call_df["feature"] == [8, 10, 12]).all())
-        self.assertTrue((second_call_df["id"] == ["id2", "id2", "id2"]).all())
\ No newline at end of file
+        self.assertTrue((second_call_df["id"] == ["id2", "id2", "id2"]).all())
+
+class TestProphetModelCategoryEncoders(BaseProphetModelTest):
+    """Test category_encoders dependency inclusion"""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.model_json = PROPHET_MODEL_JSON
+
+    def test_category_encoders_in_requirements(self):
+        """Test that category_encoders is included in model requirements"""
+        prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds")
+
+        with mlflow.start_run() as run:
+            mlflow_prophet_log_model(prophet_model)
+
+        run_id = run.info.run_id
+
+        # Read requirements.txt from the run
+        requirements_path = mlflow.artifacts.download_artifacts(f"runs:/{run_id}/model/requirements.txt")
+        with open(requirements_path, "r") as f:
+            requirements = f.read()
+
+        # Verify category_encoders is included in requirements
+        self.assertIn("category_encoders", requirements, "category_encoders should be included in model requirements")
+
+        # Verify the specific version is included (from PROPHET_ADDITIONAL_PIP_DEPS)
+        import category_encoders
+        expected_dep = f"category_encoders=={category_encoders.__version__}"
+        self.assertIn(expected_dep, requirements, f"Specific category_encoders version {expected_dep} should be in requirements")
+
+    def test_model_with_category_encoding_preprocessing(self):
+        """Test that models work correctly with category encoding preprocessing functions"""
+        import category_encoders as ce
+
+        def preprocess_func_with_category_encoding(df):
+            """Preprocessing function that uses category_encoders"""
+            # Simulate categorical encoding preprocessing
+            if 'category_col' in df.columns:
+                encoder = ce.BinaryEncoder(cols=['category_col'])
+                df = encoder.fit_transform(df)
+            return df
+
+        prophet_model = ProphetModel(
+            model_json=self.model_json,
+            horizon=1,
+            frequency_unit="d",
+            frequency_quantity=1,
+            time_col="ds",
+            split_col="split",
+            preprocess_func=preprocess_func_with_category_encoding
+        )
+
+        # Test data with categorical column
+        test_df = pd.DataFrame({
+            "ds": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")],
+            "category_col": ["A", "B"],
+            "split": ["train", "test"]
+        })
+
+        # This should work without errors if category_encoders is properly available
+        yhat = prophet_model.predict(None, test_df)
+        self.assertEqual(2, len(yhat))
+
+    def test_multiseries_model_with_category_encoding_preprocessing(self):
+        """Test that multi-series models work with category encoding preprocessing"""
+        import category_encoders as ce
+
+        def preprocess_func_with_category_encoding(df):
+            """Preprocessing function that uses category_encoders for multi-series"""
+            if 'category_col' in df.columns:
+                # Use target encoder which is commonly used in multi-series scenarios
+                encoder = ce.TargetEncoder(cols=['category_col'])
+                # For this test, we'll just transform without fitting since we don't have a real target
+                df = df.copy()
+                df['category_col'] = df['category_col'].astype('category').cat.codes
+            return df
+
+        multi_series_model_json = {("1",): self.model_json, ("2",): self.model_json}
+        multi_series_start = {
+            ("1",): pd.Timestamp("2020-07-01"),
+            ("2",): pd.Timestamp("2020-07-01"),
+        }
+
+        prophet_model = MultiSeriesProphetModel(
+            model_json=multi_series_model_json,
+            timeseries_starts=multi_series_start,
+            timeseries_end="2020-07-25",
+            horizon=1,
+            frequency_unit="days",
+            frequency_quantity=1,
+            time_col="ds",
+            id_cols=["id"],
+            split_col="split",
+            preprocess_func=preprocess_func_with_category_encoding
+        )
+
+        test_df = pd.DataFrame({
+            "ds": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-02")],
+            "id": ["1", "2"],
+            "category_col": ["X", "Y"],
+            "split": ["train", "test"]
+        })
+
+        # This should work without errors if category_encoders is properly available
+        yhat = prophet_model.predict(None, test_df)
+        self.assertEqual(2, len(yhat))
+
+    def test_category_encoders_version_compatibility(self):
+        """Test that the correct version of category_encoders is specified in dependencies"""
+        # Verify that category_encoders is in PROPHET_ADDITIONAL_PIP_DEPS
+        category_encoders_deps = [dep for dep in PROPHET_ADDITIONAL_PIP_DEPS if "category_encoders" in dep]
+        self.assertEqual(len(category_encoders_deps), 1, "category_encoders should be in PROPHET_ADDITIONAL_PIP_DEPS")
+
+        # Verify the format includes version specification
+        category_encoders_dep = category_encoders_deps[0]
+        self.assertIn("==", category_encoders_dep, "category_encoders dependency should specify exact version")
+
+        # Verify it matches the currently installed version
+        import category_encoders
+        expected_dep = f"category_encoders=={category_encoders.__version__}"
+        self.assertEqual(category_encoders_dep, expected_dep,
+                         f"Dependency should match installed version: {expected_dep}")
+
+    def test_model_environment_includes_category_encoders(self):
+        """Test that the model environment includes category_encoders"""
+        prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds")
+
+        # Get the model environment
+        model_env = prophet_model.model_env
+
+        # Navigate to pip dependencies: dependencies list -> find dict with 'pip' key -> get pip list
+        dependencies = model_env.get('dependencies', [])
+        pip_deps = []
+        for dep in dependencies:
+            if isinstance(dep, dict) and 'pip' in dep:
+                pip_deps = dep['pip']
+                break
+
+        category_encoders_found = any("category_encoders" in dep for dep in pip_deps)
+        self.assertTrue(category_encoders_found, "category_encoders should be in model environment pip dependencies")
\ No newline at end of file
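
Why the pin matters, as a minimal standalone sketch (illustrative only, not part of the patch; the DataFrame and column names below are made up): a preprocess_func captured in a logged forecast model can call category_encoders at inference time, so the restored serving environment must be able to import the package at the same version.

    import category_encoders as ce
    import pandas as pd

    df = pd.DataFrame({"store": ["a", "b", "a", "c"]})

    # The encoder lives inside the pickled preprocess_func closure; unpickling
    # it in a fresh environment triggers `import category_encoders`, which
    # fails unless the model's requirements.txt pins the package as above.
    encoder = ce.BinaryEncoder(cols=["store"])
    encoded = encoder.fit_transform(df)
    print(encoded.columns.tolist())  # e.g. ['store_0', 'store_1']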