Skip to content

[ES-1492101] Add category_encoders to model dependency #174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions runtime/databricks/automl_runtime/forecast/deepar/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#
from typing import List, Optional

import category_encoders
import gluonts
import mlflow
import pandas as pd
Expand All @@ -29,6 +30,7 @@
# Extra pip requirements logged alongside a DeepAR MLflow model. Each entry
# pins the exact version installed in the current environment so the model can
# be restored/served reproducibly. category_encoders is pinned because AutoML
# data-preparation pipelines may apply categorical encodings before DeepAR.
DEEPAR_ADDITIONAL_PIP_DEPS = [
f"gluonts[torch]=={gluonts.__version__}",
f"pandas=={pd.__version__}",
f"category_encoders=={category_encoders.__version__}",
f"databricks-automl-runtime=={version.__version__}"
]

Expand Down
2 changes: 2 additions & 0 deletions runtime/databricks/automl_runtime/forecast/pmdarima/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from abc import abstractmethod
from typing import List, Dict, Tuple, Optional, Union

import category_encoders
import pandas as pd
import mlflow
import pmdarima
Expand All @@ -34,6 +35,7 @@
# Extra pip requirements logged alongside an ARIMA (pmdarima) MLflow model.
# Versions are pinned to the currently installed ones for reproducible
# serving; category_encoders is included for pipelines that encode
# categorical covariates before fitting.
ARIMA_ADDITIONAL_PIP_DEPS = [
f"pmdarima=={pmdarima.__version__}",
f"pandas=={pd.__version__}",
f"category_encoders=={category_encoders.__version__}",
f"databricks-automl-runtime=={version.__version__}"
]

Expand Down
2 changes: 2 additions & 0 deletions runtime/databricks/automl_runtime/forecast/prophet/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import mlflow
import pandas as pd
import prophet
import category_encoders

from mlflow.models.signature import ModelSignature
from mlflow.utils.environment import _mlflow_conda_env
Expand All @@ -32,6 +33,7 @@
# Extra pip requirements logged alongside a Prophet MLflow model. Each entry
# pins the version installed in the current environment; category_encoders is
# included so models whose preprocessing used categorical encoding can be
# restored in a serving environment.
PROPHET_ADDITIONAL_PIP_DEPS = [
f"prophet=={prophet.__version__}",
f"cloudpickle=={cloudpickle.__version__}",
f"category_encoders=={category_encoders.__version__}",
f"databricks-automl-runtime=={version.__version__}",
]

Expand Down
220 changes: 220 additions & 0 deletions runtime/tests/automl_runtime/forecast/deepar/model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,223 @@ def test_model_prediction_with_multiple_minutes_frequency(self, frequency_quanti
self.assertEqual(pred_df.columns.tolist(), [time_col, "yhat"])
self.assertEqual(len(pred_df), self.prediction_length)
self.assertGreater(pred_df[time_col].min(), sample_input[time_col].max())


class TestDeepARModelCategoryEncoders(unittest.TestCase):
    """Verify the ``category_encoders`` dependency added for DeepAR models.

    Guards the change that added ``category_encoders`` to
    ``DEEPAR_ADDITIONAL_PIP_DEPS``: the pinned dependency must appear in the
    logged model's requirements and environment, and the library itself must
    be importable and usable next to DeepAR models.
    """

    @classmethod
    def setUpClass(cls) -> None:
        # Mirror the setup of the main DeepAR test class: a trivial linear
        # network wrapped in a GluonTS PyTorchPredictor.
        cls.context_length = 5
        cls.prediction_length = 5

        class MockNetwork(nn.Module):
            """Minimal network mapping context_length inputs to prediction_length outputs."""

            def __init__(self, prediction_length: int, context_length: int) -> None:
                super().__init__()
                self.prediction_length = prediction_length
                self.context_length = context_length
                self.net = nn.Linear(context_length, prediction_length)

            def forward(self, past_target):
                out = self.net(past_target.float())
                # Add a sample dimension expected by the predictor.
                return out.unsqueeze(1)

        # Bug fix: the original passed cls.context_length as prediction_length
        # (harmless only because both constants happen to equal 5 here).
        cls.pred_net = MockNetwork(
            prediction_length=cls.prediction_length, context_length=cls.context_length
        )

        cls.transformation = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            instance_sampler=TestSplitSampler(),
            past_length=cls.context_length,
            future_length=cls.prediction_length,
        )

        cls.model = PyTorchPredictor(
            prediction_length=cls.prediction_length,
            input_names=["past_target"],
            prediction_net=cls.pred_net,
            batch_size=16,
            input_transform=cls.transformation,
            device="cpu",
        )

    @staticmethod
    def _sample_frame(num_rows, time_col, target_col, category_values=None):
        """Build a small daily time-series frame, optionally with a 'category_col'.

        Dates are spaced 3 days apart starting 2020-10-01 so num_rows <= 10
        stays within October.
        """
        columns = [
            pd.to_datetime(
                pd.Series(range(num_rows), name=time_col).apply(
                    lambda i: f"2020-10-{3 * i + 1}"
                )
            ),
            pd.Series(range(num_rows), name=target_col),
        ]
        if category_values is not None:
            columns.append(pd.Series(category_values, name="category_col"))
        return pd.concat(columns, axis=1)

    def _make_model(self, target_col, time_col, id_cols=None):
        """Construct a DeepARModel wrapping the shared mock predictor."""
        kwargs = dict(
            model=self.model,
            horizon=self.prediction_length,
            frequency_unit="d",
            frequency_quantity=1,
            num_samples=1,
            target_col=target_col,
            time_col=time_col,
        )
        if id_cols is not None:
            kwargs["id_cols"] = id_cols
        return DeepARModel(**kwargs)

    def test_category_encoders_in_requirements(self):
        """The logged model's requirements.txt must pin category_encoders."""
        import category_encoders

        target_col = "sales"
        time_col = "date"
        deepar_model = self._make_model(target_col, time_col)
        sample_input = self._sample_frame(10, time_col, target_col)

        with mlflow.start_run() as run:
            mlflow_deepar_log_model(deepar_model, sample_input)

        # Download and inspect the requirements file logged with the model.
        requirements_path = mlflow.artifacts.download_artifacts(
            f"runs:/{run.info.run_id}/model/requirements.txt"
        )
        with open(requirements_path, "r") as f:
            requirements = f.read()

        self.assertIn(
            "category_encoders",
            requirements,
            "category_encoders should be included in model requirements",
        )

        # The pinned version comes from DEEPAR_ADDITIONAL_PIP_DEPS and must
        # match the version installed in this environment.
        expected_dep = f"category_encoders=={category_encoders.__version__}"
        self.assertIn(
            expected_dep,
            requirements,
            f"Specific category_encoders version {expected_dep} should be in requirements",
        )

    def test_model_with_category_encoding_preprocessing(self):
        """category_encoders must be usable alongside a DeepAR model.

        DeepAR does not call category_encoders directly (unlike Prophet/ARIMA
        preprocessing), but data-preparation pipelines may; verify both the
        model constructor and a BinaryEncoder round-trip work.
        """
        target_col = "sales"
        time_col = "date"

        # Constructing the model must succeed with the standard arguments.
        deepar_model = self._make_model(target_col, time_col)
        self.assertIsNotNone(deepar_model)

        num_rows = 10
        sample_input = self._sample_frame(
            num_rows,
            time_col,
            target_col,
            category_values=[f"category_{i % 3}" for i in range(num_rows)],
        )

        try:
            import category_encoders as ce

            encoder = ce.BinaryEncoder(cols=["category_col"])
            encoded_data = encoder.fit_transform(sample_input[["category_col"]])
            self.assertIsNotNone(encoded_data)
        except ImportError:
            self.fail("category_encoders should be available for DeepAR models")

    def test_multiseries_model_with_category_encoding(self):
        """Multi-series DeepAR construction coexists with category encoding."""
        target_col = "sales"
        time_col = "date"
        id_col = "store"

        # Multi-series constructor must accept id_cols without error.
        deepar_model = self._make_model(target_col, time_col, id_cols=[id_col])
        self.assertIsNotNone(deepar_model)

        num_rows_per_ts = 10
        base = self._sample_frame(
            num_rows_per_ts,
            time_col,
            target_col,
            category_values=[f"cat_{i % 2}" for i in range(num_rows_per_ts)],
        )
        sample_input = pd.concat([base.copy(), base.copy()], ignore_index=True)
        sample_input[id_col] = [1] * num_rows_per_ts + [2] * num_rows_per_ts

        try:
            import category_encoders as ce

            # Creating the encoder is sufficient here; fitting a TargetEncoder
            # would additionally require the target series.
            encoder = ce.TargetEncoder(cols=["category_col"])
            self.assertIsNotNone(encoder)
        except ImportError:
            self.fail("category_encoders should be available for multi-series DeepAR models")

    def test_category_encoders_version_compatibility(self):
        """DEEPAR_ADDITIONAL_PIP_DEPS pins exactly one category_encoders entry."""
        category_encoders_deps = [
            dep for dep in DEEPAR_ADDITIONAL_PIP_DEPS if "category_encoders" in dep
        ]
        self.assertEqual(
            len(category_encoders_deps),
            1,
            "category_encoders should be in DEEPAR_ADDITIONAL_PIP_DEPS",
        )

        # The dependency must pin an exact version (== spec).
        category_encoders_dep = category_encoders_deps[0]
        self.assertIn(
            "==", category_encoders_dep, "category_encoders dependency should specify exact version"
        )

        # The pinned version must match the installed package.
        import category_encoders

        expected_dep = f"category_encoders=={category_encoders.__version__}"
        self.assertEqual(
            category_encoders_dep,
            expected_dep,
            f"Dependency should match installed version: {expected_dep}",
        )

    def test_model_environment_includes_category_encoders(self):
        """The model's conda environment lists category_encoders under pip."""
        deepar_model = self._make_model("sales", "date")

        model_env = deepar_model.model_env

        # Conda env layout: dependencies is a list mixing strings and a single
        # {'pip': [...]} dict; locate the pip requirement list.
        pip_deps = []
        for dep in model_env.get("dependencies", []):
            if isinstance(dep, dict) and "pip" in dep:
                pip_deps = dep["pip"]
                break

        category_encoders_found = any("category_encoders" in dep for dep in pip_deps)
        self.assertTrue(
            category_encoders_found,
            "category_encoders should be in model environment pip dependencies",
        )
Loading