-
Notifications
You must be signed in to change notification settings - Fork 25
Open
Description
When a deployed_model is provided to instantiate a SmartDrift object, the chisq_test function is called.
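Its execution raises an exception as soon as one str column contains null values: the nulls are float NaN, and np.unique sorts the values, which fails when str and float are compared (see the traceback below).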
The following snippet, inspired by the tutorial01-datadrift-over-years.ipynb tutorial, reproduces the issue:
import pandas as pd
from category_encoders import OrdinalEncoder
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from eurybia import SmartDrift
from eurybia.data.data_loader import data_loading
house_df, house_dict = data_loading("house_prices")
house_df_learning = house_df.loc[house_df["YrSold"] == 2006]
house_df_2007 = house_df.loc[house_df["YrSold"] == 2007]
y_df_learning = house_df_learning["SalePrice"].to_frame()
X_df_learning = house_df_learning[house_df_learning.columns.difference(["SalePrice", "YrSold"])]
y_df_2007 = house_df_2007["SalePrice"].to_frame()
X_df_2007 = house_df_2007[house_df_2007.columns.difference(["SalePrice", "YrSold"])]
categorical_features = [col for col in X_df_learning.columns if X_df_learning[col].dtype == "object"]
encoder = OrdinalEncoder(cols=categorical_features, handle_unknown="ignore", return_df=True).fit(X_df_learning)
X_df_learning_encoded = encoder.transform(X_df_learning)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_df_learning_encoded, y_df_learning, train_size=0.75, random_state=1)
regressor = LGBMRegressor(n_estimators=200).fit(Xtrain, ytrain)
print("cols", [(c, type(c)) for c in pd.unique(X_df_2007["MasVnrType"])])
SmartDrift(
df_current=X_df_2007,
df_baseline=X_df_learning,
deployed_model=regressor,
encoding=encoder,
).compile()
Output:
cols: [(nan, <class 'float'>), ('Stone', <class 'str'>), ('Brick Face', <class 'str'>), ('Brick Common', <class 'str'>)]
Traceback (most recent call last):
File "/Users/78176D/workspace/eurybia/eurybia/core/smartdrift.py", line 1009, in _compute_datadrift_stat_test
test = chisq_test(current[features].to_numpy(), baseline[features].to_numpy())
File "/Users/78176D/workspace/eurybia/eurybia/utils/statistical_tests.py", line 47, in chisq_test
uniq_a, freq_a = np.unique(obs_a, return_counts=True)
~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/78176D/workspace/eurybia/.venv/lib/python3.13/site-packages/numpy/lib/arraysetops.py", line 274, in unique
ret = _unique1d(ar, return_index, return_inverse, return_counts,
equal_nan=equal_nan)
File "/Users/78176D/workspace/eurybia/.venv/lib/python3.13/site-packages/numpy/lib/arraysetops.py", line 336, in _unique1d
ar.sort()
~~~~~~~^^
TypeError: '<' not supported between instances of 'str' and 'float'
Python version : 3.13.9
Eurybia version : 1.3.3
Operating System : macOS
Metadata
Metadata
Assignees
Labels
No labels