Skip to content

Commit eec6ef0

Browse files
FIX make sure IterativeImputer does not skip iterative process when keep_empty_features=True (scikit-learn#29779)
Co-authored-by: Guillaume Lemaitre <guillaume@probabl.ai>
1 parent ff02e17 commit eec6ef0

File tree

3 files changed

+86
-24
lines changed

3 files changed

+86
-24
lines changed

doc/whats_new/v1.6.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,10 @@ Changelog
262262
computing the mean value for uniform weights.
263263
:pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.
264264

265+
- |Fix| Fixed :class:`impute.IterativeImputer` to make sure that it does not skip
266+
the iterative process when `keep_empty_features` is set to `True`.
267+
:pr:`29779` by :user:`Arif Qodari <arifqodari>`.
268+
265269
:mod:`sklearn.linear_model`
266270
...........................
267271

sklearn/impute/_iterative.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -646,19 +646,28 @@ def _initial_imputation(self, X, in_fit=False):
646646
else:
647647
X_filled = self.initial_imputer_.transform(X)
648648

649-
valid_mask = np.flatnonzero(
650-
np.logical_not(np.isnan(self.initial_imputer_.statistics_))
651-
)
649+
if in_fit:
650+
self._is_empty_feature = np.all(mask_missing_values, axis=0)
652651

653652
if not self.keep_empty_features:
654653
# drop empty features
655-
Xt = X[:, valid_mask]
656-
mask_missing_values = mask_missing_values[:, valid_mask]
654+
Xt = X[:, ~self._is_empty_feature]
655+
mask_missing_values = mask_missing_values[:, ~self._is_empty_feature]
656+
657+
if self.initial_imputer_.get_params()["strategy"] == "constant":
658+
# The constant strategy has a specific behavior and preserves empty
659+
# features even with ``keep_empty_features=False``. We need to drop
660+
# the column for consistency.
661+
# TODO: remove this `if` branch once the following issue is addressed:
662+
# https://github.com/scikit-learn/scikit-learn/issues/29827
663+
X_filled = X_filled[:, ~self._is_empty_feature]
664+
657665
else:
658666
# mark empty features as not missing and keep the original
659667
# imputation
660-
mask_missing_values[:, valid_mask] = True
668+
mask_missing_values[:, self._is_empty_feature] = False
661669
Xt = X
670+
Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature]
662671

663672
return Xt, X_filled, mask_missing_values, X_missing_mask
664673

sklearn/impute/tests/test_impute.py

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,24 +1513,6 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
15131513
)
15141514

15151515

1516-
@pytest.mark.parametrize(
1517-
"initial_strategy", ["mean", "median", "most_frequent", "constant"]
1518-
)
1519-
def test_iterative_imputer_keep_empty_features(initial_strategy):
1520-
"""Check the behaviour of the iterative imputer with different initial strategy
1521-
and keeping empty features (i.e. features containing only missing values).
1522-
"""
1523-
X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
1524-
1525-
imputer = IterativeImputer(
1526-
initial_strategy=initial_strategy, keep_empty_features=True
1527-
)
1528-
X_imputed = imputer.fit_transform(X)
1529-
assert_allclose(X_imputed[:, 1], 0)
1530-
X_imputed = imputer.transform(X)
1531-
assert_allclose(X_imputed[:, 1], 0)
1532-
1533-
15341516
def test_iterative_imputer_constant_fill_value():
15351517
"""Check that we propagate properly the parameter `fill_value`."""
15361518
X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
@@ -1786,3 +1768,70 @@ def test_simple_imputer_constant_fill_value_casting():
17861768
)
17871769
X_trans = imputer.fit_transform(X_float32)
17881770
assert X_trans.dtype == X_float32.dtype
1771+
1772+
1773+
@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
1774+
def test_iterative_imputer_no_empty_features(strategy):
1775+
"""Check the behaviour of `keep_empty_features` with no empty features.
1776+
1777+
With no empty features, we should get the same imputation whatever the
1778+
parameter `keep_empty_features`.
1779+
1780+
Non-regression test for:
1781+
https://github.com/scikit-learn/scikit-learn/issues/29375
1782+
"""
1783+
X = np.array([[np.nan, 0, 1], [2, np.nan, 3], [4, 5, np.nan]])
1784+
1785+
imputer_drop_empty_features = IterativeImputer(
1786+
initial_strategy=strategy, fill_value=1, keep_empty_features=False
1787+
)
1788+
1789+
imputer_keep_empty_features = IterativeImputer(
1790+
initial_strategy=strategy, fill_value=1, keep_empty_features=True
1791+
)
1792+
1793+
assert_allclose(
1794+
imputer_drop_empty_features.fit_transform(X),
1795+
imputer_keep_empty_features.fit_transform(X),
1796+
)
1797+
1798+
1799+
@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
1800+
@pytest.mark.parametrize(
1801+
"X_test",
1802+
[
1803+
np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), # without empty feature
1804+
np.array([[np.nan, 2, 3, 4], [np.nan, 6, 7, 8]]), # empty feature at column 0
1805+
np.array([[1, 2, 3, np.nan], [5, 6, 7, np.nan]]), # empty feature at column 3
1806+
],
1807+
)
1808+
def test_iterative_imputer_with_empty_features(strategy, X_test):
1809+
"""Check the behaviour of `keep_empty_features` in the presence of empty features.
1810+
1811+
With `keep_empty_features=True`, the empty feature will be imputed with the value
1812+
defined by the initial imputation.
1813+
1814+
Non-regression test for:
1815+
https://github.com/scikit-learn/scikit-learn/issues/29375
1816+
"""
1817+
X_train = np.array(
1818+
[[np.nan, np.nan, 0, 1], [np.nan, 2, np.nan, 3], [np.nan, 4, 5, np.nan]]
1819+
)
1820+
1821+
imputer_drop_empty_features = IterativeImputer(
1822+
initial_strategy=strategy, fill_value=0, keep_empty_features=False
1823+
)
1824+
X_train_drop_empty_features = imputer_drop_empty_features.fit_transform(X_train)
1825+
X_test_drop_empty_features = imputer_drop_empty_features.transform(X_test)
1826+
1827+
imputer_keep_empty_features = IterativeImputer(
1828+
initial_strategy=strategy, fill_value=0, keep_empty_features=True
1829+
)
1830+
X_train_keep_empty_features = imputer_keep_empty_features.fit_transform(X_train)
1831+
X_test_keep_empty_features = imputer_keep_empty_features.transform(X_test)
1832+
1833+
assert_allclose(X_train_drop_empty_features, X_train_keep_empty_features[:, 1:])
1834+
assert_allclose(X_train_keep_empty_features[:, 0], 0)
1835+
1836+
assert X_train_drop_empty_features.shape[1] == X_test_drop_empty_features.shape[1]
1837+
assert X_train_keep_empty_features.shape[1] == X_test_keep_empty_features.shape[1]

0 commit comments

Comments
 (0)