FIX make sure IterativeImputer does not skip iterative process when keep_empty_features=True (scikit-learn#29779)

arifqodari · glemaitre · web-flow · commit eec6ef005c89 · 2024-10-10T15:45:26.000+02:00
Co-authored-by: Guillaume Lemaitre &lt;guillaume@probabl.ai&gt;
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -262,6 +262,10 @@ Changelog
   computing the mean value for uniform weights.
   :pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.
 
+- |Fix| Fixed :class:`impute.IterativeImputer` to make sure that it does not skip
+  the iterative process when `keep_empty_features` is set to `True`.
+  :pr:`29779` by :user:`Arif Qodari <arifqodari>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
@@ -646,19 +646,28 @@ def _initial_imputation(self, X, in_fit=False):
         else:
             X_filled = self.initial_imputer_.transform(X)
 
-        valid_mask = np.flatnonzero(
-            np.logical_not(np.isnan(self.initial_imputer_.statistics_))
-        )
+        if in_fit:
+            self._is_empty_feature = np.all(mask_missing_values, axis=0)
 
         if not self.keep_empty_features:
             # drop empty features
-            Xt = X[:, valid_mask]
-            mask_missing_values = mask_missing_values[:, valid_mask]
+            Xt = X[:, ~self._is_empty_feature]
+            mask_missing_values = mask_missing_values[:, ~self._is_empty_feature]
+
+            if self.initial_imputer_.get_params()["strategy"] == "constant":
+                # The constant strategy has a specific behavior and preserve empty
+                # features even with ``keep_empty_features=False``. We need to drop
+                # the column for consistency.
+                # TODO: remove this `if` branch once the following issue is addressed:
+                # https://github.com/scikit-learn/scikit-learn/issues/29827
+                X_filled = X_filled[:, ~self._is_empty_feature]
+
         else:
             # mark empty features as not missing and keep the original
             # imputation
-            mask_missing_values[:, valid_mask] = True
+            mask_missing_values[:, self._is_empty_feature] = False
             Xt = X
+            Xt[:, self._is_empty_feature] = X_filled[:, self._is_empty_feature]
 
         return Xt, X_filled, mask_missing_values, X_missing_mask
 
diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py
@@ -1513,24 +1513,6 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
     )
 
 
-@pytest.mark.parametrize(
-    "initial_strategy", ["mean", "median", "most_frequent", "constant"]
-)
-def test_iterative_imputer_keep_empty_features(initial_strategy):
-    """Check the behaviour of the iterative imputer with different initial strategy
-    and keeping empty features (i.e. features containing only missing values).
-    """
-    X = np.array([[1, np.nan, 2], [3, np.nan, np.nan]])
-
-    imputer = IterativeImputer(
-        initial_strategy=initial_strategy, keep_empty_features=True
-    )
-    X_imputed = imputer.fit_transform(X)
-    assert_allclose(X_imputed[:, 1], 0)
-    X_imputed = imputer.transform(X)
-    assert_allclose(X_imputed[:, 1], 0)
-
-
 def test_iterative_imputer_constant_fill_value():
     """Check that we propagate properly the parameter `fill_value`."""
     X = np.array([[-1, 2, 3, -1], [4, -1, 5, -1], [6, 7, -1, -1], [8, 9, 0, -1]])
@@ -1786,3 +1768,70 @@ def test_simple_imputer_constant_fill_value_casting():
         )
         X_trans = imputer.fit_transform(X_float32)
         assert X_trans.dtype == X_float32.dtype
+
+
+@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
+def test_iterative_imputer_no_empty_features(strategy):
+    """Check the behaviour of `keep_empty_features` with no empty features.
+
+    With no-empty features, we should get the same imputation whatever the
+    parameter `keep_empty_features`.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/29375
+    """
+    X = np.array([[np.nan, 0, 1], [2, np.nan, 3], [4, 5, np.nan]])
+
+    imputer_drop_empty_features = IterativeImputer(
+        initial_strategy=strategy, fill_value=1, keep_empty_features=False
+    )
+
+    imputer_keep_empty_features = IterativeImputer(
+        initial_strategy=strategy, fill_value=1, keep_empty_features=True
+    )
+
+    assert_allclose(
+        imputer_drop_empty_features.fit_transform(X),
+        imputer_keep_empty_features.fit_transform(X),
+    )
+
+
+@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent", "constant"])
+@pytest.mark.parametrize(
+    "X_test",
+    [
+        np.array([[1, 2, 3, 4], [5, 6, 7, 8]]),  # without empty feature
+        np.array([[np.nan, 2, 3, 4], [np.nan, 6, 7, 8]]),  # empty feature at column 0
+        np.array([[1, 2, 3, np.nan], [5, 6, 7, np.nan]]),  # empty feature at column 3
+    ],
+)
+def test_iterative_imputer_with_empty_features(strategy, X_test):
+    """Check the behaviour of `keep_empty_features` in the presence of empty features.
+
+    With `keep_empty_features=True`, the empty feature will be imputed with the value
+    defined by the initial imputation.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/29375
+    """
+    X_train = np.array(
+        [[np.nan, np.nan, 0, 1], [np.nan, 2, np.nan, 3], [np.nan, 4, 5, np.nan]]
+    )
+
+    imputer_drop_empty_features = IterativeImputer(
+        initial_strategy=strategy, fill_value=0, keep_empty_features=False
+    )
+    X_train_drop_empty_features = imputer_drop_empty_features.fit_transform(X_train)
+    X_test_drop_empty_features = imputer_drop_empty_features.transform(X_test)
+
+    imputer_keep_empty_features = IterativeImputer(
+        initial_strategy=strategy, fill_value=0, keep_empty_features=True
+    )
+    X_train_keep_empty_features = imputer_keep_empty_features.fit_transform(X_train)
+    X_test_keep_empty_features = imputer_keep_empty_features.transform(X_test)
+
+    assert_allclose(X_train_drop_empty_features, X_train_keep_empty_features[:, 1:])
+    assert_allclose(X_train_keep_empty_features[:, 0], 0)
+
+    assert X_train_drop_empty_features.shape[1] == X_test_drop_empty_features.shape[1]
+    assert X_train_keep_empty_features.shape[1] == X_test_keep_empty_features.shape[1]