DOC documentation and error message for mismatching output formats in transformers with a sparse output (scikit-learn#26919)

StefanieSenger · adrinjalali · ogrisel · web-flow · commit 20dad5851e77 · 2023-09-21T16:54:23.000+02:00
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -450,7 +450,7 @@ class OneHotEncoder(_BaseEncoder):
     The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
     encoding scheme. This creates a binary column for each category and
     returns a sparse matrix or dense array (depending on the ``sparse_output``
-    parameter)
+    parameter).
 
     By default, the encoder derives the categories based on the unique values
     in each feature. Alternatively, you can also specify the `categories`
@@ -522,7 +522,8 @@ class OneHotEncoder(_BaseEncoder):
            `sparse_output` instead.
 
     sparse_output : bool, default=True
-        Will return sparse matrix if set True else will return an array.
+        When ``True``, it returns a :class:`scipy.sparse.csr_matrix`,
+        i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format.
 
         .. versionadded:: 1.2
            `sparse` was renamed to `sparse_output`
@@ -995,8 +996,12 @@ def transform(self, X):
         """
         Transform X using one-hot encoding.
 
-        If there are infrequent categories for a feature, the infrequent
-        categories will be grouped into a single category.
+        If `sparse_output=True` (default), it returns an instance of
+        :class:`scipy.sparse.csr_matrix` (CSR format).
+
+        If there are infrequent categories for a feature, set by specifying
+        `max_categories` or `min_frequency`, the infrequent categories are
+        grouped into a single category.
 
         Parameters
         ----------
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -1979,7 +1979,7 @@ def test_one_hot_encoder_set_output():
 
     ohe.set_output(transform="pandas")
 
-    match = "Pandas output does not support sparse data"
+    match = "Pandas output does not support sparse data. Set sparse_output=False"
     with pytest.raises(ValueError, match=match):
         ohe.fit_transform(X_df)
 
diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
@@ -43,7 +43,11 @@ def _wrap_in_pandas_container(
         Container with column names or unchanged `output`.
     """
     if issparse(data_to_wrap):
-        raise ValueError("Pandas output does not support sparse data.")
+        raise ValueError(
+            "The transformer outputs a scipy sparse matrix. "
+            "Try to set the transformer output to a dense array or disable "
+            "pandas output with set_output(transform='default')."
+        )
 
     if callable(columns):
         try:
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -4557,7 +4557,11 @@ def check_set_output_transform_pandas(name, transformer_orig):
         outputs_pandas = _output_from_fit_transform(transformer_pandas, name, X, df, y)
     except ValueError as e:
         # transformer does not support sparse data
-        assert "Pandas output does not support sparse data." in str(e), e
+        error_message = str(e)
+        assert (
+            "Pandas output does not support sparse data." in error_message
+            or "The transformer outputs a scipy sparse matrix." in error_message
+        ), e
         return
 
     for case in outputs_default:
@@ -4603,7 +4607,11 @@ def check_global_output_transform_pandas(name, transformer_orig):
             )
     except ValueError as e:
         # transformer does not support sparse data
-        assert "Pandas output does not support sparse data." in str(e), e
+        error_message = str(e)
+        assert (
+            "Pandas output does not support sparse data." in error_message
+            or "The transformer outputs a scipy sparse matrix." in error_message
+        ), e
         return
 
     for case in outputs_default:
diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
@@ -46,7 +46,7 @@ def test__wrap_in_pandas_container_error_validation(csr_container):
     """Check errors in _wrap_in_pandas_container."""
     X = np.asarray([[1, 0, 3], [0, 0, 1]])
     X_csr = csr_container(X)
-    match = "Pandas output does not support sparse data"
+    match = "The transformer outputs a scipy sparse matrix."
     with pytest.raises(ValueError, match=match):
         _wrap_in_pandas_container(X_csr, columns=["a", "b", "c"])