ENH add normalize to LDA.transform (scikit-learn#30097)

adrinjalali · web-flow · commit a072e56fa67c · 2024-10-18T20:13:15.000+02:00
diff --git a/doc/whats_new/upcoming_changes/sklearn.decomposition/30097.enhancement.rst b/doc/whats_new/upcoming_changes/sklearn.decomposition/30097.enhancement.rst
@@ -0,0 +1,4 @@
+- :class:`~sklearn.decomposition.LatentDirichletAllocation` now has a
+  ``normalize`` parameter in ``transform`` and ``fit_transform`` methods
+  to control whether the document topic distribution is normalized.
+  By `Adrin Jalali`_.
diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py
@@ -723,7 +723,7 @@ def _unnormalized_transform(self, X):
 
         return doc_topic_distr
 
-    def transform(self, X):
+    def transform(self, X, *, normalize=True):
         """Transform data X according to the fitted model.
 
         .. versionchanged:: 0.18
@@ -734,6 +734,9 @@ def transform(self, X):
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             Document word matrix.
 
+        normalize : bool, default=True
+            Whether to normalize each document's topic distribution to sum to 1.
+
         Returns
         -------
         doc_topic_distr : ndarray of shape (n_samples, n_components)
@@ -744,9 +747,35 @@ def transform(self, X):
             X, reset_n_features=False, whom="LatentDirichletAllocation.transform"
         )
         doc_topic_distr = self._unnormalized_transform(X)
-        doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
+        if normalize:
+            doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
         return doc_topic_distr
 
+    def fit_transform(self, X, y=None, *, normalize=True):
+        """
+        Fit to data, then transform it.
+
+        Fits transformer to `X` and `y` and returns a transformed version of `X`.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Document word matrix.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
+                default=None
+            Target values (None for unsupervised transformations).
+
+        normalize : bool, default=True
+            Whether to normalize the document topic distribution in `transform`.
+
+        Returns
+        -------
+        X_new : ndarray of shape (n_samples, n_components)
+            Document topic distribution for `X`, as returned by `transform`.
+        """
+        return self.fit(X, y).transform(X, normalize=normalize)
+
     def _approx_bound(self, X, doc_topic_distr, sub_sampling):
         """Estimate the variational bound.
 
diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
@@ -132,7 +132,7 @@ def test_lda_dense_input(csr_container):
 
 def test_lda_transform():
     # Test LDA transform.
-    # Transform result cannot be negative and should be normalized
+    # Transform result cannot be negative and should be normalized by default
     rng = np.random.RandomState(0)
     X = rng.randint(5, size=(20, 10))
     n_components = 3
@@ -141,6 +141,11 @@ def test_lda_transform():
     assert (X_trans > 0.0).any()
     assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
 
+    X_trans_unnormalized = lda.transform(X, normalize=False)
+    assert_array_almost_equal(
+        X_trans, X_trans_unnormalized / X_trans_unnormalized.sum(axis=1)[:, np.newaxis]
+    )
+
 
 @pytest.mark.parametrize("method", ("online", "batch"))
 def test_lda_fit_transform(method):