EFF Optimize memory usage for sparse matrices in LLE (Hessian, Modified and LTSA) (scikit-learn#28096)

giorgioangel · web-flow · commit 9590c07128d3 · 2024-06-27T14:45:49.000Z
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -136,6 +136,14 @@ Changelog
   has no effect. `copy_X` will be removed in 1.8.
   :pr:`29105` by :user:`Adam Li <adam2392>`.
 
+:mod:`sklearn.manifold`
+.......................
+
+- |Efficiency| :func:`manifold.locally_linear_embedding` and
+  :class:`manifold.LocallyLinearEmbedding` now allocate more efficiently the memory of
+  sparse matrices in the Hessian, Modified and LTSA methods.
+  :pr:`28096` by :user:`Giorgio Angelotti <giorgioangel>`.
+
 :mod:`sklearn.metrics`
 ......................
 
diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 from scipy.linalg import eigh, qr, solve, svd
-from scipy.sparse import csr_matrix, eye
+from scipy.sparse import csr_matrix, eye, lil_matrix
 from scipy.sparse.linalg import eigsh
 
 from ..base import (
@@ -229,6 +229,7 @@ def _locally_linear_embedding(
         )
 
     M_sparse = eigen_solver != "dense"
+    M_container_constructor = lil_matrix if M_sparse else np.zeros
 
     if method == "standard":
         W = barycenter_kneighbors_graph(
@@ -239,7 +240,7 @@ def _locally_linear_embedding(
         # depending on the solver, we'll do this differently
         if M_sparse:
             M = eye(*W.shape, format=W.format) - W
-            M = (M.T * M).tocsr()
+            M = M.T * M
         else:
             M = (W.T * W - W.T - W).toarray()
             M.flat[:: M.shape[0] + 1] += 1  # W = W - I = W - I
@@ -262,7 +263,7 @@ def _locally_linear_embedding(
         Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
         Yi[:, 0] = 1
 
-        M = np.zeros((N, N), dtype=np.float64)
+        M = M_container_constructor((N, N), dtype=np.float64)
 
         use_svd = n_neighbors > d_in
 
@@ -295,9 +296,6 @@ def _locally_linear_embedding(
             nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
             M[nbrs_x, nbrs_y] += np.dot(w, w.T)
 
-        if M_sparse:
-            M = csr_matrix(M)
-
     elif method == "modified":
         if n_neighbors < n_components:
             raise ValueError("modified LLE requires n_neighbors >= n_components")
@@ -361,7 +359,8 @@ def _locally_linear_embedding(
 
         # Now calculate M.
         # This is the [N x N] matrix whose null space is the desired embedding
-        M = np.zeros((N, N), dtype=np.float64)
+        M = M_container_constructor((N, N), dtype=np.float64)
+
         for i in range(N):
             s_i = s_range[i]
 
@@ -397,19 +396,16 @@ def _locally_linear_embedding(
             M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
             Wi_sum1 = Wi.sum(1)
             M[i, neighbors[i]] -= Wi_sum1
-            M[neighbors[i], i] -= Wi_sum1
+            M[neighbors[i], [i]] -= Wi_sum1
             M[i, i] += s_i
 
-        if M_sparse:
-            M = csr_matrix(M)
-
     elif method == "ltsa":
         neighbors = nbrs.kneighbors(
             X, n_neighbors=n_neighbors + 1, return_distance=False
         )
         neighbors = neighbors[:, 1:]
 
-        M = np.zeros((N, N))
+        M = M_container_constructor((N, N), dtype=np.float64)
 
         use_svd = n_neighbors > d_in
 
@@ -432,7 +428,11 @@ def _locally_linear_embedding(
 
             nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
             M[nbrs_x, nbrs_y] -= GiGiT
-            M[neighbors[i], neighbors[i]] += 1
+
+            M[neighbors[i], neighbors[i]] += np.ones(shape=n_neighbors)
+
+    if M_sparse:
+        M = M.tocsr()
 
     return null_space(
         M,