[MRG] Correcting length of explained_variance_ratio_, eigen solver (scikit-learn#7632)

JPFrancoia · jnothman · commit f260898e0f1d · 2016-10-25T23:52:13.000+11:00
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -98,11 +98,31 @@ Bug fixes
      `#6497 <https://github.com/scikit-learn/scikit-learn/pull/6497>`_
      by `Sebastian Säger`_
 
+   - Attribute ``explained_variance_ratio_`` of
+     :class:`discriminant_analysis.LinearDiscriminantAnalysis` now has the
+     same length whether computed with the SVD or the Eigen solver. (`#7632
+     <https://github.com/scikit-learn/scikit-learn/pull/7632>`_).
+     By `JPFrancoia`_
+
    - Fixes issue in :ref:`univariate_feature_selection` where score 
      functions were not accepting multi-label targets.(`#7676
      <https://github.com/scikit-learn/scikit-learn/pull/7676>`_)
      by `Mohammed Affan`_
-     
+
+
+API changes summary
+-------------------
+
+Linear, kernelized and related models
+
+   - Length of ``explained_variance_ratio_`` of
+     :class:`discriminant_analysis.LinearDiscriminantAnalysis`
+     changed for both Eigen and SVD solvers. The attribute now has a length
+     of ``min(n_components, n_classes - 1)``. (`#7632
+     <https://github.com/scikit-learn/scikit-learn/pull/7632>`_).
+     By `JPFrancoia`_
+
+
 .. _changes_0_18:
 
 Version 0.18
@@ -571,8 +591,8 @@ Decomposition, manifold learning and clustering
       :class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
       and :class:`manifold.SpectralEmbedding` (`#5012 <https://github.com/scikit-learn/scikit-learn/pull/5012>`_). By `Peter Fischer`_.
 
-    - Attribute ``explained_variance_ratio_`` calculated with the SVD solver of
-      :class:`discriminant_analysis.LinearDiscriminantAnalysis` now returns
+    - Attribute ``explained_variance_ratio_`` calculated with the SVD solver
+      of :class:`discriminant_analysis.LinearDiscriminantAnalysis` now returns
       correct results. By `JPFrancoia`_
 
 Preprocessing and feature selection
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
@@ -293,8 +293,8 @@ def _solve_lsqr(self, X, y, shrinkage):
         self.means_ = _class_means(X, y)
         self.covariance_ = _class_cov(X, y, self.priors_, shrinkage)
         self.coef_ = linalg.lstsq(self.covariance_, self.means_.T)[0].T
-        self.intercept_ = (-0.5 * np.diag(np.dot(self.means_, self.coef_.T))
-                           + np.log(self.priors_))
+        self.intercept_ = (-0.5 * np.diag(np.dot(self.means_, self.coef_.T)) +
+                           np.log(self.priors_))
 
     def _solve_eigen(self, X, y, shrinkage):
         """Eigenvalue solver.
@@ -336,15 +336,16 @@ class scatter). This solver supports both classification and
         Sb = St - Sw  # between scatter
 
         evals, evecs = linalg.eigh(Sb, Sw)
-        self.explained_variance_ratio_ = np.sort(evals / np.sum(evals))[::-1]
+        self.explained_variance_ratio_ = np.sort(evals / np.sum(evals)
+                                                 )[::-1][:self._max_components]
         evecs = evecs[:, np.argsort(evals)[::-1]]  # sort eigenvectors
         # evecs /= np.linalg.norm(evecs, axis=0)  # doesn't work with numpy 1.6
         evecs /= np.apply_along_axis(np.linalg.norm, 0, evecs)
 
         self.scalings_ = evecs
         self.coef_ = np.dot(self.means_, evecs).dot(evecs.T)
-        self.intercept_ = (-0.5 * np.diag(np.dot(self.means_, self.coef_.T))
-                           + np.log(self.priors_))
+        self.intercept_ = (-0.5 * np.diag(np.dot(self.means_, self.coef_.T)) +
+                           np.log(self.priors_))
 
     def _solve_svd(self, X, y):
         """SVD solver.
@@ -400,12 +401,12 @@ def _solve_svd(self, X, y):
         _, S, V = linalg.svd(X, full_matrices=0)
 
         self.explained_variance_ratio_ = (S**2 / np.sum(
-                S**2))[:self.n_components]
+            S**2))[:self._max_components]
         rank = np.sum(S > self.tol * S[0])
         self.scalings_ = np.dot(scalings, V.T[:, :rank])
         coef = np.dot(self.means_ - self.xbar_, self.scalings_)
-        self.intercept_ = (-0.5 * np.sum(coef ** 2, axis=1)
-                           + np.log(self.priors_))
+        self.intercept_ = (-0.5 * np.sum(coef ** 2, axis=1) +
+                           np.log(self.priors_))
         self.coef_ = np.dot(coef, self.scalings_.T)
         self.intercept_ -= np.dot(self.xbar_, self.coef_.T)
 
@@ -457,6 +458,13 @@ def fit(self, X, y, store_covariance=None, tol=None):
                           UserWarning)
             self.priors_ = self.priors_ / self.priors_.sum()
 
+        # Get the maximum number of components
+        if self.n_components is None:
+            self._max_components = len(self.classes_) - 1
+        else:
+            self._max_components = min(len(self.classes_) - 1,
+                                       self.n_components)
+
         if self.solver == 'svd':
             if self.shrinkage is not None:
                 raise NotImplementedError('shrinkage not supported')
@@ -497,9 +505,8 @@ def transform(self, X):
             X_new = np.dot(X - self.xbar_, self.scalings_)
         elif self.solver == 'eigen':
             X_new = np.dot(X, self.scalings_)
-        n_components = X.shape[1] if self.n_components is None \
-            else self.n_components
-        return X_new[:, :n_components]
+
+        return X_new[:, :self._max_components]
 
     def predict_proba(self, X):
         """Estimate probability.
diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py
@@ -171,19 +171,17 @@ def test_lda_explained_variance_ratio():
     clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
     clf_lda_eigen.fit(X, y)
     assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
+    assert_equal(clf_lda_eigen.explained_variance_ratio_.shape, (2,),
+                 "Unexpected length for explained_variance_ratio_")
 
     clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
     clf_lda_svd.fit(X, y)
     assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)
+    assert_equal(clf_lda_svd.explained_variance_ratio_.shape, (2,),
+                 "Unexpected length for explained_variance_ratio_")
 
-    tested_length = min(clf_lda_svd.explained_variance_ratio_.shape[0],
-                        clf_lda_eigen.explained_variance_ratio_.shape[0])
-
-    # NOTE: clf_lda_eigen.explained_variance_ratio_ is not of n_components
-    # length. Make it the same length as clf_lda_svd.explained_variance_ratio_
-    # before comparison.
     assert_array_almost_equal(clf_lda_svd.explained_variance_ratio_,
-                              clf_lda_eigen.explained_variance_ratio_[:tested_length])
+                              clf_lda_eigen.explained_variance_ratio_)
 
 
 def test_lda_orthogonality():