Skip to content

Commit 2caa144

Browse files
Meng, Peng and jnothman
authored and committed
[MGR + 2] fix selectFdr bug (scikit-learn#7490)
1 parent 74e4c42 commit 2caa144

File tree

3 files changed

+45
-3
lines changed

3 files changed

+45
-3
lines changed

doc/whats_new.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ Enhancements
5050
Bug fixes
5151
.........
5252

53+
- Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not
54+
exactly implement Benjamini-Hochberg procedure. It formerly may have
55+
selected fewer features than it should.
56+
(`#7490 <https://github.com/scikit-learn/scikit-learn/pull/7490>`_) by
57+
`Peng Meng`_.
58+
5359
- :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles
5460
integer inputs
5561
(`#6282 <https://github.com/scikit-learn/scikit-learn/pull/6282>`_) by
@@ -4873,3 +4879,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
48734879
.. _Eugene Chen: https://github.com/eyc88
48744880

48754881
.. _Narine Kokhlikyan: https://github.com/NarineK
4882+
4883+
.. _Peng Meng: https://github.com/mpjlu

sklearn/feature_selection/tests/test_feature_select.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,40 @@ def test_select_heuristics_regression():
371371
assert_less(np.sum(support[5:] == 1), 3)
372372

373373

374+
def test_boundary_case_ch2():
375+
# Test boundary case, and always aim to select 1 feature.
376+
X = np.array([[10, 20], [20, 20], [20, 30]])
377+
y = np.array([[1], [0], [0]])
378+
scores, pvalues = chi2(X, y)
379+
assert_array_almost_equal(scores, np.array([4., 0.71428571]))
380+
assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))
381+
382+
filter_fdr = SelectFdr(chi2, alpha=0.1)
383+
filter_fdr.fit(X, y)
384+
support_fdr = filter_fdr.get_support()
385+
assert_array_equal(support_fdr, np.array([True, False]))
386+
387+
filter_kbest = SelectKBest(chi2, k=1)
388+
filter_kbest.fit(X, y)
389+
support_kbest = filter_kbest.get_support()
390+
assert_array_equal(support_kbest, np.array([True, False]))
391+
392+
filter_percentile = SelectPercentile(chi2, percentile=50)
393+
filter_percentile.fit(X, y)
394+
support_percentile = filter_percentile.get_support()
395+
assert_array_equal(support_percentile, np.array([True, False]))
396+
397+
filter_fpr = SelectFpr(chi2, alpha=0.1)
398+
filter_fpr.fit(X, y)
399+
support_fpr = filter_fpr.get_support()
400+
assert_array_equal(support_fpr, np.array([True, False]))
401+
402+
filter_fwe = SelectFwe(chi2, alpha=0.1)
403+
filter_fwe.fit(X, y)
404+
support_fwe = filter_fwe.get_support()
405+
assert_array_equal(support_fwe, np.array([True, False]))
406+
407+
374408
def test_select_fdr_regression():
375409
# Test that fdr heuristic actually has low FDR.
376410
def single_fdr(alpha, n_informative, random_state):
@@ -404,7 +438,7 @@ def single_fdr(alpha, n_informative, random_state):
404438
# FDR = E(FP / (TP + FP)) <= alpha
405439
false_discovery_rate = np.mean([single_fdr(alpha, n_informative,
406440
random_state) for
407-
random_state in range(30)])
441+
random_state in range(100)])
408442
assert_greater_equal(alpha, false_discovery_rate)
409443

410444
# Make sure that the empirical false discovery rate increases

sklearn/feature_selection/univariate_selection.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,8 @@ def _get_support_mask(self):
596596

597597
n_features = len(self.pvalues_)
598598
sv = np.sort(self.pvalues_)
599-
selected = sv[sv <= float(self.alpha) / n_features
600-
* np.arange(n_features)]
599+
selected = sv[sv <= float(self.alpha) / n_features *
600+
np.arange(1, n_features + 1)]
601601
if selected.size == 0:
602602
return np.zeros_like(self.pvalues_, dtype=bool)
603603
return self.pvalues_ <= selected.max()

0 commit comments

Comments
 (0)