Skip to content

Commit 2caa144

Browse files
Meng, Peng and jnothman
authored and committed
[MGR + 2] fix selectFdr bug (scikit-learn#7490)
1 parent 74e4c42 commit 2caa144

File tree

3 files changed

+45
-3
lines changed

3 files changed

+45
-3
lines changed

doc/whats_new.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ Enhancements
5050
Bug fixes
5151
.........
5252

53+
- Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not
54+
exactly implement Benjamini-Hochberg procedure. It formerly may have
55+
selected fewer features than it should.
56+
(`#7490 <https://github.com/scikit-learn/scikit-learn/pull/7490>`_) by
57+
`Peng Meng`_.
58+
5359
- :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles
5460
integer inputs
5561
(`#6282 <https://github.com/scikit-learn/scikit-learn/pull/6282>`_) by
@@ -4873,3 +4879,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
48734879
.. _Eugene Chen: https://github.com/eyc88
48744880

48754881
.. _Narine Kokhlikyan: https://github.com/NarineK
4882+
4883+
.. _Peng Meng: https://github.com/mpjlu

sklearn/feature_selection/tests/test_feature_select.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,40 @@ def test_select_heuristics_regression():
371371
assert_less(np.sum(support[5:] == 1), 3)
372372

373373

374+
def test_boundary_case_ch2():
375+
# Test boundary case, and always aim to select 1 feature.
376+
X = np.array([[10, 20], [20, 20], [20, 30]])
377+
y = np.array([[1], [0], [0]])
378+
scores, pvalues = chi2(X, y)
379+
assert_array_almost_equal(scores, np.array([4., 0.71428571]))
380+
assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))
381+
382+
filter_fdr = SelectFdr(chi2, alpha=0.1)
383+
filter_fdr.fit(X, y)
384+
support_fdr = filter_fdr.get_support()
385+
assert_array_equal(support_fdr, np.array([True, False]))
386+
387+
filter_kbest = SelectKBest(chi2, k=1)
388+
filter_kbest.fit(X, y)
389+
support_kbest = filter_kbest.get_support()
390+
assert_array_equal(support_kbest, np.array([True, False]))
391+
392+
filter_percentile = SelectPercentile(chi2, percentile=50)
393+
filter_percentile.fit(X, y)
394+
support_percentile = filter_percentile.get_support()
395+
assert_array_equal(support_percentile, np.array([True, False]))
396+
397+
filter_fpr = SelectFpr(chi2, alpha=0.1)
398+
filter_fpr.fit(X, y)
399+
support_fpr = filter_fpr.get_support()
400+
assert_array_equal(support_fpr, np.array([True, False]))
401+
402+
filter_fwe = SelectFwe(chi2, alpha=0.1)
403+
filter_fwe.fit(X, y)
404+
support_fwe = filter_fwe.get_support()
405+
assert_array_equal(support_fwe, np.array([True, False]))
406+
407+
374408
def test_select_fdr_regression():
375409
# Test that fdr heuristic actually has low FDR.
376410
def single_fdr(alpha, n_informative, random_state):
@@ -404,7 +438,7 @@ def single_fdr(alpha, n_informative, random_state):
404438
# FDR = E(FP / (TP + FP)) <= alpha
405439
false_discovery_rate = np.mean([single_fdr(alpha, n_informative,
406440
random_state) for
407-
random_state in range(30)])
441+
random_state in range(100)])
408442
assert_greater_equal(alpha, false_discovery_rate)
409443

410444
# Make sure that the empirical false discovery rate increases

sklearn/feature_selection/univariate_selection.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,8 @@ def _get_support_mask(self):
596596

597597
n_features = len(self.pvalues_)
598598
sv = np.sort(self.pvalues_)
599-
selected = sv[sv <= float(self.alpha) / n_features
600-
* np.arange(n_features)]
599+
selected = sv[sv <= float(self.alpha) / n_features *
600+
np.arange(1, n_features + 1)]
601601
if selected.size == 0:
602602
return np.zeros_like(self.pvalues_, dtype=bool)
603603
return self.pvalues_ <= selected.max()

0 commit comments

Comments
 (0)