Skip to content

Commit 829efa5

Browse files
NarineKamueller
authored and committed
[MRG+1] Learning curve: Add an option to randomly choose indices for different training sizes (scikit-learn#7506)
* Chooses randomly the indices for different training sizes * Bring back deleted line * Rewrote the description of 'shuffle' attribute * use random.sample instead of np.random.choice * replace tabs with spaces * merge to master * Added shuffle in model-selection's learning_curve method * Added shuffle for incremental learning + addressed Joel's comment * Shorten long lines * Add 2 blank spaces between test cases * Addressed Joel's review comments * Added 2 blank lines between methods * Added non regression test for learning_curve with shuffle * Fixed indentions * Fixed space issues * Modified test cases + small code improvements * Fix some style issues * Addressed Joel's comments - removed _shuffle_train_indices, more test cases and added new entry under 0.19/enhancements * Added some modifications in whats_new.rst
1 parent 387f25c commit 829efa5

File tree

3 files changed

+87
-27
lines changed

3 files changed

+87
-27
lines changed

doc/whats_new.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ Enhancements
4141
(`#6101 <https://github.com/scikit-learn/scikit-learn/pull/6101>`_)
4242
By `Ibraim Ganiev`_.
4343

44+
- Added ``shuffle`` and ``random_state`` parameters to shuffle training
45+
data before taking prefixes of it based on training sizes in
46+
:func:`model_selection.learning_curve`.
47+
(`#7506 <https://github.com/scikit-learn/scikit-learn/pull/7506>`_) by
48+
`Narine Kokhlikyan`_.
49+
4450
Bug fixes
4551
.........
4652

@@ -4861,3 +4867,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
48614867
.. _Utkarsh Upadhyay: https://github.com/musically-ut
48624868

48634869
.. _Eugene Chen: https://github.com/eyc88
4870+
4871+
.. _Narine Kokhlikyan: https://github.com/NarineK

sklearn/model_selection/_validation.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -642,7 +642,8 @@ def _shuffle(y, groups, random_state):
642642
def learning_curve(estimator, X, y, groups=None,
643643
train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None,
644644
exploit_incremental_learning=False, n_jobs=1,
645-
pre_dispatch="all", verbose=0):
645+
pre_dispatch="all", verbose=0, shuffle=False,
646+
random_state=None):
646647
"""Learning curve.
647648
648649
Determines cross-validated training and test scores for different training
@@ -718,7 +719,14 @@ def learning_curve(estimator, X, y, groups=None,
718719
verbose : integer, optional
719720
Controls the verbosity: the higher, the more messages.
720721
721-
Returns
722+
shuffle : boolean, optional
723+
Whether to shuffle training data before taking prefixes of it
724+
based on ``train_sizes``.
725+
726+
random_state : None, int or RandomState
727+
When shuffle=True, pseudo-random number generator state used for
728+
shuffling. If None, use default numpy RNG for shuffling.
729+
722730
-------
723731
train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
724732
Numbers of training examples that has been used to generate the
@@ -759,17 +767,27 @@ def learning_curve(estimator, X, y, groups=None,
759767

760768
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
761769
verbose=verbose)
770+
771+
if shuffle:
772+
rng = check_random_state(random_state)
773+
cv_iter = ((rng.permutation(train), test) for train, test in cv_iter)
774+
762775
if exploit_incremental_learning:
763776
classes = np.unique(y) if is_classifier(estimator) else None
764777
out = parallel(delayed(_incremental_fit_estimator)(
765-
clone(estimator), X, y, classes, train, test, train_sizes_abs,
766-
scorer, verbose) for train, test in cv.split(X, y, groups))
778+
clone(estimator), X, y, classes, train,
779+
test, train_sizes_abs, scorer, verbose)
780+
for train, test in cv_iter)
767781
else:
782+
train_test_proportions = []
783+
for train, test in cv_iter:
784+
for n_train_samples in train_sizes_abs:
785+
train_test_proportions.append((train[:n_train_samples], test))
786+
768787
out = parallel(delayed(_fit_and_score)(
769-
clone(estimator), X, y, scorer, train[:n_train_samples], test,
788+
clone(estimator), X, y, scorer, train, test,
770789
verbose, parameters=None, fit_params=None, return_train_score=True)
771-
for train, test in cv_iter
772-
for n_train_samples in train_sizes_abs)
790+
for train, test in train_test_proportions)
773791
out = np.array(out)
774792
n_cv_folds = out.shape[0] // n_unique_ticks
775793
out = out.reshape(n_cv_folds, n_unique_ticks, 2)

sklearn/model_selection/tests/test_validation.py

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -560,18 +560,20 @@ def test_learning_curve():
560560
n_redundant=0, n_classes=2,
561561
n_clusters_per_class=1, random_state=0)
562562
estimator = MockImprovingEstimator(20)
563-
with warnings.catch_warnings(record=True) as w:
564-
train_sizes, train_scores, test_scores = learning_curve(
565-
estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10))
566-
if len(w) > 0:
567-
raise RuntimeError("Unexpected warning: %r" % w[0].message)
568-
assert_equal(train_scores.shape, (10, 3))
569-
assert_equal(test_scores.shape, (10, 3))
570-
assert_array_equal(train_sizes, np.linspace(2, 20, 10))
571-
assert_array_almost_equal(train_scores.mean(axis=1),
572-
np.linspace(1.9, 1.0, 10))
573-
assert_array_almost_equal(test_scores.mean(axis=1),
574-
np.linspace(0.1, 1.0, 10))
563+
for shuffle_train in [False, True]:
564+
with warnings.catch_warnings(record=True) as w:
565+
train_sizes, train_scores, test_scores = learning_curve(
566+
estimator, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 10),
567+
shuffle=shuffle_train)
568+
if len(w) > 0:
569+
raise RuntimeError("Unexpected warning: %r" % w[0].message)
570+
assert_equal(train_scores.shape, (10, 3))
571+
assert_equal(test_scores.shape, (10, 3))
572+
assert_array_equal(train_sizes, np.linspace(2, 20, 10))
573+
assert_array_almost_equal(train_scores.mean(axis=1),
574+
np.linspace(1.9, 1.0, 10))
575+
assert_array_almost_equal(test_scores.mean(axis=1),
576+
np.linspace(0.1, 1.0, 10))
575577

576578

577579
def test_learning_curve_unsupervised():
@@ -622,14 +624,15 @@ def test_learning_curve_incremental_learning():
622624
n_redundant=0, n_classes=2,
623625
n_clusters_per_class=1, random_state=0)
624626
estimator = MockIncrementalImprovingEstimator(20)
625-
train_sizes, train_scores, test_scores = learning_curve(
626-
estimator, X, y, cv=3, exploit_incremental_learning=True,
627-
train_sizes=np.linspace(0.1, 1.0, 10))
628-
assert_array_equal(train_sizes, np.linspace(2, 20, 10))
629-
assert_array_almost_equal(train_scores.mean(axis=1),
630-
np.linspace(1.9, 1.0, 10))
631-
assert_array_almost_equal(test_scores.mean(axis=1),
632-
np.linspace(0.1, 1.0, 10))
627+
for shuffle_train in [False, True]:
628+
train_sizes, train_scores, test_scores = learning_curve(
629+
estimator, X, y, cv=3, exploit_incremental_learning=True,
630+
train_sizes=np.linspace(0.1, 1.0, 10), shuffle=shuffle_train)
631+
assert_array_equal(train_sizes, np.linspace(2, 20, 10))
632+
assert_array_almost_equal(train_scores.mean(axis=1),
633+
np.linspace(1.9, 1.0, 10))
634+
assert_array_almost_equal(test_scores.mean(axis=1),
635+
np.linspace(0.1, 1.0, 10))
633636

634637

635638
def test_learning_curve_incremental_learning_unsupervised():
@@ -713,6 +716,37 @@ def test_learning_curve_with_boolean_indices():
713716
np.linspace(0.1, 1.0, 10))
714717

715718

719+
def test_learning_curve_with_shuffle():
720+
"""Following test case was designed this way to verify the code
721+
changes made in pull request: #7506."""
722+
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14], [15, 16],
723+
[17, 18], [19, 20], [7, 8], [9, 10], [11, 12], [13, 14],
724+
[15, 16], [17, 18]])
725+
y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
726+
groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
727+
estimator = PassiveAggressiveClassifier(shuffle=False)
728+
729+
cv = GroupKFold(n_splits=2)
730+
train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve(
731+
estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
732+
groups=groups, shuffle=True, random_state=2)
733+
assert_array_almost_equal(train_scores_batch.mean(axis=1),
734+
np.array([0.75, 0.3, 0.36111111]))
735+
assert_array_almost_equal(test_scores_batch.mean(axis=1),
736+
np.array([0.36111111, 0.25, 0.25]))
737+
assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv, n_jobs=1,
738+
train_sizes=np.linspace(0.3, 1.0, 3), groups=groups)
739+
740+
train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve(
741+
estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
742+
groups=groups, shuffle=True, random_state=2,
743+
exploit_incremental_learning=True)
744+
assert_array_almost_equal(train_scores_inc.mean(axis=1),
745+
train_scores_batch.mean(axis=1))
746+
assert_array_almost_equal(test_scores_inc.mean(axis=1),
747+
test_scores_batch.mean(axis=1))
748+
749+
716750
def test_validation_curve():
717751
X, y = make_classification(n_samples=2, n_features=1, n_informative=1,
718752
n_redundant=0, n_classes=2,

0 commit comments

Comments
 (0)