Skip to content

Commit 6f74a7c

Browse files
glemaitre authored and jmschrei committed
[MRG+1] ENH add memory to make_pipeline (scikit-learn#8831)
[MRG+2] ENH add memory to make_pipeline
1 parent 0bee058 commit 6f74a7c

File tree

2 files changed

+39
-6
lines changed

2 files changed

+39
-6
lines changed

sklearn/pipeline.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,10 @@
1111

1212
from collections import defaultdict
1313

14-
from abc import ABCMeta, abstractmethod
15-
1614
import numpy as np
1715
from scipy import sparse
1816

19-
from .base import clone, BaseEstimator, TransformerMixin
17+
from .base import clone, TransformerMixin
2018
from .externals.joblib import Parallel, delayed, Memory
2119
from .externals import six
2220
from .utils import tosequence
@@ -35,7 +33,7 @@ class Pipeline(_BaseComposition):
3533
Intermediate steps of the pipeline must be 'transforms', that is, they
3634
must implement fit and transform methods.
3735
The final estimator only needs to implement fit.
38-
The transformers in the pipeline can be cached using ```memory`` argument.
36+
The transformers in the pipeline can be cached using ``memory`` argument.
3937
4038
The purpose of the pipeline is to assemble several steps that can be
4139
cross-validated together while setting different parameters.
@@ -527,13 +525,27 @@ def _name_estimators(estimators):
527525
return list(zip(names, estimators))
528526

529527

530-
def make_pipeline(*steps):
528+
def make_pipeline(*steps, **kwargs):
531529
"""Construct a Pipeline from the given estimators.
532530
533531
This is a shorthand for the Pipeline constructor; it does not require, and
534532
does not permit, naming the estimators. Instead, their names will be set
535533
to the lowercase of their types automatically.
536534
535+
Parameters
536+
----------
537+
*steps : list of estimators,
538+
539+
memory : Instance of joblib.Memory or string, optional (default=None)
540+
Used to cache the fitted transformers of the pipeline. By default,
541+
no caching is performed. If a string is given, it is the path to
542+
the caching directory. Enabling caching triggers a clone of
543+
the transformers before fitting. Therefore, the transformer
544+
instance given to the pipeline cannot be inspected
545+
directly. Use the attribute ``named_steps`` or ``steps`` to
546+
inspect estimators within the pipeline. Caching the
547+
transformers is advantageous when fitting is time consuming.
548+
537549
Examples
538550
--------
539551
>>> from sklearn.naive_bayes import GaussianNB
@@ -549,7 +561,11 @@ def make_pipeline(*steps):
549561
-------
550562
p : Pipeline
551563
"""
552-
return Pipeline(_name_estimators(steps))
564+
memory = kwargs.pop('memory', None)
565+
if kwargs:
566+
raise TypeError('Unknown keyword arguments: "{}"'
567+
.format(list(kwargs.keys())[0]))
568+
return Pipeline(_name_estimators(steps), memory=memory)
553569

554570

555571
def _fit_one_transformer(transformer, X, y):

sklearn/tests/test_pipeline.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,12 @@ def test_make_pipeline():
637637
assert_equal(pipe.steps[1][0], "transf-2")
638638
assert_equal(pipe.steps[2][0], "fitparamt")
639639

640+
assert_raise_message(
641+
TypeError,
642+
'Unknown keyword arguments: "random_parameter"',
643+
make_pipeline, t1, t2, random_parameter='rnd'
644+
)
645+
640646

641647
def test_feature_union_weights():
642648
# test feature union with transformer weights
@@ -911,3 +917,14 @@ def test_pipeline_memory():
911917
assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
912918
finally:
913919
shutil.rmtree(cachedir)
920+
921+
922+
def test_make_pipeline_memory():
923+
cachedir = mkdtemp()
924+
memory = Memory(cachedir=cachedir)
925+
pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
926+
assert_true(pipeline.memory is memory)
927+
pipeline = make_pipeline(DummyTransf(), SVC())
928+
assert_true(pipeline.memory is None)
929+
930+
shutil.rmtree(cachedir)

0 commit comments

Comments
 (0)