
Commit 6a2b4f7

[MRG] identity activation function for MLPs (scikit-learn#7267)
* ENH identity activation function for MLPs
* ENH code simplification in mlp
1 parent 0a0bf24 commit 6a2b4f7


3 files changed: +63, -42 lines


sklearn/neural_network/_base.py

Lines changed: 44 additions & 30 deletions
@@ -99,64 +99,78 @@ def softmax(X):
               'relu': relu, 'softmax': softmax}
 
 
-def inplace_logistic_derivative(Z):
-    """Compute the derivative of the logistic function given output value
-    from logistic function
+def inplace_identity_derivative(Z, delta):
+    """Apply the derivative of the identity function: do nothing.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the identity activation function during
+        the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
+    """
+    # Nothing to do
+
+
+def inplace_logistic_derivative(Z, delta):
+    """Apply the derivative of the logistic sigmoid function.
 
     It exploits the fact that the derivative is a simple function of the output
-    value from logistic function
+    value from logistic function.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from logistic function
+        The data which was output from the logistic activation function during
+        the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return Z * (1 - Z)
+    delta *= Z
+    delta *= (1 - Z)
 
 
-def inplace_tanh_derivative(Z):
-    """Compute the derivative of the hyperbolic tan function given output value
-    from hyperbolic tan
+def inplace_tanh_derivative(Z, delta):
+    """Apply the derivative of the hyperbolic tanh function.
 
     It exploits the fact that the derivative is a simple function of the output
-    value from hyperbolic tan
+    value from hyperbolic tangent.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from hyperbolic tan function
+        The data which was output from the hyperbolic tangent activation
+        function during the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return 1 - (Z ** 2)
+    delta *= (1 - Z ** 2)
 
 
-def inplace_relu_derivative(Z):
-    """Compute the derivative of the rectified linear unit function given output
-    value from relu
+def inplace_relu_derivative(Z, delta):
+    """Apply the derivative of the relu function.
+
+    It exploits the fact that the derivative is a simple function of the output
+    value from rectified linear units activation function.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from some relu
+        The data which was output from the rectified linear units activation
+        function during the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return (Z > 0).astype(Z.dtype)
+    delta[Z == 0] = 0
 
 
-DERIVATIVES = {'tanh': inplace_tanh_derivative,
+DERIVATIVES = {'identity': inplace_identity_derivative,
+               'tanh': inplace_tanh_derivative,
                'logistic': inplace_logistic_derivative,
                'relu': inplace_relu_derivative}
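
The derivative helpers above no longer return a new array: each one multiplies the backpropagated error `delta` by the derivative factor in place and returns nothing, with the identity entry deliberately a no-op. A minimal standalone sketch of that contract using plain NumPy (example arrays invented here, not taken from the commit):

import numpy as np

Z = np.array([[0.5, -0.25], [0.0, 0.75]])   # activations saved from the forward pass
delta = np.ones_like(Z)                     # incoming backpropagated error

# tanh case, mirroring inplace_tanh_derivative: modify delta in place, return nothing
delta *= (1 - Z ** 2)

# identity case, mirroring inplace_identity_derivative: leave delta untouched
print(delta)

Registering the no-op under 'identity' lets _backprop call DERIVATIVES[self.activation] unconditionally instead of special-casing the linear activation.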

sklearn/neural_network/multilayer_perceptron.py

Lines changed: 13 additions & 10 deletions
@@ -235,8 +235,7 @@ def _backprop(self, X, y, activations, deltas, coef_grads,
         # combinations of output activation and loss function:
         # sigmoid and binary cross entropy, softmax and categorical cross
         # entropy, and identity with squared loss
-        diff = y - activations[-1]
-        deltas[last] = -diff
+        deltas[last] = activations[-1] - y
 
         # Compute gradient for the last layer
         coef_grads, intercept_grads = self._compute_loss_grad(
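
The rewritten assignment produces the same values as the old `-diff` without allocating the intermediate array. For the matched output/loss pairs listed in the comment, the output-layer error is simply prediction minus target; a small NumPy finite-difference check of that sign convention for the squared-loss case (illustrative data, not part of the commit):

import numpy as np

rng = np.random.RandomState(0)
y = rng.randn(5, 2)        # targets
y_hat = rng.randn(5, 2)    # identity-output predictions, i.e. activations[-1]

delta = y_hat - y          # new formulation: activations[-1] - y

# numerically differentiate 0.5 * sum((y_hat - y) ** 2) w.r.t. y_hat[0, 0]
loss = lambda pred: 0.5 * np.sum((pred - y) ** 2)
unit = np.zeros_like(y_hat)
unit[0, 0] = 1.0
eps = 1e-6
numeric = (loss(y_hat + eps * unit) - loss(y_hat - eps * unit)) / (2 * eps)
print(np.allclose(numeric, delta[0, 0]))   # True
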
@@ -245,8 +244,8 @@ def _backprop(self, X, y, activations, deltas, coef_grads,
         # Iterate over the hidden layers
         for i in range(self.n_layers_ - 2, 0, -1):
             deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
-            derivative = DERIVATIVES[self.activation]
-            deltas[i - 1] *= derivative(activations[i])
+            inplace_derivative = DERIVATIVES[self.activation]
+            inplace_derivative(activations[i], deltas[i - 1])
 
             coef_grads, intercept_grads = self._compute_loss_grad(
                 i - 1, n_samples, activations, deltas, coef_grads,
@@ -302,9 +301,7 @@ def _init_coef(self, fan_in, fan_out, rng):
             # Use the initialization method recommended by
             # Glorot et al.
             init_bound = np.sqrt(2. / (fan_in + fan_out))
-        elif self.activation == 'tanh':
-            init_bound = np.sqrt(6. / (fan_in + fan_out))
-        elif self.activation == 'relu':
+        elif self.activation in ('identity', 'tanh', 'relu'):
             init_bound = np.sqrt(6. / (fan_in + fan_out))
         else:
             # this was caught earlier, just to make sure
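
The branch collapse only changes which activations share the wider bound: 'identity' now uses sqrt(6. / (fan_in + fan_out)) like 'tanh' and 'relu', while 'logistic' keeps sqrt(2. / (fan_in + fan_out)). A rough sketch of the bound selection and a uniform draw within it (hypothetical helper name, not the real _init_coef signature):

import numpy as np

def glorot_bound(activation, fan_in, fan_out):
    # mirrors the if/elif chain above
    if activation == 'logistic':
        return np.sqrt(2. / (fan_in + fan_out))
    if activation in ('identity', 'tanh', 'relu'):
        return np.sqrt(6. / (fan_in + fan_out))
    raise ValueError("unsupported activation: %s" % activation)

rng = np.random.RandomState(0)
bound = glorot_bound('identity', fan_in=64, fan_out=32)
coef_init = rng.uniform(-bound, bound, (64, 32))   # weights start small and symmetric
print(bound, coef_init.shape)
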
@@ -414,7 +411,7 @@ def _validate_hyperparameters(self):
             raise ValueError("epsilon must be > 0, got %s." % self.epsilon)
 
         # raise ValueError if not registered
-        supported_activations = ['logistic', 'tanh', 'relu']
+        supported_activations = ('identity', 'logistic', 'tanh', 'relu')
         if self.activation not in supported_activations:
             raise ValueError("The activation '%s' is not supported. Supported "
                              "activations are %s." % (self.activation,
@@ -688,9 +685,12 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
         The ith element represents the number of neurons in the ith
         hidden layer.
 
-    activation : {'logistic', 'tanh', 'relu'}, default 'relu'
+    activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
         Activation function for the hidden layer.
 
+        - 'identity', no-op activation, useful to implement linear bottleneck,
+          returns f(x) = x
+
         - 'logistic', the logistic sigmoid function,
           returns f(x) = 1 / (1 + exp(-x)).
@@ -1042,9 +1042,12 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
         The ith element represents the number of neurons in the ith
         hidden layer.
 
-    activation : {'logistic', 'tanh', 'relu'}, default 'relu'
+    activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
         Activation function for the hidden layer.
 
+        - 'identity', no-op activation, useful to implement linear bottleneck,
+          returns f(x) = x
+
         - 'logistic', the logistic sigmoid function,
           returns f(x) = 1 / (1 + exp(-x)).
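
With both docstrings updated, the new value can be passed like any other activation. A small usage sketch of the linear-bottleneck case the docstring mentions (synthetic data and parameter values chosen only for illustration):

from sklearn.datasets import make_regression
from sklearn.neural_network import MLPRegressor

X, y = make_regression(n_samples=200, n_features=20, random_state=0)

# 'identity' turns the hidden layer into a linear bottleneck of width 5
mlp = MLPRegressor(hidden_layer_sizes=(5,), activation='identity',
                   random_state=0, max_iter=500)
mlp.fit(X, y)
print(mlp.score(X, y))

Because every layer is then affine, the model reduces to a low-rank linear map, which is why the test change below accepts a lower score for 'identity' than for the non-linear activations.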

sklearn/neural_network/tests/test_mlp.py

Lines changed: 6 additions & 2 deletions
@@ -27,7 +27,7 @@
 
 np.seterr(all='warn')
 
-ACTIVATION_TYPES = ["logistic", "tanh", "relu"]
+ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]
 
 digits_dataset_multi = load_digits(n_class=3)
 
@@ -254,7 +254,11 @@ def test_lbfgs_regression():
                            max_iter=150, shuffle=True, random_state=1,
                            activation=activation)
         mlp.fit(X, y)
-        assert_greater(mlp.score(X, y), 0.95)
+        if activation == 'identity':
+            assert_greater(mlp.score(X, y), 0.84)
+        else:
+            # Non linear models perform much better than linear bottleneck:
+            assert_greater(mlp.score(X, y), 0.95)
 
 
 def test_learning_rate_warmstart():
