
Commit 6a2b4f7

[MRG] identity activation function for MLPs (scikit-learn#7267)
* ENH identity activation function for MLPs
* ENH code simplification in mlp
1 parent 0a0bf24 commit 6a2b4f7


3 files changed: +63, -42 lines


sklearn/neural_network/_base.py

Lines changed: 44 additions & 30 deletions
@@ -99,64 +99,78 @@ def softmax(X):
               'relu': relu, 'softmax': softmax}
 
 
-def inplace_logistic_derivative(Z):
-    """Compute the derivative of the logistic function given output value
-    from logistic function
+def inplace_identity_derivative(Z, delta):
+    """Apply the derivative of the identity function: do nothing.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the identity activation function during
+        the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
+    """
+    # Nothing to do
+
+
+def inplace_logistic_derivative(Z, delta):
+    """Apply the derivative of the logistic sigmoid function.
 
     It exploits the fact that the derivative is a simple function of the output
-    value from logistic function
+    value from logistic function.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from logistic function
+        The data which was output from the logistic activation function during
+        the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return Z * (1 - Z)
+    delta *= Z
+    delta *= (1 - Z)
 
 
-def inplace_tanh_derivative(Z):
-    """Compute the derivative of the hyperbolic tan function given output value
-    from hyperbolic tan
+def inplace_tanh_derivative(Z, delta):
+    """Apply the derivative of the hyperbolic tanh function.
 
     It exploits the fact that the derivative is a simple function of the output
-    value from hyperbolic tan
+    value from hyperbolic tangent.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from hyperbolic tan function
+        The data which was output from the hyperbolic tangent activation
+        function during the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return 1 - (Z ** 2)
+    delta *= (1 - Z ** 2)
 
 
-def inplace_relu_derivative(Z):
-    """Compute the derivative of the rectified linear unit function given output
-    value from relu
+def inplace_relu_derivative(Z, delta):
+    """Apply the derivative of the relu function.
+
+    It exploits the fact that the derivative is a simple function of the output
+    value from rectified linear units activation function.
 
     Parameters
     ----------
     Z : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The input data which is output from some relu
+        The data which was output from the rectified linear units activation
+        function during the forward pass.
 
-    Returns
-    -------
-    Z_new : {array-like, sparse matrix}, shape (n_samples, n_features)
-        The transformed data.
+    delta : {array-like}, shape (n_samples, n_features)
+        The backpropagated error signal to be modified inplace.
     """
-    return (Z > 0).astype(Z.dtype)
+    delta[Z == 0] = 0
 
 
-DERIVATIVES = {'tanh': inplace_tanh_derivative,
+DERIVATIVES = {'identity': inplace_identity_derivative,
+               'tanh': inplace_tanh_derivative,
                'logistic': inplace_logistic_derivative,
                'relu': inplace_relu_derivative}
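
The derivative helpers above no longer return a new array: each one multiplies the backpropagated error `delta` by the derivative factor in place and returns nothing, with the identity entry deliberately a no-op. A minimal standalone sketch of that contract using plain NumPy (example arrays invented here, not taken from the commit):

import numpy as np

Z = np.array([[0.5, -0.25], [0.0, 0.75]])   # activations saved from the forward pass
delta = np.ones_like(Z)                     # incoming backpropagated error

# tanh case, mirroring inplace_tanh_derivative: modify delta in place, return nothing
delta *= (1 - Z ** 2)

# identity case, mirroring inplace_identity_derivative: leave delta untouched
print(delta)

Registering the no-op under 'identity' lets _backprop call DERIVATIVES[self.activation] unconditionally instead of special-casing the linear activation.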

sklearn/neural_network/multilayer_perceptron.py

Lines changed: 13 additions & 10 deletions
@@ -235,8 +235,7 @@ def _backprop(self, X, y, activations, deltas, coef_grads,
         # combinations of output activation and loss function:
         # sigmoid and binary cross entropy, softmax and categorical cross
         # entropy, and identity with squared loss
-        diff = y - activations[-1]
-        deltas[last] = -diff
+        deltas[last] = activations[-1] - y
 
         # Compute gradient for the last layer
         coef_grads, intercept_grads = self._compute_loss_grad(
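
The rewritten assignment produces the same values as the old `-diff` without allocating the intermediate array. For the matched output/loss pairs listed in the comment, the output-layer error is simply prediction minus target; a small NumPy finite-difference check of that sign convention for the squared-loss case (illustrative data, not part of the commit):

import numpy as np

rng = np.random.RandomState(0)
y = rng.randn(5, 2)        # targets
y_hat = rng.randn(5, 2)    # identity-output predictions, i.e. activations[-1]

delta = y_hat - y          # new formulation: activations[-1] - y

# numerically differentiate 0.5 * sum((y_hat - y) ** 2) w.r.t. y_hat[0, 0]
loss = lambda pred: 0.5 * np.sum((pred - y) ** 2)
unit = np.zeros_like(y_hat)
unit[0, 0] = 1.0
eps = 1e-6
numeric = (loss(y_hat + eps * unit) - loss(y_hat - eps * unit)) / (2 * eps)
print(np.allclose(numeric, delta[0, 0]))   # True
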
@@ -245,8 +244,8 @@ def _backprop(self, X, y, activations, deltas, coef_grads,
         # Iterate over the hidden layers
         for i in range(self.n_layers_ - 2, 0, -1):
             deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
-            derivative = DERIVATIVES[self.activation]
-            deltas[i - 1] *= derivative(activations[i])
+            inplace_derivative = DERIVATIVES[self.activation]
+            inplace_derivative(activations[i], deltas[i - 1])
 
             coef_grads, intercept_grads = self._compute_loss_grad(
                 i - 1, n_samples, activations, deltas, coef_grads,
@@ -302,9 +301,7 @@ def _init_coef(self, fan_in, fan_out, rng):
             # Use the initialization method recommended by
             # Glorot et al.
             init_bound = np.sqrt(2. / (fan_in + fan_out))
-        elif self.activation == 'tanh':
-            init_bound = np.sqrt(6. / (fan_in + fan_out))
-        elif self.activation == 'relu':
+        elif self.activation in ('identity', 'tanh', 'relu'):
             init_bound = np.sqrt(6. / (fan_in + fan_out))
         else:
             # this was caught earlier, just to make sure
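
The branch collapse only changes which activations share the wider bound: 'identity' now uses sqrt(6. / (fan_in + fan_out)) like 'tanh' and 'relu', while 'logistic' keeps sqrt(2. / (fan_in + fan_out)). A rough sketch of the bound selection and a uniform draw within it (hypothetical helper name, not the real _init_coef signature):

import numpy as np

def glorot_bound(activation, fan_in, fan_out):
    # mirrors the if/elif chain above
    if activation == 'logistic':
        return np.sqrt(2. / (fan_in + fan_out))
    if activation in ('identity', 'tanh', 'relu'):
        return np.sqrt(6. / (fan_in + fan_out))
    raise ValueError("unsupported activation: %s" % activation)

rng = np.random.RandomState(0)
bound = glorot_bound('identity', fan_in=64, fan_out=32)
coef_init = rng.uniform(-bound, bound, (64, 32))   # weights start small and symmetric
print(bound, coef_init.shape)
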
@@ -414,7 +411,7 @@ def _validate_hyperparameters(self):
             raise ValueError("epsilon must be > 0, got %s." % self.epsilon)
 
         # raise ValueError if not registered
-        supported_activations = ['logistic', 'tanh', 'relu']
+        supported_activations = ('identity', 'logistic', 'tanh', 'relu')
         if self.activation not in supported_activations:
             raise ValueError("The activation '%s' is not supported. Supported "
                              "activations are %s." % (self.activation,
@@ -688,9 +685,12 @@ class MLPClassifier(BaseMultilayerPerceptron, ClassifierMixin):
         The ith element represents the number of neurons in the ith
         hidden layer.
 
-    activation : {'logistic', 'tanh', 'relu'}, default 'relu'
+    activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
         Activation function for the hidden layer.
 
+        - 'identity', no-op activation, useful to implement linear bottleneck,
+          returns f(x) = x
+
         - 'logistic', the logistic sigmoid function,
           returns f(x) = 1 / (1 + exp(-x)).
@@ -1042,9 +1042,12 @@ class MLPRegressor(BaseMultilayerPerceptron, RegressorMixin):
         The ith element represents the number of neurons in the ith
         hidden layer.
 
-    activation : {'logistic', 'tanh', 'relu'}, default 'relu'
+    activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
         Activation function for the hidden layer.
 
+        - 'identity', no-op activation, useful to implement linear bottleneck,
+          returns f(x) = x
+
         - 'logistic', the logistic sigmoid function,
           returns f(x) = 1 / (1 + exp(-x)).
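
With both docstrings updated, the new value can be passed like any other activation. A small usage sketch of the linear-bottleneck case the docstring mentions (synthetic data and parameter values chosen only for illustration):

from sklearn.datasets import make_regression
from sklearn.neural_network import MLPRegressor

X, y = make_regression(n_samples=200, n_features=20, random_state=0)

# 'identity' turns the hidden layer into a linear bottleneck of width 5
mlp = MLPRegressor(hidden_layer_sizes=(5,), activation='identity',
                   random_state=0, max_iter=500)
mlp.fit(X, y)
print(mlp.score(X, y))

Because every layer is then affine, the model reduces to a low-rank linear map, which is why the test change below accepts a lower score for 'identity' than for the non-linear activations.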

sklearn/neural_network/tests/test_mlp.py

Lines changed: 6 additions & 2 deletions
@@ -27,7 +27,7 @@
 
 np.seterr(all='warn')
 
-ACTIVATION_TYPES = ["logistic", "tanh", "relu"]
+ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]
 
 digits_dataset_multi = load_digits(n_class=3)
 
@@ -254,7 +254,11 @@ def test_lbfgs_regression():
                            max_iter=150, shuffle=True, random_state=1,
                            activation=activation)
         mlp.fit(X, y)
-        assert_greater(mlp.score(X, y), 0.95)
+        if activation == 'identity':
+            assert_greater(mlp.score(X, y), 0.84)
+        else:
+            # Non linear models perform much better than linear bottleneck:
+            assert_greater(mlp.score(X, y), 0.95)
 
 
 def test_learning_rate_warmstart():
