@@ -11,23 +11,88 @@ def __init__(self, fit_intercept=True):
         Notes
         -----
-        Given data matrix *X* and target vector *y*, the maximum-likelihood estimate
-        for the regression coefficients, :math:`\beta`, is:
+        Given data matrix **X** and target vector **y**, the maximum-likelihood
+        estimate for the regression coefficients, :math:`\beta`, is:

         .. math::

-            \hat{\beta} =
-                \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} \mathbf{X}^\top \mathbf{y}
+            \hat{\beta} = \Sigma^{-1} \mathbf{X}^\top \mathbf{y}
+
+        where :math:`\Sigma^{-1} = (\mathbf{X}^\top \mathbf{X})^{-1}`.

         Parameters
         ----------
         fit_intercept : bool
-            Whether to fit an additional intercept term in addition to the
-            model coefficients. Default is True.
+            Whether to fit an intercept term in addition to the model
+            coefficients. Default is True.
         """
         self.beta = None
+        self.sigma_inv = None
         self.fit_intercept = fit_intercept

+        self._is_fit = False
+
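(Editorial aside, not part of the diff.) The closed-form estimate in the Notes above is just the normal-equation solution and can be sanity-checked directly with NumPy; the sketch below uses a synthetic, well-conditioned design matrix and compares against `np.linalg.lstsq`:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 3)                    # data matrix
y = rng.randn(50, 1)                    # targets
Xd = np.c_[np.ones(X.shape[0]), X]      # design matrix with an intercept column

sigma_inv = np.linalg.inv(Xd.T @ Xd)    # \Sigma^{-1} = (X^T X)^{-1}
beta_hat = sigma_inv @ Xd.T @ y         # \hat{\beta} = \Sigma^{-1} X^T y

beta_ref, *_ = np.linalg.lstsq(Xd, y, rcond=None)
print(np.allclose(beta_hat, beta_ref))  # True up to numerical error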
+    def update(self, x, y):
+        r"""
+        Incrementally update the least-squares coefficients on a new example
+        via recursive least-squares (RLS) [1]_.
+
+        Notes
+        -----
+        The RLS algorithm [2]_ is used to efficiently update the regression
+        parameters as new examples become available. For a new example
+        :math:`(\mathbf{x}_{t+1}, \mathbf{y}_{t+1})`, the parameter updates are
+
+        .. math::
+
+            \beta_{t+1} = \left(
+                \mathbf{X}_{1:t}^\top \mathbf{X}_{1:t} +
+                    \mathbf{x}_{t+1}\mathbf{x}_{t+1}^\top \right)^{-1}
+                        \left( \mathbf{X}_{1:t}^\top \mathbf{Y}_{1:t} +
+                            \mathbf{x}_{t+1}^\top \mathbf{y}_{t+1} \right)
+
+        where :math:`\beta_{t+1}` are the updated regression coefficients, and
+        :math:`\mathbf{X}_{1:t}` and :math:`\mathbf{Y}_{1:t}` are the examples
+        and targets observed from timestep 1 to *t*.
+
+        To perform the above update efficiently, the RLS algorithm makes use of
+        the Sherman-Morrison formula [3]_ to avoid re-inverting the covariance
+        matrix on each new update.
+
+        References
+        ----------
+        .. [1] Gauss, C. F. (1821) _Theoria combinationis observationum
+           erroribus minimis obnoxiae_, Werke, 4. Göttingen
+        .. [2] https://en.wikipedia.org/wiki/Recursive_least_squares_filter
+        .. [3] https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
+
+        Parameters
+        ----------
+        x : :py:class:`ndarray <numpy.ndarray>` of shape `(1, M)`
+            A single example of rank `M`
+        y : :py:class:`ndarray <numpy.ndarray>` of shape `(1, K)`
+            A `K`-dimensional target vector for the current example
+        """
+        if not self._is_fit:
+            raise RuntimeError("You must call the `fit` method before calling `update`")
+
+        x, y = np.atleast_2d(x), np.atleast_2d(y)
+        beta, S_inv = self.beta, self.sigma_inv
+
+        X1, Y1 = x.shape[0], y.shape[0]
+        err_str = f"First dimension of x and y must be 1, but got {X1} and {Y1}"
+        assert X1 == Y1 == 1, err_str
+
+        # convert x to a design vector if we're fitting an intercept
+        if self.fit_intercept:
+            x = np.c_[1, x]
+
+        # update the inverse of the covariance matrix via Sherman-Morrison
+        S_inv -= (S_inv @ x.T @ x @ S_inv) / (1 + x @ S_inv @ x.T)
+
+        # update the model coefficients
+        beta += S_inv @ x.T @ (y - x @ beta)
+
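(Editorial aside, not part of the diff.) A quick way to check the Sherman-Morrison/RLS update above is to compare an incremental `update` against refitting from scratch on the augmented data. The sketch assumes the surrounding class is named `LinearRegression`; the two coefficient estimates should agree up to numerical error:

import numpy as np

rng = np.random.RandomState(0)
X, y = rng.randn(100, 3), rng.randn(100, 1)
x_new, y_new = rng.randn(1, 3), rng.randn(1, 1)

# fit on the first 100 examples, then fold in one more via RLS
lm = LinearRegression(fit_intercept=True)
lm.fit(X, y)
lm.update(x_new, y_new)

# refit from scratch on all 101 examples
lm_batch = LinearRegression(fit_intercept=True)
lm_batch.fit(np.vstack([X, x_new]), np.vstack([y, y_new]))

print(np.allclose(lm.beta, lm_batch.beta))  # True: the rank-1 update matches the batch solution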
     def fit(self, X, y):
         """
         Fit the regression coefficients via maximum likelihood.
@@ -44,8 +109,10 @@ def fit(self, X, y):
         if self.fit_intercept:
             X = np.c_[np.ones(X.shape[0]), X]

-        pseudo_inverse = np.linalg.inv(X.T @ X) @ X.T
-        self.beta = np.dot(pseudo_inverse, y)
+        self.sigma_inv = np.linalg.pinv(X.T @ X)
+        self.beta = np.atleast_2d(self.sigma_inv @ X.T @ y)
+
+        self._is_fit = True

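(Editorial aside, not part of the diff.) Switching from `inv` to `pinv` also keeps `fit` well-behaved when `X.T @ X` is singular, e.g. with linearly dependent columns; the pseudo-inverse then yields the minimum-norm least-squares solution, the same one `np.linalg.lstsq` returns. A rough sketch:

import numpy as np

rng = np.random.RandomState(1)
X = rng.randn(50, 2)
X = np.c_[X, X[:, :1]]                   # duplicate the first column -> rank-deficient
y = rng.randn(50, 1)
Xd = np.c_[np.ones(X.shape[0]), X]       # design matrix with intercept column

# X.T @ X is exactly singular here, so a plain inverse is undefined;
# pinv still gives the minimum-norm least-squares solution
beta_pinv = np.linalg.pinv(Xd.T @ Xd) @ Xd.T @ y
beta_lstsq, *_ = np.linalg.lstsq(Xd, y, rcond=None)
print(np.allclose(beta_pinv, beta_lstsq))  # True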
     def predict(self, X):
         """
@@ -166,22 +233,22 @@ def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
             \left(
                 \sum_{i=0}^N y_i \log(\hat{y}_i) +
                     (1-y_i) \log(1-\hat{y}_i)
-            \right) - R(\mathbf{b}, \gamma)
+                \right) - R(\mathbf{b}, \gamma)
             \right]
-
+
         where
-
+
         .. math::
-
+
             R(\mathbf{b}, \gamma) = \left\{
                 \begin{array}{lr}
                     \frac{\gamma}{2} ||\mathbf{b}||_2^2 & :\texttt{ penalty = 'l2'}\\
                     \gamma ||\mathbf{b}||_1 & :\texttt{ penalty = 'l1'}
                 \end{array}
             \right.
-
-        is a regularization penalty, :math:`\gamma` is a regularization weight,
-        `N` is the number of examples in **y**, and **b** is the vector of model
+
+        is a regularization penalty, :math:`\gamma` is a regularization weight,
+        `N` is the number of examples in **y**, and **b** is the vector of model
         coefficients.

         Parameters
@@ -251,10 +318,10 @@ def _NLL(self, X, y, y_pred):
             \right]
         """
         N, M = X.shape
-        beta, gamma = self.beta, self.gamma
+        beta, gamma = self.beta, self.gamma
         order = 2 if self.penalty == "l2" else 1
         norm_beta = np.linalg.norm(beta, ord=order)
-
+
         nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
         penalty = (gamma / 2) * norm_beta ** 2 if order == 2 else gamma * norm_beta
         return (penalty + nll) / N
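(Editorial aside, not part of the diff.) The return value above is the average penalized NLL over the batch; for an `l2` penalty it can be reproduced by hand from a handful of predicted probabilities. `beta` and `gamma` below are stand-ins for the fitted model attributes:

import numpy as np

y = np.array([1, 0, 1, 1])                       # binary targets
y_pred = np.array([0.9, 0.2, 0.7, 0.6])          # predicted probabilities
beta, gamma = np.array([0.5, -1.0, 0.25]), 0.1   # hypothetical coefficients and penalty weight

nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
l2_penalty = (gamma / 2) * np.linalg.norm(beta, ord=2) ** 2
print((nll + l2_penalty) / y.shape[0])           # average penalized NLL over N = 4 examples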