8
8
from sklearn .exceptions import NotFittedError
9
9
from sklearn .utils import check_array
10
10
from sklearn .metrics import r2_score , accuracy_score
11
+ from sklearn .preprocessing import LabelEncoder , OneHotEncoder
12
+ from scipy .sparse import vstack , issparse
11
13
12
14
from adapt .base import BaseAdaptEstimator , make_insert_doc
13
15
from adapt .utils import check_arrays , check_estimator , set_random_seed
14
16
15
17
EPS = np .finfo (float ).eps
16
18
17
- def _get_median_predict (X , predictions , weights ):
19
+ def _get_median_predict (predictions , weights ):
18
20
sorted_idx = np .argsort (predictions , axis = - 1 )
19
21
# Find index of median prediction for each sample
20
22
weight_cdf = np .cumsum (weights [sorted_idx ], axis = - 1 )
21
23
median_or_above = weight_cdf >= 0.5 * weight_cdf [..., - 1 ][..., np .newaxis ]
22
24
median_idx = median_or_above .argmax (axis = - 1 )
23
25
new_predictions = None
24
26
for i in range (median_idx .shape [1 ]):
25
- median_estimators = sorted_idx [np .arange (len (X )), i , median_idx [:, i ]]
27
+ median_estimators = sorted_idx [np .arange (len (predictions )), i , median_idx [:, i ]]
26
28
if new_predictions is None :
27
- new_predictions = predictions [np .arange (len (X )), i , median_estimators ].reshape (- 1 ,1 )
29
+ new_predictions = predictions [np .arange (len (predictions )), i , median_estimators ].reshape (- 1 ,1 )
28
30
else :
29
31
new_predictions = np .concatenate ((
30
32
new_predictions ,
31
- predictions [np .arange (len (X )), i , median_estimators ].reshape (- 1 ,1 )
33
+ predictions [np .arange (len (predictions )), i , median_estimators ].reshape (- 1 ,1 )
32
34
), axis = 1 )
33
35
return new_predictions
34
36
@@ -229,12 +231,17 @@ def fit(self, X, y, Xt=None, yt=None,
229
231
set_random_seed (self .random_state )
230
232
tf .compat .v1 .logging .set_verbosity (tf .compat .v1 .logging .ERROR )
231
233
232
- Xs , ys = check_arrays (X , y )
234
+ Xs , ys = check_arrays (X , y , accept_sparse = True )
233
235
Xt , yt = self ._get_target_data (Xt , yt )
234
- Xt , yt = check_arrays (Xt , yt )
236
+ Xt , yt = check_arrays (Xt , yt , accept_sparse = True )
235
237
236
- n_s = len (Xs )
237
- n_t = len (Xt )
238
+ if not isinstance (self , TrAdaBoostR2 ) and isinstance (self .estimator , BaseEstimator ):
239
+ self .label_encoder_ = LabelEncoder ()
240
+ ys = self .label_encoder_ .fit_transform (ys )
241
+ yt = self .label_encoder_ .transform (yt )
242
+
243
+ n_s = Xs .shape [0 ]
244
+ n_t = Xt .shape [0 ]
238
245
239
246
if sample_weight_src is None :
240
247
sample_weight_src = np .ones (n_s ) / (n_s + n_t )
@@ -284,8 +291,11 @@ def fit(self, X, y, Xt=None, yt=None,
284
291
def _boost (self , iboost , Xs , ys , Xt , yt ,
285
292
sample_weight_src , sample_weight_tgt ,
286
293
** fit_params ):
287
-
288
- X = np .concatenate ((Xs , Xt ))
294
+
295
+ if issparse (Xs ):
296
+ X = vstack ((Xs , Xt ))
297
+ else :
298
+ X = np .concatenate ((Xs , Xt ))
289
299
y = np .concatenate ((ys , yt ))
290
300
sample_weight = np .concatenate ((sample_weight_src ,
291
301
sample_weight_tgt ))
@@ -297,39 +307,72 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
297
307
warm_start = False ,
298
308
** fit_params )
299
309
300
- ys_pred = estimator .predict (Xs )
301
- yt_pred = estimator .predict (Xt )
310
+ if hasattr (estimator , "predict_proba" ):
311
+ ys_pred = estimator .predict_proba (Xs )
312
+ yt_pred = estimator .predict_proba (Xt )
313
+ elif hasattr (estimator , "_predict_proba_lr" ):
314
+ ys_pred = estimator ._predict_proba_lr (Xs )
315
+ yt_pred = estimator ._predict_proba_lr (Xt )
316
+ else :
317
+ ys_pred = estimator .predict (Xs )
318
+ yt_pred = estimator .predict (Xt )
302
319
303
- if ys_pred . ndim == 1 or ys .ndim == 1 :
320
+ if ys .ndim == 1 :
304
321
ys = ys .reshape (- 1 , 1 )
305
322
yt = yt .reshape (- 1 , 1 )
323
+
324
+ if ys_pred .ndim == 1 :
306
325
ys_pred = ys_pred .reshape (- 1 , 1 )
307
326
yt_pred = yt_pred .reshape (- 1 , 1 )
308
327
309
- if isinstance (self , TrAdaBoostR2 ):
328
+ if not isinstance (self , TrAdaBoostR2 ):
329
+ if isinstance (estimator , BaseEstimator ):
330
+ ohe = OneHotEncoder (sparse = False )
331
+ ohe .fit (y .reshape (- 1 , 1 ))
332
+ ys = ohe .transform (ys )
333
+ yt = ohe .transform (yt )
334
+
335
+ if ys_pred .shape [1 ] == 1 :
336
+ ys_pred = ohe .transform (ys_pred )
337
+ yt_pred = ohe .transform (yt_pred )
338
+
339
+ error_vect_src = np .abs (ys_pred - ys ).sum (tuple (range (1 , ys .ndim ))) / 2.
340
+ error_vect_tgt = np .abs (yt_pred - yt ).sum (tuple (range (1 , yt .ndim ))) / 2.
341
+
342
+ else :
343
+ assert np .all (ys_pred .shape == ys .shape )
344
+ error_vect_src = np .abs (ys_pred - ys ).sum (tuple (range (1 , ys .ndim )))
345
+ error_vect_tgt = np .abs (yt_pred - yt ).sum (tuple (range (1 , yt .ndim )))
346
+
347
+ if ys .ndim != 1 :
348
+ error_vect_src /= 2.
349
+ error_vect_tgt /= 2.
350
+
351
+ else :
310
352
error_vect_src = np .abs (ys_pred - ys ).mean (tuple (range (1 , ys .ndim )))
311
353
error_vect_tgt = np .abs (yt_pred - yt ).mean (tuple (range (1 , yt .ndim )))
312
- error_vect = np .concatenate ((error_vect_src , error_vect_tgt ))
313
354
314
- error_max = error_vect .max () + EPS
315
- if error_max != 0 :
316
- error_vect /= error_max
355
+ error_max = max (error_vect_src .max (), error_vect_tgt .max ())+ EPS
356
+ if error_max > 0 :
317
357
error_vect_src /= error_max
318
358
error_vect_tgt /= error_max
319
- else :
320
- if isinstance (estimator , BaseEstimator ):
321
- error_vect_src = (ys_pred != ys ).astype (float ).ravel ()
322
- error_vect_tgt = (yt_pred != yt ).astype (float ).ravel ()
323
- error_vect = np .concatenate ((error_vect_src , error_vect_tgt ))
324
- else :
325
- if ys .shape [1 ] == 1 :
326
- error_vect_src = (np .abs (ys_pred - ys ) > 0.5 ).astype (float ).ravel ()
327
- error_vect_tgt = (np .abs (yt_pred - yt ) > 0.5 ).astype (float ).ravel ()
328
- else :
329
- error_vect_src = (ys_pred .argmax (1 ) != ys .argmax (1 )).astype (float ).ravel ()
330
- error_vect_tgt = (yt_pred .argmax (1 ) != yt .argmax (1 )).astype (float ).ravel ()
359
+
360
+ # else:
361
+ # if isinstance(estimator, BaseEstimator):
362
+ # error_vect_src = (ys_pred != ys).astype(float).ravel()
363
+ # error_vect_tgt = (yt_pred != yt).astype(float).ravel()
364
+ # error_vect = np.concatenate((error_vect_src, error_vect_tgt))
365
+ # else:
366
+ # if ys.shape[1] == 1:
367
+ # error_vect_src = (np.abs(ys_pred - ys) > 0.5).astype(float).ravel()
368
+ # error_vect_tgt = (np.abs(yt_pred - yt) > 0.5).astype(float).ravel()
369
+ # else:
370
+ # error_vect_src = (ys_pred.argmax(1) != ys.argmax(1)).astype(float).ravel()
371
+ # error_vect_tgt = (yt_pred.argmax(1) != yt.argmax(1)).astype(float).ravel()
331
372
332
373
error_vect = np .concatenate ((error_vect_src , error_vect_tgt ))
374
+
375
+ assert sample_weight .ndim == error_vect .ndim
333
376
334
377
if isinstance (self , _AdaBoostR2 ):
335
378
estimator_error = (sample_weight * error_vect ).sum ()
@@ -341,10 +384,16 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
341
384
# if estimator_error > 0.49:
342
385
# estimator_error = 0.49
343
386
387
+ self .estimators_ .append (estimator )
388
+ self .estimator_errors_ .append (estimator_error )
389
+
390
+ if estimator_error <= 0. :
391
+ return None , None
392
+
344
393
beta_t = estimator_error / (2. - estimator_error )
345
394
346
395
beta_s = 1. / (1. + np .sqrt (
347
- 2. * np .log (len ( Xs ) ) / self .n_estimators
396
+ 2. * np .log (Xs . shape [ 0 ] ) / self .n_estimators
348
397
))
349
398
350
399
if not iboost == self .n_estimators - 1 :
@@ -362,9 +411,6 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
362
411
# Target updating weights
363
412
sample_weight_tgt *= np .power (
364
413
beta_t , - self .lr * error_vect_tgt )
365
-
366
- self .estimators_ .append (estimator )
367
- self .estimator_errors_ .append (estimator_error )
368
414
369
415
return sample_weight_src , sample_weight_tgt
370
416
@@ -383,14 +429,21 @@ def predict(self, X):
383
429
y_pred : array
384
430
Vote results.
385
431
"""
386
- X = check_array (X )
432
+ X = check_array (X , ensure_2d = True , allow_nd = True , accept_sparse = True )
387
433
N = len (self .estimators_ )
388
434
weights = np .array (self .estimator_weights_ )
389
435
weights = weights [int (N / 2 ):]
390
436
predictions = []
391
437
for est in self .estimators_ [int (N / 2 ):]:
392
438
if isinstance (est , BaseEstimator ):
393
- y_pred = est .predict_proba (X )
439
+ if hasattr (est , "predict_proba" ):
440
+ y_pred = est .predict_proba (X )
441
+ elif hasattr (est , "_predict_proba_lr" ):
442
+ y_pred = est ._predict_proba_lr (X )
443
+ else :
444
+ labels = est .predict (X )
445
+ y_pred = np .zeros ((len (labels ), int (max (labels ))+ 1 ))
446
+ y_pred [np .arange (len (labels )), labels ] = 1.
394
447
else :
395
448
y_pred = est .predict (X )
396
449
if y_pred .ndim == 1 :
@@ -401,7 +454,10 @@ def predict(self, X):
401
454
predictions .append (y_pred )
402
455
predictions = np .stack (predictions , - 1 )
403
456
weighted_vote = predictions .dot (weights ).argmax (1 )
404
- return weighted_vote
457
+ if hasattr (self , "label_encoder_" ):
458
+ return self .label_encoder_ .inverse_transform (weighted_vote )
459
+ else :
460
+ return weighted_vote
405
461
406
462
407
463
def predict_weights (self , domain = "src" ):
@@ -454,7 +510,7 @@ def score(self, X, y):
454
510
score : float
455
511
estimator score.
456
512
"""
457
- X , y = check_arrays (X , y )
513
+ X , y = check_arrays (X , y , accept_sparse = True )
458
514
yp = self .predict (X )
459
515
if isinstance (self , TrAdaBoostR2 ):
460
516
score = r2_score (y , yp )
@@ -587,7 +643,7 @@ def predict(self, X):
587
643
y_pred : array
588
644
Median results.
589
645
"""
590
- X = check_array (X )
646
+ X = check_array (X , ensure_2d = True , allow_nd = True , accept_sparse = True )
591
647
N = len (self .estimators_ )
592
648
weights = np .array (self .estimator_weights_ )
593
649
weights = weights [int (N / 2 ):]
@@ -598,7 +654,7 @@ def predict(self, X):
598
654
y_pred = y_pred .reshape (- 1 , 1 )
599
655
predictions .append (y_pred )
600
656
predictions = np .stack (predictions , - 1 )
601
- return _get_median_predict (X , predictions , weights )
657
+ return _get_median_predict (predictions , weights )
602
658
603
659
604
660
class _AdaBoostR2 (TrAdaBoostR2 ):
@@ -770,12 +826,12 @@ def fit(self, X, y, Xt=None, yt=None,
770
826
set_random_seed (self .random_state )
771
827
tf .compat .v1 .logging .set_verbosity (tf .compat .v1 .logging .ERROR )
772
828
773
- Xs , ys = check_arrays (X , y )
829
+ Xs , ys = check_arrays (X , y , accept_sparse = True )
774
830
Xt , yt = self ._get_target_data (Xt , yt )
775
- Xt , yt = check_arrays (Xt , yt )
831
+ Xt , yt = check_arrays (Xt , yt , accept_sparse = True )
776
832
777
- n_s = len ( Xs )
778
- n_t = len ( Xt )
833
+ n_s = Xs . shape [ 0 ]
834
+ n_t = Xt . shape [ 0 ]
779
835
780
836
if sample_weight_src is None :
781
837
sample_weight_src = np .ones (n_s ) / (n_s + n_t )
@@ -786,9 +842,6 @@ def fit(self, X, y, Xt=None, yt=None,
786
842
sample_weight_tgt .sum ())
787
843
sample_weight_src = sample_weight_src / sum_weights
788
844
sample_weight_tgt = sample_weight_tgt / sum_weights
789
-
790
- X = np .concatenate ((Xs , Xt ))
791
- y = np .concatenate ((ys , yt ))
792
845
793
846
self .sample_weights_src_ = []
794
847
self .sample_weights_tgt_ = []
@@ -901,13 +954,13 @@ def func(x):
901
954
def _cross_val_score (self , Xs , ys , Xt , yt ,
902
955
sample_weight_src , sample_weight_tgt ,
903
956
** fit_params ):
904
- if len ( Xt ) >= self .cv :
957
+ if Xt . shape [ 0 ] >= self .cv :
905
958
cv = self .cv
906
959
else :
907
- cv = len ( Xt )
960
+ cv = Xt . shape [ 0 ]
908
961
909
- tgt_index = np .arange (len ( Xt ) )
910
- split = int (len ( Xt ) / cv )
962
+ tgt_index = np .arange (Xt . shape [ 0 ] )
963
+ split = int (Xt . shape [ 0 ] / cv )
911
964
scores = []
912
965
for i in range (cv ):
913
966
if i == cv - 1 :
@@ -916,7 +969,11 @@ def _cross_val_score(self, Xs, ys, Xt, yt,
916
969
test_index = tgt_index [i * split : (i + 1 ) * split ]
917
970
train_index = list (set (tgt_index ) - set (test_index ))
918
971
919
- X = np .concatenate ((Xs , Xt [train_index ]))
972
+
973
+ if issparse (Xs ):
974
+ X = vstack ((Xs , Xt [train_index ]))
975
+ else :
976
+ X = np .concatenate ((Xs , Xt [train_index ]))
920
977
y = np .concatenate ((ys , yt [train_index ]))
921
978
sample_weight = np .concatenate ((sample_weight_src ,
922
979
sample_weight_tgt [train_index ]))
@@ -956,7 +1013,7 @@ def predict(self, X):
956
1013
y_pred : array
957
1014
Best estimator predictions.
958
1015
"""
959
- X = check_array (X )
1016
+ X = check_array (X , ensure_2d = True , allow_nd = True , accept_sparse = True )
960
1017
best_estimator = self .estimators_ [
961
1018
np .argmin (self .estimator_errors_ )]
962
1019
return best_estimator .predict (X )
0 commit comments