Skip to content

Commit f56f6a1

Browse files
Merge pull request #45 from antoinedemathelin/master
docs: Add TrAdaBoost experiments
2 parents 93a565d + 4240e74 commit f56f6a1

File tree

12 files changed

+1364
-82
lines changed

12 files changed

+1364
-82
lines changed

adapt/base.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -282,8 +282,8 @@ def unsupervised_score(self, Xs, Xt):
282282
score : float
283283
Unsupervised score.
284284
"""
285-
Xs = check_array(np.array(Xs))
286-
Xt = check_array(np.array(Xt))
285+
Xs = check_array(Xs, accept_sparse=True)
286+
Xt = check_array(Xt, accept_sparse=True)
287287

288288
if hasattr(self, "transform"):
289289
args = [
@@ -306,13 +306,11 @@ def unsupervised_score(self, Xs, Xt):
306306

307307
set_random_seed(self.random_state)
308308
bootstrap_index = np.random.choice(
309-
len(Xs), size=len(Xs), replace=True, p=sample_weight)
309+
Xs.shape[0], size=Xs.shape[0], replace=True, p=sample_weight)
310310
Xs = Xs[bootstrap_index]
311311
else:
312312
raise ValueError("The Adapt model should implement"
313313
" a transform or predict_weights methods")
314-
Xs = np.array(Xs)
315-
Xt = np.array(Xt)
316314
return normalized_linear_discrepancy(Xs, Xt)
317315

318316

@@ -534,7 +532,7 @@ def fit_estimator(self, X, y, sample_weight=None,
534532
-------
535533
estimator_ : fitted estimator
536534
"""
537-
X, y = check_arrays(X, y)
535+
X, y = check_arrays(X, y, accept_sparse=True)
538536
set_random_seed(random_state)
539537

540538
if (not warm_start) or (not hasattr(self, "estimator_")):
@@ -613,7 +611,7 @@ def predict_estimator(self, X, **predict_params):
613611
y_pred : array
614612
prediction of estimator.
615613
"""
616-
X = check_array(X, ensure_2d=True, allow_nd=True)
614+
X = check_array(X, ensure_2d=True, allow_nd=True, accept_sparse=True)
617615
predict_params = self._filter_params(self.estimator_.predict,
618616
predict_params)
619617
return self.estimator_.predict(X, **predict_params)
@@ -648,7 +646,7 @@ def predict(self, X, domain=None, **predict_params):
648646
y_pred : array
649647
prediction of the Adapt Model.
650648
"""
651-
X = check_array(X, ensure_2d=True, allow_nd=True)
649+
X = check_array(X, ensure_2d=True, allow_nd=True, accept_sparse=True)
652650
if hasattr(self, "transform"):
653651
if domain is None:
654652
domain = "tgt"
@@ -700,7 +698,7 @@ def score(self, X, y, sample_weight=None, domain=None):
700698
score : float
701699
estimator score.
702700
"""
703-
X, y = check_arrays(X, y)
701+
X, y = check_arrays(X, y, accept_sparse=True)
704702

705703
if domain is None:
706704
domain = "target"

adapt/instance_based/_ldm.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -142,6 +142,8 @@ def fit_weights(self, Xs, Xt, **kwargs):
142142
print("Final Discrepancy : %f"%sol['primal objective'])
143143

144144
self.weights_ = np.array(sol["x"]).ravel()
145+
self.lambda_ = self.weights_[0]
146+
self.weights_ = np.clip(self.weights_[1:], 0., np.inf)
145147
return self.weights_
146148

147149

adapt/instance_based/_tradaboost.py

Lines changed: 110 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -8,27 +8,29 @@
88
from sklearn.exceptions import NotFittedError
99
from sklearn.utils import check_array
1010
from sklearn.metrics import r2_score, accuracy_score
11+
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
12+
from scipy.sparse import vstack, issparse
1113

1214
from adapt.base import BaseAdaptEstimator, make_insert_doc
1315
from adapt.utils import check_arrays, check_estimator, set_random_seed
1416

1517
EPS = np.finfo(float).eps
1618

17-
def _get_median_predict(X, predictions, weights):
19+
def _get_median_predict(predictions, weights):
1820
sorted_idx = np.argsort(predictions, axis=-1)
1921
# Find index of median prediction for each sample
2022
weight_cdf = np.cumsum(weights[sorted_idx], axis=-1)
2123
median_or_above = weight_cdf >= 0.5 * weight_cdf[..., -1][..., np.newaxis]
2224
median_idx = median_or_above.argmax(axis=-1)
2325
new_predictions = None
2426
for i in range(median_idx.shape[1]):
25-
median_estimators = sorted_idx[np.arange(len(X)), i, median_idx[:, i]]
27+
median_estimators = sorted_idx[np.arange(len(predictions)), i, median_idx[:, i]]
2628
if new_predictions is None:
27-
new_predictions = predictions[np.arange(len(X)), i, median_estimators].reshape(-1,1)
29+
new_predictions = predictions[np.arange(len(predictions)), i, median_estimators].reshape(-1,1)
2830
else:
2931
new_predictions = np.concatenate((
3032
new_predictions,
31-
predictions[np.arange(len(X)), i, median_estimators].reshape(-1,1)
33+
predictions[np.arange(len(predictions)), i, median_estimators].reshape(-1,1)
3234
), axis=1)
3335
return new_predictions
3436

@@ -229,12 +231,17 @@ def fit(self, X, y, Xt=None, yt=None,
229231
set_random_seed(self.random_state)
230232
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
231233

232-
Xs, ys = check_arrays(X, y)
234+
Xs, ys = check_arrays(X, y, accept_sparse=True)
233235
Xt, yt = self._get_target_data(Xt, yt)
234-
Xt, yt = check_arrays(Xt, yt)
236+
Xt, yt = check_arrays(Xt, yt, accept_sparse=True)
235237

236-
n_s = len(Xs)
237-
n_t = len(Xt)
238+
if not isinstance(self, TrAdaBoostR2) and isinstance(self.estimator, BaseEstimator):
239+
self.label_encoder_ = LabelEncoder()
240+
ys = self.label_encoder_.fit_transform(ys)
241+
yt = self.label_encoder_.transform(yt)
242+
243+
n_s = Xs.shape[0]
244+
n_t = Xt.shape[0]
238245

239246
if sample_weight_src is None:
240247
sample_weight_src = np.ones(n_s) / (n_s + n_t)
@@ -284,8 +291,11 @@ def fit(self, X, y, Xt=None, yt=None,
284291
def _boost(self, iboost, Xs, ys, Xt, yt,
285292
sample_weight_src, sample_weight_tgt,
286293
**fit_params):
287-
288-
X = np.concatenate((Xs, Xt))
294+
295+
if issparse(Xs):
296+
X = vstack((Xs, Xt))
297+
else:
298+
X = np.concatenate((Xs, Xt))
289299
y = np.concatenate((ys, yt))
290300
sample_weight = np.concatenate((sample_weight_src,
291301
sample_weight_tgt))
@@ -297,39 +307,72 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
297307
warm_start=False,
298308
**fit_params)
299309

300-
ys_pred = estimator.predict(Xs)
301-
yt_pred = estimator.predict(Xt)
310+
if hasattr(estimator, "predict_proba"):
311+
ys_pred = estimator.predict_proba(Xs)
312+
yt_pred = estimator.predict_proba(Xt)
313+
elif hasattr(estimator, "_predict_proba_lr"):
314+
ys_pred = estimator._predict_proba_lr(Xs)
315+
yt_pred = estimator._predict_proba_lr(Xt)
316+
else:
317+
ys_pred = estimator.predict(Xs)
318+
yt_pred = estimator.predict(Xt)
302319

303-
if ys_pred.ndim == 1 or ys.ndim == 1:
320+
if ys.ndim == 1:
304321
ys = ys.reshape(-1, 1)
305322
yt = yt.reshape(-1, 1)
323+
324+
if ys_pred.ndim == 1:
306325
ys_pred = ys_pred.reshape(-1, 1)
307326
yt_pred = yt_pred.reshape(-1, 1)
308327

309-
if isinstance(self, TrAdaBoostR2):
328+
if not isinstance(self, TrAdaBoostR2):
329+
if isinstance(estimator, BaseEstimator):
330+
ohe = OneHotEncoder(sparse=False)
331+
ohe.fit(y.reshape(-1, 1))
332+
ys = ohe.transform(ys)
333+
yt = ohe.transform(yt)
334+
335+
if ys_pred.shape[1] == 1:
336+
ys_pred = ohe.transform(ys_pred)
337+
yt_pred = ohe.transform(yt_pred)
338+
339+
error_vect_src = np.abs(ys_pred - ys).sum(tuple(range(1, ys.ndim))) / 2.
340+
error_vect_tgt = np.abs(yt_pred - yt).sum(tuple(range(1, yt.ndim))) / 2.
341+
342+
else:
343+
assert np.all(ys_pred.shape == ys.shape)
344+
error_vect_src = np.abs(ys_pred - ys).sum(tuple(range(1, ys.ndim)))
345+
error_vect_tgt = np.abs(yt_pred - yt).sum(tuple(range(1, yt.ndim)))
346+
347+
if ys.ndim != 1:
348+
error_vect_src /= 2.
349+
error_vect_tgt /= 2.
350+
351+
else:
310352
error_vect_src = np.abs(ys_pred - ys).mean(tuple(range(1, ys.ndim)))
311353
error_vect_tgt = np.abs(yt_pred - yt).mean(tuple(range(1, yt.ndim)))
312-
error_vect = np.concatenate((error_vect_src, error_vect_tgt))
313354

314-
error_max = error_vect.max() + EPS
315-
if error_max != 0:
316-
error_vect /= error_max
355+
error_max = max(error_vect_src.max(), error_vect_tgt.max())+ EPS
356+
if error_max > 0:
317357
error_vect_src /= error_max
318358
error_vect_tgt /= error_max
319-
else:
320-
if isinstance(estimator, BaseEstimator):
321-
error_vect_src = (ys_pred != ys).astype(float).ravel()
322-
error_vect_tgt = (yt_pred != yt).astype(float).ravel()
323-
error_vect = np.concatenate((error_vect_src, error_vect_tgt))
324-
else:
325-
if ys.shape[1] == 1:
326-
error_vect_src = (np.abs(ys_pred - ys) > 0.5).astype(float).ravel()
327-
error_vect_tgt = (np.abs(yt_pred - yt) > 0.5).astype(float).ravel()
328-
else:
329-
error_vect_src = (ys_pred.argmax(1) != ys.argmax(1)).astype(float).ravel()
330-
error_vect_tgt = (yt_pred.argmax(1) != yt.argmax(1)).astype(float).ravel()
359+
360+
# else:
361+
# if isinstance(estimator, BaseEstimator):
362+
# error_vect_src = (ys_pred != ys).astype(float).ravel()
363+
# error_vect_tgt = (yt_pred != yt).astype(float).ravel()
364+
# error_vect = np.concatenate((error_vect_src, error_vect_tgt))
365+
# else:
366+
# if ys.shape[1] == 1:
367+
# error_vect_src = (np.abs(ys_pred - ys) > 0.5).astype(float).ravel()
368+
# error_vect_tgt = (np.abs(yt_pred - yt) > 0.5).astype(float).ravel()
369+
# else:
370+
# error_vect_src = (ys_pred.argmax(1) != ys.argmax(1)).astype(float).ravel()
371+
# error_vect_tgt = (yt_pred.argmax(1) != yt.argmax(1)).astype(float).ravel()
331372

332373
error_vect = np.concatenate((error_vect_src, error_vect_tgt))
374+
375+
assert sample_weight.ndim == error_vect.ndim
333376

334377
if isinstance(self, _AdaBoostR2):
335378
estimator_error = (sample_weight * error_vect).sum()
@@ -341,10 +384,16 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
341384
# if estimator_error > 0.49:
342385
# estimator_error = 0.49
343386

387+
self.estimators_.append(estimator)
388+
self.estimator_errors_.append(estimator_error)
389+
390+
if estimator_error <= 0.:
391+
return None, None
392+
344393
beta_t = estimator_error / (2. - estimator_error)
345394

346395
beta_s = 1. / (1. + np.sqrt(
347-
2. * np.log(len(Xs)) / self.n_estimators
396+
2. * np.log(Xs.shape[0]) / self.n_estimators
348397
))
349398

350399
if not iboost == self.n_estimators - 1:
@@ -362,9 +411,6 @@ def _boost(self, iboost, Xs, ys, Xt, yt,
362411
# Target updating weights
363412
sample_weight_tgt *= np.power(
364413
beta_t, - self.lr * error_vect_tgt)
365-
366-
self.estimators_.append(estimator)
367-
self.estimator_errors_.append(estimator_error)
368414

369415
return sample_weight_src, sample_weight_tgt
370416

@@ -383,14 +429,21 @@ def predict(self, X):
383429
y_pred : array
384430
Vote results.
385431
"""
386-
X = check_array(X)
432+
X = check_array(X, ensure_2d=True, allow_nd=True, accept_sparse=True)
387433
N = len(self.estimators_)
388434
weights = np.array(self.estimator_weights_)
389435
weights = weights[int(N/2):]
390436
predictions = []
391437
for est in self.estimators_[int(N/2):]:
392438
if isinstance(est, BaseEstimator):
393-
y_pred = est.predict_proba(X)
439+
if hasattr(est, "predict_proba"):
440+
y_pred = est.predict_proba(X)
441+
elif hasattr(est, "_predict_proba_lr"):
442+
y_pred = est._predict_proba_lr(X)
443+
else:
444+
labels = est.predict(X)
445+
y_pred = np.zeros((len(labels), int(max(labels))+1))
446+
y_pred[np.arange(len(labels)), labels] = 1.
394447
else:
395448
y_pred = est.predict(X)
396449
if y_pred.ndim == 1:
@@ -401,7 +454,10 @@ def predict(self, X):
401454
predictions.append(y_pred)
402455
predictions = np.stack(predictions, -1)
403456
weighted_vote = predictions.dot(weights).argmax(1)
404-
return weighted_vote
457+
if hasattr(self, "label_encoder_"):
458+
return self.label_encoder_.inverse_transform(weighted_vote)
459+
else:
460+
return weighted_vote
405461

406462

407463
def predict_weights(self, domain="src"):
@@ -454,7 +510,7 @@ def score(self, X, y):
454510
score : float
455511
estimator score.
456512
"""
457-
X, y = check_arrays(X, y)
513+
X, y = check_arrays(X, y, accept_sparse=True)
458514
yp = self.predict(X)
459515
if isinstance(self, TrAdaBoostR2):
460516
score = r2_score(y, yp)
@@ -587,7 +643,7 @@ def predict(self, X):
587643
y_pred : array
588644
Median results.
589645
"""
590-
X = check_array(X)
646+
X = check_array(X, ensure_2d=True, allow_nd=True, accept_sparse=True)
591647
N = len(self.estimators_)
592648
weights = np.array(self.estimator_weights_)
593649
weights = weights[int(N/2):]
@@ -598,7 +654,7 @@ def predict(self, X):
598654
y_pred = y_pred.reshape(-1, 1)
599655
predictions.append(y_pred)
600656
predictions = np.stack(predictions, -1)
601-
return _get_median_predict(X, predictions, weights)
657+
return _get_median_predict(predictions, weights)
602658

603659

604660
class _AdaBoostR2(TrAdaBoostR2):
@@ -770,12 +826,12 @@ def fit(self, X, y, Xt=None, yt=None,
770826
set_random_seed(self.random_state)
771827
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
772828

773-
Xs, ys = check_arrays(X, y)
829+
Xs, ys = check_arrays(X, y, accept_sparse=True)
774830
Xt, yt = self._get_target_data(Xt, yt)
775-
Xt, yt = check_arrays(Xt, yt)
831+
Xt, yt = check_arrays(Xt, yt, accept_sparse=True)
776832

777-
n_s = len(Xs)
778-
n_t = len(Xt)
833+
n_s = Xs.shape[0]
834+
n_t = Xt.shape[0]
779835

780836
if sample_weight_src is None:
781837
sample_weight_src = np.ones(n_s) / (n_s + n_t)
@@ -786,9 +842,6 @@ def fit(self, X, y, Xt=None, yt=None,
786842
sample_weight_tgt.sum())
787843
sample_weight_src = sample_weight_src / sum_weights
788844
sample_weight_tgt = sample_weight_tgt / sum_weights
789-
790-
X = np.concatenate((Xs, Xt))
791-
y = np.concatenate((ys, yt))
792845

793846
self.sample_weights_src_ = []
794847
self.sample_weights_tgt_ = []
@@ -901,13 +954,13 @@ def func(x):
901954
def _cross_val_score(self, Xs, ys, Xt, yt,
902955
sample_weight_src, sample_weight_tgt,
903956
**fit_params):
904-
if len(Xt) >= self.cv:
957+
if Xt.shape[0] >= self.cv:
905958
cv = self.cv
906959
else:
907-
cv = len(Xt)
960+
cv = Xt.shape[0]
908961

909-
tgt_index = np.arange(len(Xt))
910-
split = int(len(Xt) / cv)
962+
tgt_index = np.arange(Xt.shape[0])
963+
split = int(Xt.shape[0] / cv)
911964
scores = []
912965
for i in range(cv):
913966
if i == cv-1:
@@ -916,7 +969,11 @@ def _cross_val_score(self, Xs, ys, Xt, yt,
916969
test_index = tgt_index[i * split: (i + 1) * split]
917970
train_index = list(set(tgt_index) - set(test_index))
918971

919-
X = np.concatenate((Xs, Xt[train_index]))
972+
973+
if issparse(Xs):
974+
X = vstack((Xs, Xt[train_index]))
975+
else:
976+
X = np.concatenate((Xs, Xt[train_index]))
920977
y = np.concatenate((ys, yt[train_index]))
921978
sample_weight = np.concatenate((sample_weight_src,
922979
sample_weight_tgt[train_index]))
@@ -956,7 +1013,7 @@ def predict(self, X):
9561013
y_pred : array
9571014
Best estimator predictions.
9581015
"""
959-
X = check_array(X)
1016+
X = check_array(X, ensure_2d=True, allow_nd=True, accept_sparse=True)
9601017
best_estimator = self.estimators_[
9611018
np.argmin(self.estimator_errors_)]
9621019
return best_estimator.predict(X)

0 commit comments

Comments
 (0)