"""
CDAN: Conditional Adversarial Domain Adaptation.
"""

import warnings
from copy import deepcopy

import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Layer, Input, subtract, Dense, Flatten
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from adapt.feature_based import BaseDeepFeature
from adapt.utils import (GradientHandler,
                         check_arrays,
                         check_one_array,
                         check_network)


EPS = K.epsilon()


def _get_default_classifier():
    model = Sequential()
    model.add(Flatten())
    model.add(Dense(10, activation="relu"))
    model.add(Dense(10, activation="relu"))
    model.add(Dense(2, activation="softmax"))
    return model


class CDAN(BaseDeepFeature):
    """
    CDAN (Conditional Adversarial Domain Adaptation) is an
    unsupervised domain adaptation method built on the model of the
    :ref:`DANN <adapt.feature_based.DANN>`. In CDAN the discriminator
    is conditioned on the predictions of the task network for
    source and target data. This should, in theory, focus the
    source-target matching on instances belonging to the same class.

    To condition the **discriminator** network on each class, a
    multilinear map of shape ``nb_class * encoder.output_shape[1]``
    is given as input. If this shape is too large (> 4096), a random
    sub-multilinear map of lower dimension is considered.
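
    As a rough sketch (with hypothetical names: ``encoded`` the encoder
    outputs of shape ``(batch, d)`` and ``preds`` the softmax predictions
    of shape ``(batch, C)``), the conditioning input is computed as in
    ``create_model`` below::

        # Multilinear map: outer product of encoder outputs and
        # task predictions, flattened to shape (batch, d * C).
        mapping = tf.matmul(tf.expand_dims(encoded, 2),
                            tf.expand_dims(preds, 1))
        mapping = tf.reshape(mapping, (-1, d * C))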

    The optimization formulation of CDAN is the following:

    .. math::

        \min_{\phi, F} & \; \mathcal{L}_{task}(F(\phi(X_S)), y_S) -
        \lambda \left( \log(1 - D(\phi(X_S) \otimes F(X_S))) +
        \log(D(\phi(X_T) \otimes F(X_T))) \right) \\
        \max_{D} & \; \log(1 - D(\phi(X_S) \otimes F(X_S))) +
        \log(D(\phi(X_T) \otimes F(X_T)))

    Where:

    - :math:`(X_S, y_S), (X_T)` are respectively the labeled source data
      and the unlabeled target data.
    - :math:`\phi, F, D` are respectively the **encoder**, the **task**
      and the **discriminator** networks.
    - :math:`\lambda` is the trade-off parameter.
    - :math:`\phi(X_S) \otimes F(X_S)` is the multilinear map between
      the encoded sources and the task predictions.
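
    In practice, this min-max game is trained in a single backward pass
    with a gradient reversal layer: the discriminator input goes through
    a layer which scales gradients by ``-lambda_``, so that minimizing
    the discriminator loss with respect to the encoder maximizes it.
    A minimal sketch mirroring ``create_model`` below::

        # GradientHandler(-lambda_) is the identity in the forward pass
        # and multiplies gradients by -lambda_ in the backward pass.
        flip = GradientHandler(-lambda_, name="flip")
        disc_output = discriminator(flip(mapping))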

    In CDAN+E, an entropy regularization is added to prioritize the
    transfer of easy-to-transfer examples. The optimization formulation
    of CDAN+E is the following:

    .. math::

        \min_{\phi, F} & \; \mathcal{L}_{task}(F(\phi(X_S)), y_S) -
        \lambda \left( \log(1 - W_S D(\phi(X_S) \otimes F(X_S))) +
        W_T \log(D(\phi(X_T) \otimes F(X_T))) \right) \\
        \max_{D} & \; \log(1 - W_S D(\phi(X_S) \otimes F(X_S))) +
        W_T \log(D(\phi(X_T) \otimes F(X_T)))

    Where:

    - :math:`W_S = 1 + \exp(-\text{entropy}(F(X_S)))`
    - :math:`\text{entropy}(F(X_S)) = - \sum_{i < C} F(X_S)_i \log(F(X_S)_i)`
      with :math:`C` the number of classes.
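
    These weights are computed batch-wise as in ``get_loss`` below
    (a sketch; ``preds`` stands for the task predictions and ``EPS``
    avoids ``log(0)``)::

        entropy = -tf.reduce_sum(preds * tf.math.log(preds + EPS),
                                 axis=1, keepdims=True)
        weight = 1. + tf.exp(-entropy)
        weight /= (tf.reduce_mean(weight) + EPS)  # normalize over the batch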

    .. figure:: ../_static/images/cdan.png
        :align: center

        CDAN architecture (source: [1])

    Notes
    -----
    CDAN is specific to multi-class classification tasks. Be sure to add
    a softmax activation at the end of the task network.
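
    For instance (with ``n_classes`` a placeholder for the number of
    classes), a custom ``task`` network could end with::

        task = Sequential([Dense(10, activation="relu"),
                           Dense(n_classes, activation="softmax")])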

    Parameters
    ----------
    encoder : tensorflow Model (default=None)
        Encoder network. If ``None``, a shallow network with 10
        neurons and ReLU activation is used as encoder network.

    task : tensorflow Model (default=None)
        Task network. If ``None``, a two-layer network with 10
        neurons per layer and ReLU activation is used as task network.
        ``task`` should end with a softmax activation.

    discriminator : tensorflow Model (default=None)
        Discriminator network. If ``None``, a two-layer network with 10
        neurons per layer and ReLU activation is used as discriminator
        network. Note that the output shape of the discriminator should
        be ``(None, 1)`` and the input shape:
        ``(None, encoder.output_shape[1] * nb_class)``.

    lambda_ : float or None (default=1)
        Trade-off parameter. This parameter gives the trade-off
        for the encoder between learning the task and matching
        the source and target distributions. If ``lambda_`` is small,
        the encoder will focus on the task. If ``lambda_=0``, CDAN
        is equivalent to a "source only" method.

    entropy : boolean (default=True)
        Whether to use the entropy regularization or not.
        Adding this regularization will make the
        ``discriminator`` prioritize easy-to-transfer examples.
        This, in theory, should make the transfer "safer".

    max_features : int (default=4096)
        If ``encoder.output_shape[1] * nb_class`` is higher than
        ``max_features``, the multilinear map is produced by
        considering random sub-vectors of the encoder and task outputs.

    loss : string or tensorflow loss (default="mse")
        Loss function used for the task.

    metrics : dict or list of string or tensorflow metrics (default=None)
        Metrics given to the model. If a list is provided,
        metrics are used on both ``task`` and ``discriminator``
        outputs. To give separated metrics, please provide a
        dict of metrics lists with ``"task"`` and ``"disc"`` as keys.

    optimizer : string or tensorflow optimizer (default=None)
        Optimizer of the model. If ``None``, the
        optimizer is set to ``tf.keras.optimizers.Adam(0.001)``.

    copy : boolean (default=True)
        Whether to make a copy of ``encoder``, ``task`` and
        ``discriminator`` or not.

    random_state : int (default=None)
        Seed of the random generator.

    Attributes
    ----------
    encoder_ : tensorflow Model
        encoder network.

    task_ : tensorflow Model
        task network.

    discriminator_ : tensorflow Model
        discriminator network.

    model_ : tensorflow Model
        Fitted model: the union of ``encoder_``,
        ``task_`` and ``discriminator_`` networks.

    history_ : dict
        History of the losses and metrics across the epochs.
        If ``yt`` is given in the ``fit`` method, target metrics
        and losses are recorded too.

    See also
    --------
    DANN
    ADDA
    WDGRL

    References
    ----------
    .. [1] `[1] <https://arxiv.org/pdf/1705.10667.pdf>`_ Long, M., Cao, \
Z., Wang, J., and Jordan, M. I. "Conditional Adversarial Domain Adaptation". \
In NIPS, 2018.
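
    Examples
    --------
    A minimal usage sketch, assuming ``Xs, ys, Xt`` are in-memory NumPy
    arrays with ``ys`` one-hot encoded (``fit`` is inherited from
    ``BaseDeepFeature``):

    >>> import numpy as np
    >>> from adapt.feature_based import CDAN
    >>> np.random.seed(0)
    >>> Xs, Xt = np.random.randn(100, 8), np.random.randn(100, 8) + 1.
    >>> ys = np.eye(2)[(Xs[:, 0] > 0).astype(int)]
    >>> model = CDAN(lambda_=0.1, entropy=True, random_state=0)
    >>> model.fit(Xs, ys, Xt, epochs=10, verbose=0)  # doctest: +SKIP
    >>> yt_pred = model.predict(Xt)  # doctest: +SKIP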
    """
    def __init__(self,
                 encoder=None,
                 task=None,
                 discriminator=None,
                 lambda_=1.,
                 entropy=True,
                 max_features=4096,
                 loss="mse",
                 metrics=None,
                 optimizer=None,
                 copy=True,
                 random_state=None):

        self.lambda_ = lambda_
        self.entropy = entropy
        self.max_features = max_features

        if task is None:
            task = _get_default_classifier()
        super().__init__(encoder, task, discriminator,
                         loss, metrics, optimizer, copy,
                         random_state)


    def create_model(self, inputs_Xs, inputs_Xt):
        # Encode source and target inputs and get task predictions.
        encoded_src = self.encoder_(inputs_Xs)
        encoded_tgt = self.encoder_(inputs_Xt)
        task_src = self.task_(encoded_src)
        task_tgt = self.task_(encoded_tgt)

        # no_grad stops gradients: the discriminator is conditioned on
        # the task predictions without backpropagating through them.
        # flip reverses (and scales by lambda_) the gradients flowing
        # from the discriminator back to the encoder.
        no_grad = GradientHandler(0., name="no_grad")
        flip = GradientHandler(-self.lambda_, name="flip")

        task_src_nograd = no_grad(task_src)
        task_tgt_nograd = no_grad(task_tgt)

        if task_src.shape[1] * encoded_src.shape[1] > self.max_features:
            # Randomized multilinear map: project task predictions and
            # encoded features on random directions before the product.
            self._random_task = tf.random.normal([task_src.shape[1],
                                                  self.max_features])
            self._random_enc = tf.random.normal([encoded_src.shape[1],
                                                 self.max_features])

            mapping_task_src = tf.matmul(task_src_nograd, self._random_task)
            mapping_enc_src = tf.matmul(encoded_src, self._random_enc)
            mapping_src = tf.multiply(mapping_enc_src, mapping_task_src)
            mapping_src /= (tf.math.sqrt(tf.cast(self.max_features, tf.float32)) + EPS)

            mapping_task_tgt = tf.matmul(task_tgt_nograd, self._random_task)
            mapping_enc_tgt = tf.matmul(encoded_tgt, self._random_enc)
            mapping_tgt = tf.multiply(mapping_enc_tgt, mapping_task_tgt)
            mapping_tgt /= (tf.math.sqrt(tf.cast(self.max_features, tf.float32)) + EPS)

        else:
            # Full multilinear map: outer product between encoded
            # features and task predictions, then flattened.
            mapping_src = tf.matmul(
                tf.expand_dims(encoded_src, 2),
                tf.expand_dims(task_src_nograd, 1))
            mapping_tgt = tf.matmul(
                tf.expand_dims(encoded_tgt, 2),
                tf.expand_dims(task_tgt_nograd, 1))

            mapping_src = Flatten("channels_first")(mapping_src)
            mapping_tgt = Flatten("channels_first")(mapping_tgt)

        disc_src = flip(mapping_src)
        disc_src = self.discriminator_(disc_src)
        disc_tgt = flip(mapping_tgt)
        disc_tgt = self.discriminator_(disc_tgt)

        outputs = dict(task_src=task_src,
                       task_tgt=task_tgt,
                       disc_src=disc_src,
                       disc_tgt=disc_tgt,
                       task_src_nograd=task_src_nograd,
                       task_tgt_nograd=task_tgt_nograd)
        return outputs


    def get_loss(self, inputs_ys, inputs_yt,
                 task_src, task_tgt,
                 disc_src, disc_tgt,
                 task_src_nograd,
                 task_tgt_nograd):

        loss_task = self.loss_(inputs_ys, task_src)

        if self.entropy:
            # Entropy-based weights: easy-to-transfer examples
            # (low prediction entropy) receive higher weights.
            entropy_src = -tf.reduce_sum(task_src_nograd *
                                         tf.math.log(task_src_nograd + EPS),
                                         axis=1, keepdims=True)
            entropy_tgt = -tf.reduce_sum(task_tgt_nograd *
                                         tf.math.log(task_tgt_nograd + EPS),
                                         axis=1, keepdims=True)
            weight_src = 1. + tf.exp(-entropy_src)
            weight_tgt = 1. + tf.exp(-entropy_tgt)
            weight_src /= (tf.reduce_mean(weight_src) + EPS)
            weight_tgt /= (tf.reduce_mean(weight_tgt) + EPS)
            weight_src *= .5
            weight_tgt *= .5

            assert str(weight_src.shape) == str(disc_src.shape)
            assert str(weight_tgt.shape) == str(disc_tgt.shape)

            loss_disc = (-tf.math.log(1 - weight_src * disc_src + EPS)
                         - tf.math.log(weight_tgt * disc_tgt + EPS))
        else:
            loss_disc = (-tf.math.log(1 - disc_src + EPS)
                         - tf.math.log(disc_tgt + EPS))

        loss = tf.reduce_mean(loss_task) + tf.reduce_mean(loss_disc)
        return loss


    def get_metrics(self, inputs_ys, inputs_yt,
                    task_src, task_tgt,
                    disc_src, disc_tgt,
                    task_src_nograd,
                    task_tgt_nograd):
        metrics = {}

        task_s = self.loss_(inputs_ys, task_src)

        if self.entropy:
            entropy_src = -tf.reduce_sum(task_src_nograd *
                                         tf.math.log(task_src_nograd + EPS),
                                         axis=1, keepdims=True)
            entropy_tgt = -tf.reduce_sum(task_tgt_nograd *
                                         tf.math.log(task_tgt_nograd + EPS),
                                         axis=1, keepdims=True)
            weight_src = 1. + tf.exp(-entropy_src)
            weight_tgt = 1. + tf.exp(-entropy_tgt)
            weight_src /= (tf.reduce_mean(weight_src) + EPS)
            weight_tgt /= (tf.reduce_mean(weight_tgt) + EPS)
            weight_src *= .5
            weight_tgt *= .5
            disc = (-tf.math.log(1 - weight_src * disc_src + EPS)
                    - tf.math.log(weight_tgt * disc_tgt + EPS))
        else:
            disc = (-tf.math.log(1 - disc_src + EPS)
                    - tf.math.log(disc_tgt + EPS))

        metrics["task_s"] = K.mean(task_s)
        metrics["disc"] = K.mean(disc)
        if inputs_yt is not None:
            task_t = self.loss_(inputs_yt, task_tgt)
            metrics["task_t"] = K.mean(task_t)

        names_task, names_disc = self._get_metric_names()

        for metric, name in zip(self.metrics_task_, names_task):
            metrics[name + "_s"] = metric(inputs_ys, task_src)
            if inputs_yt is not None:
                metrics[name + "_t"] = metric(inputs_yt, task_tgt)

        for metric, name in zip(self.metrics_disc_, names_disc):
            pred = K.concatenate((disc_src, disc_tgt), axis=0)
            true = K.concatenate((K.zeros_like(disc_src),
                                  K.ones_like(disc_tgt)), axis=0)
            metrics[name] = metric(true, pred)
        return metrics


    def _initialize_networks(self, shape_Xt):
        # Call predict to avoid strange behaviour with
        # Sequential model with unspecified input_shape
        zeros_enc_ = self.encoder_.predict(np.zeros((1,) + shape_Xt))
        zeros_task_ = self.task_.predict(zeros_enc_)
        if zeros_task_.shape[1] * zeros_enc_.shape[1] > self.max_features:
            self.discriminator_.predict(np.zeros((1, self.max_features)))
        else:
            zeros_mapping_ = np.matmul(np.expand_dims(zeros_enc_, 2),
                                       np.expand_dims(zeros_task_, 1))
            zeros_mapping_ = np.reshape(zeros_mapping_, (1, -1))
            self.discriminator_.predict(zeros_mapping_)


    def predict_disc(self, X):
        X_enc = self.encoder_.predict(X)
        X_task = self.task_.predict(X_enc)
        if X_enc.shape[1] * X_task.shape[1] > self.max_features:
            # Reproduce the random sub-multilinear map used at fit time.
            X_enc = X_enc.dot(self._random_enc.numpy())
            X_task = X_task.dot(self._random_task.numpy())
            X_disc = X_enc * X_task
            X_disc /= np.sqrt(self.max_features)
        else:
            # Full multilinear map, flattened in "channels_first" order
            # to match the Flatten layer used in create_model.
            X_disc = np.matmul(np.expand_dims(X_enc, 2),
                               np.expand_dims(X_task, 1))
            X_disc = X_disc.transpose([0, 2, 1])
            X_disc = X_disc.reshape(-1, X_enc.shape[1] * X_task.shape[1])
        y_disc = self.discriminator_.predict(X_disc)
        return y_disc