@@ -223,7 +223,7 @@ def generator(self):
batch_size = final_sample - init_sample
it = 0

- ##### Recovers a batch of data #####
+ # Recovers a batch of data
# random data selection
if self.params['random_samples'] > 0:
    num_retrieve = min(self.params['random_samples'], self.params['batch_size'])
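For context, the random-selection branch above just draws a bounded number of sample indices per batch: never more than batch_size, and never more than the remaining random_samples budget. A minimal sketch of that idea (the helper name and the use of numpy's choice are illustrative assumptions; the real generator assembles its batches differently):

    import numpy as np

    def pick_random_indices(num_samples, random_samples, batch_size):
        # retrieve at most one batch worth of randomly chosen sample indices
        num_retrieve = min(random_samples, batch_size)
        return np.random.choice(num_samples, size=num_retrieve, replace=False)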
@@ -668,6 +668,7 @@ def __init__(self, name, path, silence=False):
self.vocabulary_len = dict()  # number of words in the vocabulary
self.text_offset = dict()  # number of timesteps that the text is shifted (to the right)
self.fill_text = dict()  # text padding mode
+ self.label_smoothing = dict()  # Epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
self.pad_on_batch = dict()  # text padding mode: If pad_on_batch, the sample will have the maximum length
# of the current batch. Else, it will have a fixed length (max_text_len)
self.words_so_far = dict()  # if True, each sample will be represented as the complete set of words until
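The new label_smoothing attribute follows the same layout as the other per-output dictionaries in __init__: keyed first by output id, then by split name. A hypothetical snapshot after declaring a smoothed training output and an unsmoothed validation output (the id and epsilon values below are made up for illustration):

    ds.label_smoothing
    # {'target_text': {'train': 0.1, 'val': 0.0}}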
@@ -1082,8 +1083,7 @@ def setRawOutput(self, path_list, set_name, type='file-name', id='raw-text', ove
logging.info('Loaded "' + set_name + '" set inputs of type "' + type + '" with id "' + id + '".')

def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_set=1, overwrite_split=False,
-             add_additional=False,
-             sample_weights=False,
+             add_additional=False, sample_weights=False, label_smoothing=0.,
              tokenization='tokenize_none', max_text_len=0, offset=0, fill='end', min_occ=0,  # 'text'
              pad_on_batch=True, words_so_far=False, build_vocabulary=False, max_words=0,  # 'text'
              bpe_codes=None, separator='@@',  # 'text'
@@ -1106,7 +1106,7 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
    the data with id that was already declared in the dataset
:param add_additional: adds additional data to an already existent output ID
:param sample_weights: switch on/off sample weights usage for the current output
-
+ :param label_smoothing: epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
# 'text'-related parameters

:param tokenization: type of tokenization applied (must be declared as a method of this class)
@@ -1155,6 +1155,10 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
    'The output type "' + type + '" is not implemented. The list of valid types are the following: ' + str(
        self.__accepted_types_outputs))

+ if self.label_smoothing.get(id) is None:
+     self.label_smoothing[id] = dict()
+ self.label_smoothing[id][set_name] = label_smoothing
+
# Preprocess the output data depending on its type
if type == 'categorical':
    self.setClasses(path_list, id)
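Taken together with the new signature above, this lets a caller request smoothing when declaring a text output. A hedged usage sketch, assuming the class is the Dataset whose __init__ is shown earlier (the dataset name, file path, output id and epsilon of 0.1 are illustrative, not taken from the commit):

    ds = Dataset('my_dataset', 'datasets/my_dataset')
    ds.setOutput('data/train_target.txt', 'train',
                 type='text', id='target_text',
                 tokenization='tokenize_none', max_text_len=50,
                 sample_weights=True, label_smoothing=0.1)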
@@ -2054,7 +2058,7 @@ def loadText(self, X, vocabularies, max_len, offset, fill, pad_on_batch, words_s
    return X_out

def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill, pad_on_batch, words_so_far,
-                  sample_weights=False, loading_X=False):
+                  sample_weights=False, loading_X=False, label_smoothing=0.):

    """
    Text encoder: Transforms samples from a text representation into a one-hot. It also masks the text.
@@ -2084,8 +2088,11 @@ def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill,
else:
-     y_aux = np.zeros(list(y[0].shape) + [vocabulary_len]).astype(np.uint8)
+     # smoothed targets are fractional; a uint8 buffer would silently truncate them to zero
+     y_aux_type = np.float32 if label_smoothing > 0. else np.uint8
+     y_aux = np.zeros(list(y[0].shape) + [vocabulary_len]).astype(y_aux_type)
    for idx in range(y[0].shape[0]):
-         y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(
-             np.uint8)
+         y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(y_aux_type)
+         if label_smoothing > 0.:
+             y_aux[idx] = (1. - label_smoothing) * y_aux[idx] + label_smoothing / vocabulary_len
if sample_weights:
    y_aux = (y_aux, y[1])  # join data and mask
return y_aux
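The added lines apply the label-smoothing regularizer of Szegedy et al. (arxiv.org/abs/1512.00567): each one-hot target y becomes (1 - epsilon) * y + epsilon / V, with V the vocabulary size, so a small amount of probability mass is spread uniformly over all classes. A standalone numpy check of the formula (epsilon and V chosen arbitrarily):

    import numpy as np

    eps, V = 0.1, 5
    one_hot = np.array([0., 0., 1., 0., 0.])
    smoothed = (1. - eps) * one_hot + eps / V
    print(smoothed)        # [0.02 0.02 0.92 0.02 0.02]
    print(smoothed.sum())  # 1.0 -- the smoothed target is still a distribution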
@@ -3154,8 +3159,8 @@ def loadImages(self, images, id, normalization_type='(-1)-1',
    logging.disable(logging.NOTSET)

except:
-     print "WARNING!"
-     print "Can't load image " + im
+     logging.warning("WARNING!")
+     logging.warning("Can't load image " + im)
    im = np.zeros(tuple(self.img_size[id]))

# Convert to RGB
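Besides switching to the logging module, this hunk drops Python 2-only print statements, which are syntax errors under Python 3. The bare except: is kept in the actual code; a narrower handler is sketched below purely as an assumption, with PIL standing in as an example loader with a similar failure mode:

    import logging
    import numpy as np
    from PIL import Image

    def load_or_blank(path, img_size):
        # hypothetical helper: fall back to a blank image when a file cannot be read
        try:
            return np.asarray(Image.open(path).convert('RGB'))
        except (IOError, OSError) as exc:
            logging.warning("Can't load image %s (%s)", path, exc)
            return np.zeros(tuple(img_size))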
@@ -3499,18 +3504,23 @@ def getXY(self, set_name, k, normalization_type='(-1)-1',
                              self.img_size[assoc_id_in], self.img_size_crop[assoc_id_in],
                              imlist)
elif type_out == 'text' or type_out == 'dense_text':
-     y = self.loadText(y, self.vocabulary[id_out],
-                       self.max_text_len[id_out][set_name], self.text_offset[id_out],
+     y = self.loadText(y, self.vocabulary[id_out], self.max_text_len[id_out][set_name], self.text_offset[id_out],
                      fill=self.fill_text[id_out], pad_on_batch=self.pad_on_batch[id_out],
                      words_so_far=self.words_so_far[id_out], loading_X=False)
    # Use whole sentence as class (classifier model)
    if self.max_text_len[id_out][set_name] == 0:
        y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
    # Use words separately (generator model)
    elif type_out == 'text':
-         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+         if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+             y_aux_type = np.float32
+         else:
+             y_aux_type = np.uint8
+         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
        for idx in range(y[0].shape[0]):
-             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(np.uint8)
+             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+             if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                 y_aux[idx] = ((1. - self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
        if self.sample_weights[id_out][set_name]:
            y_aux = (y_aux, y[1])  # join data and mask
        y = y_aux
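The dtype switch is the crux of this hunk (and of the two near-identical hunks that follow in getXY_FromIndices and getY): one-hot targets fit in uint8, but smoothed targets are fractional, so the buffer has to be float32 whenever smoothing is active; the hasattr() guard presumably keeps Dataset objects serialized before this attribute existed from crashing. A small numpy demonstration of what the switch prevents:

    import numpy as np

    row = 0.9 * np.eye(5, dtype=np.float32)[2] + 0.02  # a smoothed one-hot row

    buf = np.zeros((1, 5), dtype=np.uint8)
    buf[0] = row
    print(buf[0])  # [0 0 0 0 0] -- fractions silently truncated

    buf = np.zeros((1, 5), dtype=np.float32)
    buf[0] = row
    print(buf[0])  # [0.02 0.02 0.92 0.02 0.02]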
@@ -3641,10 +3651,15 @@ def getXY_FromIndices(self, set_name, k, normalization_type='(-1)-1',
        y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
    # Use words separately (generator model)
    elif type_out == 'text':
-         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+         if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+             y_aux_type = np.float32
+         else:
+             y_aux_type = np.uint8
+         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
        for idx in range(y[0].shape[0]):
-             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                 np.uint8)
+             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+             if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                 y_aux[idx] = ((1. - self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
        if self.sample_weights[id_out][set_name]:
            y_aux = (y_aux, y[1])  # join data and mask
        y = y_aux
@@ -3804,10 +3819,15 @@ def getY(self, set_name, init, final, normalization_type='0-1', normalization=Fa
        y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
    # Use words separately (generator model)
    elif type_out == 'text':
-         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+         if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+             y_aux_type = np.float32
+         else:
+             y_aux_type = np.uint8
+         y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
        for idx in range(y[0].shape[0]):
-             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                 np.uint8)
+             y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+             if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                 y_aux[idx] = ((1. - self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
        if self.sample_weights[id_out][set_name]:
            y_aux = (y_aux, y[1])  # join data and mask
        y = y_aux