Commit d9ce084

Merge pull request #159 from lvapeab/master
label_smoothing for text outputs
2 parents 5c0fae5 + 855e388 commit d9ce084

File tree: 2 files changed, +41 -21 lines changed

keras_wrapper/dataset.py

Lines changed: 39 additions & 19 deletions
@@ -223,7 +223,7 @@ def generator(self):
             batch_size = final_sample - init_sample
             it = 0

-            ##### Recovers a batch of data #####
+            # Recovers a batch of data
             # random data selection
             if self.params['random_samples'] > 0:
                 num_retrieve = min(self.params['random_samples'], self.params['batch_size'])
@@ -668,6 +668,7 @@ def __init__(self, name, path, silence=False):
         self.vocabulary_len = dict()  # number of words in the vocabulary
         self.text_offset = dict()  # number of timesteps that the text is shifted (to the right)
         self.fill_text = dict()  # text padding mode
+        self.label_smoothing = dict()  # Epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
         self.pad_on_batch = dict()  # text padding mode: If pad_on_batch, the sample will have the maximum length
         # of the current batch. Else, it will have a fixed length (max_text_len)
         self.words_so_far = dict()  # if True, each sample will be represented as the complete set of words until
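
For orientation, setOutput (further down in this diff) fills the new attribute as a two-level dict, keyed first by output id and then by split name. A hypothetical snapshot of its contents (the id and values are invented for illustration):

    # After declaring a smoothed 'train' output and an unsmoothed 'val'
    # output for the same id:
    dataset.label_smoothing = {'target_text': {'train': 0.1, 'val': 0.0}}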
@@ -1082,8 +1083,7 @@ def setRawOutput(self, path_list, set_name, type='file-name', id='raw-text', ove
         logging.info('Loaded "' + set_name + '" set inputs of type "' + type + '" with id "' + id + '".')

     def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_set=1, overwrite_split=False,
-                  add_additional=False,
-                  sample_weights=False,
+                  add_additional=False, sample_weights=False, label_smoothing=0.,
                   tokenization='tokenize_none', max_text_len=0, offset=0, fill='end', min_occ=0,  # 'text'
                   pad_on_batch=True, words_so_far=False, build_vocabulary=False, max_words=0,  # 'text'
                   bpe_codes=None, separator='@@',  # 'text'
@@ -1106,7 +1106,7 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
                             the data with id that was already declared in the dataset
         :param add_additional: adds additional data to an already existent output ID
         :param sample_weights: switch on/off sample weights usage for the current output
-
+        :param label_smoothing: epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
         # 'text'-related parameters

         :param tokenization: type of tokenization applied (must be declared as a method of this class)
@@ -1155,6 +1155,10 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
                 'The output type "' + type + '" is not implemented. The list of valid types are the following: ' + str(
                     self.__accepted_types_outputs))

+        if self.label_smoothing.get(id) is None:
+            self.label_smoothing[id] = dict()
+        self.label_smoothing[id][set_name] = label_smoothing
+
         # Preprocess the output data depending on its type
         if type == 'categorical':
             self.setClasses(path_list, id)
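
As a usage sketch (the path, split name, and output id below are invented; the parameters come from the signature above), a text output can now request smoothed targets when it is declared:

    # Hypothetical call on a Dataset instance 'ds'; label_smoothing is the
    # new parameter, everything else follows the signature shown above.
    ds.setOutput('data/train_target.txt', 'train',
                 type='text', id='target_text',
                 build_vocabulary=True, max_text_len=30,
                 sample_weights=True, label_smoothing=0.1)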
@@ -2054,7 +2058,7 @@ def loadText(self, X, vocabularies, max_len, offset, fill, pad_on_batch, words_s
         return X_out

     def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill, pad_on_batch, words_so_far,
-                       sample_weights=False, loading_X=False):
+                       sample_weights=False, loading_X=False, label_smoothing=0.):

         """
         Text encoder: Transforms samples from a text representation into a one-hot. It also masks the text.
@@ -2084,8 +2088,9 @@ def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill,
         else:
             y_aux = np.zeros(list(y[0].shape) + [vocabulary_len]).astype(np.uint8)
             for idx in range(y[0].shape[0]):
-                y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(
-                    np.uint8)
+                y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(np.uint8)
+                if label_smoothing > 0.:
+                    y_aux[idx] = ((1-label_smoothing) * y_aux[idx] + (label_smoothing / vocabulary_len)).astype(np.float32)
         if sample_weights:
             y_aux = (y_aux, y[1])  # join data and mask
         return y_aux
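
The smoothing itself is the standard formula from Szegedy et al. (arxiv.org/abs/1512.00567): each one-hot row y is replaced by (1 - epsilon) * y + epsilon / V, where V is the vocabulary size, so every row still sums to 1. A minimal self-contained NumPy sketch of what that added line computes:

    import numpy as np

    epsilon = 0.1                             # label_smoothing
    V = 5                                     # vocabulary_len
    one_hot = np.eye(V, dtype=np.float32)[2]  # one-hot vector for word index 2
    smoothed = (1 - epsilon) * one_hot + epsilon / V

    print(smoothed)        # [0.02 0.02 0.92 0.02 0.02]
    print(smoothed.sum())  # 1.0 -- still a valid probability distribution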
@@ -3154,8 +3159,8 @@ def loadImages(self, images, id, normalization_type='(-1)-1',
                 logging.disable(logging.NOTSET)

             except:
-                print "WARNING!"
-                print "Can't load image " + im
+                logging.warning("WARNING!")
+                logging.warning("Can't load image " + im)
                 im = np.zeros(tuple(self.img_size[id]))

             # Convert to RGB
@@ -3499,18 +3504,23 @@ def getXY(self, set_name, k, normalization_type='(-1)-1',
                                       self.img_size[assoc_id_in], self.img_size_crop[assoc_id_in],
                                       imlist)
             elif type_out == 'text' or type_out == 'dense_text':
-                y = self.loadText(y, self.vocabulary[id_out],
-                                  self.max_text_len[id_out][set_name], self.text_offset[id_out],
+                y = self.loadText(y, self.vocabulary[id_out], self.max_text_len[id_out][set_name], self.text_offset[id_out],
                                   fill=self.fill_text[id_out], pad_on_batch=self.pad_on_batch[id_out],
                                   words_so_far=self.words_so_far[id_out], loading_X=False)
                 # Use whole sentence as class (classifier model)
                 if self.max_text_len[id_out][set_name] == 0:
                     y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
                 # Use words separately (generator model)
                 elif type_out == 'text':
-                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux_type = np.float32
+                    else:
+                        y_aux_type = np.uint8
+                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                     for idx in range(y[0].shape[0]):
-                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(np.uint8)
+                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+                        if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                            y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                     if self.sample_weights[id_out][set_name]:
                         y_aux = (y_aux, y[1])  # join data and mask
                     y = y_aux
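
Two details of this block are worth spelling out. The hasattr guard keeps backward compatibility: Dataset objects serialized before this change carry no label_smoothing attribute, and the guard makes them behave as if epsilon were 0. The y_aux_type switch keeps the memory-friendly np.uint8 targets whenever smoothing is off and only pays for np.float32 when the smoothed values actually need it; a toy check (not from the repository) shows why a uint8 buffer cannot hold smoothed targets:

    import numpy as np

    V, eps = 5, 0.1
    smoothed = (1. - eps) * np.eye(V)[2] + eps / V   # [0.02 0.02 0.92 0.02 0.02]

    as_uint8 = np.zeros(V, dtype=np.uint8)
    as_uint8[:] = smoothed         # truncates every entry to 0
    as_float32 = np.zeros(V, dtype=np.float32)
    as_float32[:] = smoothed       # keeps the smoothed distribution intact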
@@ -3641,10 +3651,15 @@ def getXY_FromIndices(self, set_name, k, normalization_type='(-1)-1',
                     y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
                 # Use words separately (generator model)
                 elif type_out == 'text':
-                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux_type = np.float32
+                    else:
+                        y_aux_type = np.uint8
+                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                     for idx in range(y[0].shape[0]):
-                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                            np.uint8)
+                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+                        if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                            y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                     if self.sample_weights[id_out][set_name]:
                         y_aux = (y_aux, y[1])  # join data and mask
                     y = y_aux
@@ -3804,10 +3819,15 @@ def getY(self, set_name, init, final, normalization_type='0-1', normalization=Fa
                 y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
             # Use words separately (generator model)
             elif type_out == 'text':
-                y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                    y_aux_type = np.float32
+                else:
+                    y_aux_type = np.uint8
+                y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                 for idx in range(y[0].shape[0]):
-                    y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                        np.uint8)
+                    y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                 if self.sample_weights[id_out][set_name]:
                     y_aux = (y_aux, y[1])  # join data and mask

keras_wrapper/utils.py

Lines changed: 2 additions & 2 deletions
@@ -634,7 +634,7 @@ def one_hot_2_indices(preds, pad_sequences=True, verbose=0):
     """
     if verbose > 0:
         logging.info('Converting one hot prediction into indices...')
-    preds = map(lambda x: np.nonzero(x)[1], preds)
+    preds = map(lambda x: np.argmax(x, axis=1), preds)
    if pad_sequences:
         preds = [pred[:sum([int(elem > 0) for elem in pred]) + 1] for pred in preds]
     return preds
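
This substitution (applied again in decode_predictions_one_hot below) is what keeps decoding correct once targets are smoothed: a smoothed row has no zero entries at all, so np.nonzero returns every column index, while np.argmax still recovers the intended word. A toy illustration on a single smoothed row (epsilon = 0.1, V = 4):

    import numpy as np

    row = (1 - 0.1) * np.eye(4)[2] + 0.1 / 4   # smoothed one-hot for index 2
    print(np.nonzero(row)[0])                  # [0 1 2 3] -- every entry is nonzero
    print(np.argmax(row))                      # 2 -- the correct index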
@@ -701,7 +701,7 @@ def decode_predictions_one_hot(preds, index2word, verbose=0):
     """
     if verbose > 0:
         logging.info('Decoding one hot prediction ...')
-    preds = map(lambda prediction: np.nonzero(prediction)[1], preds)
+    preds = map(lambda prediction: np.argmax(prediction, axis=1), preds)
     PAD = '<pad>'
     flattened_answer_pred = [map(lambda index: index2word[index], pred) for pred in preds]
     answer_pred_matrix = np.asarray(flattened_answer_pred)
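
Taken end to end, a hedged sketch of how the new option flows through the wrapper (the paths, ids, and batch size are invented; only the class, method names, and parameters come from the diff above, and the input side of the dataset is assumed to be declared elsewhere):

    from keras_wrapper.dataset import Dataset

    ds = Dataset('toy_dataset', 'datasets/toy')   # hypothetical name and path
    # ... input declaration omitted ...
    ds.setOutput('data/train_target.txt', 'train',
                 type='text', id='target_text',
                 build_vocabulary=True, max_text_len=30,
                 sample_weights=True, label_smoothing=0.1)

    # Batches fetched for 'train' now carry float32 targets of the form
    # (1 - 0.1) * one_hot + 0.1 / vocabulary_len.
    batch = ds.getXY('train', 64)   # k = 64 samples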
