Commit d9ce084

Merge pull request #159 from lvapeab/master
label_smoothing for text outputs
2 parents 5c0fae5 + 855e388 commit d9ce084

File tree: 2 files changed, +41 -21 lines changed

keras_wrapper/dataset.py

Lines changed: 39 additions & 19 deletions
@@ -223,7 +223,7 @@ def generator(self):
             batch_size = final_sample - init_sample
             it = 0

-            ##### Recovers a batch of data #####
+            # Recovers a batch of data
             # random data selection
             if self.params['random_samples'] > 0:
                 num_retrieve = min(self.params['random_samples'], self.params['batch_size'])
@@ -668,6 +668,7 @@ def __init__(self, name, path, silence=False):
         self.vocabulary_len = dict()  # number of words in the vocabulary
         self.text_offset = dict()  # number of timesteps that the text is shifted (to the right)
         self.fill_text = dict()  # text padding mode
+        self.label_smoothing = dict()  # Epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
         self.pad_on_batch = dict()  # text padding mode: If pad_on_batch, the sample will have the maximum length
         # of the current batch. Else, it will have a fixed length (max_text_len)
         self.words_so_far = dict()  # if True, each sample will be represented as the complete set of words until
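
For orientation, setOutput (further down in this diff) fills the new attribute as a two-level dict, keyed first by output id and then by split name. A hypothetical snapshot of its contents (the id and values are invented for illustration):

    # After declaring a smoothed 'train' output and an unsmoothed 'val'
    # output for the same id:
    dataset.label_smoothing = {'target_text': {'train': 0.1, 'val': 0.0}}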
@@ -1082,8 +1083,7 @@ def setRawOutput(self, path_list, set_name, type='file-name', id='raw-text', ove
         logging.info('Loaded "' + set_name + '" set inputs of type "' + type + '" with id "' + id + '".')

     def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_set=1, overwrite_split=False,
-                  add_additional=False,
-                  sample_weights=False,
+                  add_additional=False, sample_weights=False, label_smoothing=0.,
                   tokenization='tokenize_none', max_text_len=0, offset=0, fill='end', min_occ=0,  # 'text'
                   pad_on_batch=True, words_so_far=False, build_vocabulary=False, max_words=0,  # 'text'
                   bpe_codes=None, separator='@@',  # 'text'
@@ -1106,7 +1106,7 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
                             the data with id that was already declared in the dataset
         :param add_additional: adds additional data to an already existent output ID
         :param sample_weights: switch on/off sample weights usage for the current output
-
+        :param label_smoothing: epsilon value for label smoothing. See arxiv.org/abs/1512.00567.
         # 'text'-related parameters

         :param tokenization: type of tokenization applied (must be declared as a method of this class)
@@ -1155,6 +1155,10 @@ def setOutput(self, path_list, set_name, type='categorical', id='label', repeat_
                 'The output type "' + type + '" is not implemented. The list of valid types are the following: ' + str(
                     self.__accepted_types_outputs))

+        if self.label_smoothing.get(id) is None:
+            self.label_smoothing[id] = dict()
+        self.label_smoothing[id][set_name] = label_smoothing
+
         # Preprocess the output data depending on its type
         if type == 'categorical':
             self.setClasses(path_list, id)
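
As a usage sketch (the path, split name, and output id below are invented; the parameters come from the signature above), a text output can now request smoothed targets when it is declared:

    # Hypothetical call on a Dataset instance 'ds'; label_smoothing is the
    # new parameter, everything else follows the signature shown above.
    ds.setOutput('data/train_target.txt', 'train',
                 type='text', id='target_text',
                 build_vocabulary=True, max_text_len=30,
                 sample_weights=True, label_smoothing=0.1)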
@@ -2054,7 +2058,7 @@ def loadText(self, X, vocabularies, max_len, offset, fill, pad_on_batch, words_s
         return X_out

     def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill, pad_on_batch, words_so_far,
-                       sample_weights=False, loading_X=False):
+                       sample_weights=False, loading_X=False, label_smoothing=0.):

         """
         Text encoder: Transforms samples from a text representation into a one-hot. It also masks the text.
@@ -2084,8 +2088,9 @@ def loadTextOneHot(self, X, vocabularies, vocabulary_len, max_len, offset, fill,
         else:
             y_aux = np.zeros(list(y[0].shape) + [vocabulary_len]).astype(np.uint8)
             for idx in range(y[0].shape[0]):
-                y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(
-                    np.uint8)
+                y_aux[idx] = to_categorical(y[0][idx], vocabulary_len).astype(np.uint8)
+                if label_smoothing > 0.:
+                    y_aux[idx] = ((1-label_smoothing) * y_aux[idx] + (label_smoothing / vocabulary_len)).astype(np.float32)
         if sample_weights:
             y_aux = (y_aux, y[1])  # join data and mask
         return y_aux
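
The smoothing itself is the standard formula from Szegedy et al. (arxiv.org/abs/1512.00567): each one-hot row y is replaced by (1 - epsilon) * y + epsilon / V, where V is the vocabulary size, so every row still sums to 1. A minimal self-contained NumPy sketch of what that added line computes:

    import numpy as np

    epsilon = 0.1                             # label_smoothing
    V = 5                                     # vocabulary_len
    one_hot = np.eye(V, dtype=np.float32)[2]  # one-hot vector for word index 2
    smoothed = (1 - epsilon) * one_hot + epsilon / V

    print(smoothed)        # [0.02 0.02 0.92 0.02 0.02]
    print(smoothed.sum())  # 1.0 -- still a valid probability distribution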
@@ -3154,8 +3159,8 @@ def loadImages(self, images, id, normalization_type='(-1)-1',
                 logging.disable(logging.NOTSET)

             except:
-                print "WARNING!"
-                print "Can't load image " + im
+                logging.warning("WARNING!")
+                logging.warning("Can't load image " + im)
                 im = np.zeros(tuple(self.img_size[id]))

             # Convert to RGB
@@ -3499,18 +3504,23 @@ def getXY(self, set_name, k, normalization_type='(-1)-1',
                                       self.img_size[assoc_id_in], self.img_size_crop[assoc_id_in],
                                       imlist)
             elif type_out == 'text' or type_out == 'dense_text':
-                y = self.loadText(y, self.vocabulary[id_out],
-                                  self.max_text_len[id_out][set_name], self.text_offset[id_out],
+                y = self.loadText(y, self.vocabulary[id_out], self.max_text_len[id_out][set_name], self.text_offset[id_out],
                                   fill=self.fill_text[id_out], pad_on_batch=self.pad_on_batch[id_out],
                                   words_so_far=self.words_so_far[id_out], loading_X=False)
                 # Use whole sentence as class (classifier model)
                 if self.max_text_len[id_out][set_name] == 0:
                     y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
                 # Use words separately (generator model)
                 elif type_out == 'text':
-                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux_type = np.float32
+                    else:
+                        y_aux_type = np.uint8
+                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                     for idx in range(y[0].shape[0]):
-                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(np.uint8)
+                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+                        if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                            y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                     if self.sample_weights[id_out][set_name]:
                         y_aux = (y_aux, y[1])  # join data and mask
                     y = y_aux
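
Two details of this block are worth spelling out. The hasattr guard keeps backward compatibility: Dataset objects serialized before this change carry no label_smoothing attribute, and the guard makes them behave as if epsilon were 0. The y_aux_type switch keeps the memory-friendly np.uint8 targets whenever smoothing is off and only pays for np.float32 when the smoothed values actually need it; a toy check (not from the repository) shows why a uint8 buffer cannot hold smoothed targets:

    import numpy as np

    V, eps = 5, 0.1
    smoothed = (1. - eps) * np.eye(V)[2] + eps / V   # [0.02 0.02 0.92 0.02 0.02]

    as_uint8 = np.zeros(V, dtype=np.uint8)
    as_uint8[:] = smoothed         # truncates every entry to 0
    as_float32 = np.zeros(V, dtype=np.float32)
    as_float32[:] = smoothed       # keeps the smoothed distribution intact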
@@ -3641,10 +3651,15 @@ def getXY_FromIndices(self, set_name, k, normalization_type='(-1)-1',
                     y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
                 # Use words separately (generator model)
                 elif type_out == 'text':
-                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux_type = np.float32
+                    else:
+                        y_aux_type = np.uint8
+                    y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                     for idx in range(y[0].shape[0]):
-                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                            np.uint8)
+                        y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(y_aux_type)
+                        if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                            y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                     if self.sample_weights[id_out][set_name]:
                         y_aux = (y_aux, y[1])  # join data and mask
                     y = y_aux
@@ -3804,10 +3819,15 @@ def getY(self, set_name, init, final, normalization_type='0-1', normalization=Fa
                 y = to_categorical(y, self.vocabulary_len[id_out]).astype(np.uint8)
             # Use words separately (generator model)
             elif type_out == 'text':
-                y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(np.uint8)
+                if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                    y_aux_type = np.float32
+                else:
+                    y_aux_type = np.uint8
+                y_aux = np.zeros(list(y[0].shape) + [self.vocabulary_len[id_out]]).astype(y_aux_type)
                 for idx in range(y[0].shape[0]):
-                    y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(
-                        np.uint8)
+                    y_aux[idx] = to_categorical(y[0][idx], self.vocabulary_len[id_out]).astype(np.uint8)
+                    if hasattr(self, 'label_smoothing') and self.label_smoothing[id_out][set_name] > 0.:
+                        y_aux[idx] = ((1.-self.label_smoothing[id_out][set_name]) * y_aux[idx] + (self.label_smoothing[id_out][set_name] / self.vocabulary_len[id_out]))
                 if self.sample_weights[id_out][set_name]:
                     y_aux = (y_aux, y[1])  # join data and mask

keras_wrapper/utils.py

Lines changed: 2 additions & 2 deletions
@@ -634,7 +634,7 @@ def one_hot_2_indices(preds, pad_sequences=True, verbose=0):
     """
     if verbose > 0:
         logging.info('Converting one hot prediction into indices...')
-    preds = map(lambda x: np.nonzero(x)[1], preds)
+    preds = map(lambda x: np.argmax(x, axis=1), preds)
    if pad_sequences:
         preds = [pred[:sum([int(elem > 0) for elem in pred]) + 1] for pred in preds]
     return preds
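
This substitution (applied again in decode_predictions_one_hot below) is what keeps decoding correct once targets are smoothed: a smoothed row has no zero entries at all, so np.nonzero returns every column index, while np.argmax still recovers the intended word. A toy illustration on a single smoothed row (epsilon = 0.1, V = 4):

    import numpy as np

    row = (1 - 0.1) * np.eye(4)[2] + 0.1 / 4   # smoothed one-hot for index 2
    print(np.nonzero(row)[0])                  # [0 1 2 3] -- every entry is nonzero
    print(np.argmax(row))                      # 2 -- the correct index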
@@ -701,7 +701,7 @@ def decode_predictions_one_hot(preds, index2word, verbose=0):
     """
     if verbose > 0:
         logging.info('Decoding one hot prediction ...')
-    preds = map(lambda prediction: np.nonzero(prediction)[1], preds)
+    preds = map(lambda prediction: np.argmax(prediction, axis=1), preds)
     PAD = '<pad>'
     flattened_answer_pred = [map(lambda index: index2word[index], pred) for pred in preds]
     answer_pred_matrix = np.asarray(flattened_answer_pred)
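
Taken end to end, a hedged sketch of how the new option flows through the wrapper (the paths, ids, and batch size are invented; only the class, method names, and parameters come from the diff above, and the input side of the dataset is assumed to be declared elsewhere):

    from keras_wrapper.dataset import Dataset

    ds = Dataset('toy_dataset', 'datasets/toy')   # hypothetical name and path
    # ... input declaration omitted ...
    ds.setOutput('data/train_target.txt', 'train',
                 type='text', id='target_text',
                 build_vocabulary=True, max_text_len=30,
                 sample_weights=True, label_smoothing=0.1)

    # Batches fetched for 'train' now carry float32 targets of the form
    # (1 - 0.1) * one_hot + 0.1 / vocabulary_len.
    batch = ds.getXY('train', 64)   # k = 64 samples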
