
Commit 43e5dcf

Merge pull request #40 from wellcometrust/feature/nsorros/attention
Add attention
2 parents 1b570b8 + fc4a13f commit 43e5dcf

File tree

3 files changed: +81 -2 lines changed


wellcomeml/ml/attention.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
import tensorflow as tf

class SelfAttention(tf.keras.layers.Layer):
    """https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf"""
    def __init__(self, attention_dim=20):
        super(SelfAttention, self).__init__()
        self.attention_dim = attention_dim

    def build(self, input_shape):
        self.WQ = self.add_weight(shape=(input_shape[-1], self.attention_dim), trainable=True, initializer='uniform')
        self.WK = self.add_weight(shape=(input_shape[-1], self.attention_dim), trainable=True, initializer='uniform')
        self.WV = self.add_weight(shape=(input_shape[-1], input_shape[-1]), trainable=True, initializer='uniform')

    def call(self, X):
        """
        In: (batch_size, sequence_length, embedding_dimension)
        Out: (batch_size, sequence_length, embedding_dimension)
        """
        Q = tf.matmul(X, self.WQ)
        K = tf.matmul(X, self.WK)
        V = tf.matmul(X, self.WV)

        attention_scores = tf.nn.softmax(tf.matmul(Q, tf.transpose(K, perm=[0,2,1])))
        return tf.matmul(attention_scores, V)

class FeedForwardAttention(tf.keras.layers.Layer):
    """https://colinraffel.com/publications/iclr2016feed.pdf"""
    def __init__(self):
        super(FeedForwardAttention, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1],1), trainable=True, initializer='uniform')

    def call(self, X):
        """
        In: (batch_size, sequence_length, embedding_dimension)
        Out: (batch_size, embedding_dimension)
        """
        e = tf.math.tanh(tf.matmul(X, self.W))
        attention_scores = tf.nn.softmax(e)
        return tf.matmul(tf.transpose(X, perm=[0,2,1]), attention_scores)

class HierarchicalAttention(tf.keras.layers.Layer):
    """https://www.aclweb.org/anthology/N16-1174/"""
    def __init__(self):
        super(HierarchicalAttention, self).__init__()

    def build(self, input_shape):
        self.attention_matrix = self.add_weight(shape=(input_shape[-1], input_shape[-2]), trainable=True, initializer='uniform')

    def call(self, X):
        """
        In: (batch_size, sequence_length, embedding_dimension)
        Out: (batch_size, sequence_length, embedding_dimension)
        """
        attention_scores = tf.nn.softmax(tf.math.tanh(tf.matmul(X, self.attention_matrix)))
        return tf.matmul(attention_scores, X)
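
As a quick shape sanity check, here is a minimal sketch (not part of this commit) that runs the new layers on a random batch; the batch size, sequence length, and embedding dimension below are arbitrary:

import tensorflow as tf

from wellcomeml.ml.attention import SelfAttention, HierarchicalAttention

# Arbitrary example dimensions: 4 documents, 30 tokens each, 64-dim embeddings
X = tf.random.uniform((4, 30, 64))

print(SelfAttention(attention_dim=20)(X).shape)  # (4, 30, 64): sequence re-weighted by softmax(QK^T) scores
print(HierarchicalAttention()(X).shape)          # (4, 30, 64): same shape as the input, so it can be added residually

Note that HierarchicalAttention builds a weight of shape (embedding_dimension, sequence_length), so once built it expects fixed-length inputs.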

wellcomeml/ml/bilstm.py

Lines changed: 12 additions & 1 deletion
@@ -3,17 +3,20 @@
 from sklearn.metrics import f1_score
 import tensorflow as tf
 
+from wellcomeml.ml.attention import HierarchicalAttention
 from wellcomeml.ml.keras_utils import Metrics
 
 class BiLSTMClassifier(BaseEstimator, ClassifierMixin):
     def __init__(self, learning_rate=0.01, batch_size=32, nb_epochs=5,
-                 dropout=0.1, nb_layers=2, multilabel=False):
+                 dropout=0.1, nb_layers=2, multilabel=False,
+                 attention=False):
         self.learning_rate = learning_rate
         self.batch_size = batch_size
         self.nb_epochs = nb_epochs
         self.dropout = dropout
         self.nb_layers = nb_layers
         self.multilabel = multilabel
+        self.attention = attention
 
     def fit(self, X, Y, embedding_matrix=None, *_):
         sequence_length = X.shape[1]
@@ -29,6 +32,12 @@ def residual_bilstm(x1, l2):
             x2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(x1.shape[-1]/2), return_sequences=True, kernel_regularizer=l2))(x1)
             return tf.keras.layers.add([x1, x2])
 
+        def residual_attention(x1):
+            x2 = HierarchicalAttention()(x1)
+            x2 = tf.keras.layers.Dropout(self.dropout)(x2)
+            x2 = tf.keras.layers.LayerNormalization()(x2)
+            return tf.keras.layers.add([x1, x2])
+
         l2 = tf.keras.regularizers.l2(1e-6)
         embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix) if embedding_matrix else 'uniform'
         inp = tf.keras.layers.Input(shape=(sequence_length,))
@@ -40,6 +49,8 @@ def residual_bilstm(x1, l2):
         )(inp)
         for _ in range(self.nb_layers):
             x = residual_bilstm(x, l2)
+            if self.attention:
+                x = residual_attention(x)
         x = tf.keras.layers.GlobalMaxPooling1D()(x)
         x = tf.keras.layers.Dense(20, kernel_regularizer=l2)(x)
         out = tf.keras.layers.Dense(nb_outputs, activation=output_activation, kernel_regularizer=l2)(x)

wellcomeml/ml/cnn.py

Lines changed: 12 additions & 1 deletion
@@ -15,12 +15,14 @@
 from sklearn.metrics import f1_score, precision_score, recall_score
 import tensorflow as tf
 
+from wellcomeml.ml.attention import HierarchicalAttention
 from wellcomeml.ml.keras_utils import Metrics
 
 class CNNClassifier(BaseEstimator, ClassifierMixin):
     def __init__(self, context_window = 3, learning_rate=0.001,
                  batch_size=32, nb_epochs=5, dropout=0.2,
-                 nb_layers=4, hidden_size=100, multilabel=False):
+                 nb_layers=4, hidden_size=100, multilabel=False,
+                 attention=False):
         self.context_window = context_window
         self.learning_rate = learning_rate
         self.batch_size = batch_size
@@ -29,6 +31,7 @@ def __init__(self, context_window = 3, learning_rate=0.001,
         self.nb_layers = nb_layers
         self.hidden_size = hidden_size # note that on current implementation CNN use same hidden size as embedding so if embedding matrix is passed, this is not used. in the future we can decouple
         self.multilabel = multilabel
+        self.attention = attention
 
     def fit(self, X, Y, embedding_matrix=None):
         sequence_length = X.shape[1]
@@ -49,6 +52,12 @@ def residual_conv_block(x1):
             x2 = tf.keras.layers.LayerNormalization()(x2)
             return tf.keras.layers.add([x1, x2])
 
+        def residual_attention(x1):
+            x2 = HierarchicalAttention()(x1)
+            x2 = tf.keras.layers.Dropout(self.dropout)(x2)
+            x2 = tf.keras.layers.LayerNormalization()(x2)
+            return tf.keras.layers.add([x1, x2])
+
         embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix) if embedding_matrix else 'uniform'
         inp = tf.keras.layers.Input(shape=(sequence_length,))
         x = tf.keras.layers.Embedding(
@@ -59,6 +68,8 @@ def residual_conv_block(x1):
         x = tf.keras.layers.LayerNormalization()(x)
         for i in range(self.nb_layers):
             x = residual_conv_block(x)
+            if self.attention:
+                x = residual_attention(x)
         x = tf.keras.layers.GlobalMaxPooling1D()(x)
         x = tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-6))(x)
         x = tf.keras.layers.Dropout(self.dropout)(x)
