Commit b76b78e

Use static masks to simulate axial attn
1 parent 059fe1b commit b76b78e

3 files changed: +48 −6 lines

dalle_pytorch/attention.py (14 additions, 3 deletions)

```diff
@@ -46,7 +46,8 @@ def apply_pos_emb(pos_emb, qkv):
 # classes
 
 class Attention(nn.Module):
-    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0., stable = False):
+    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0., stable = False,
+                 static_mask = None):
         super().__init__()
         inner_dim = dim_head * heads
         self.heads = heads
@@ -55,6 +56,7 @@ def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropou
 
         self.stable = stable
         self.causal = causal
+        self.register_buffer('static_mask', static_mask, persistent=False)
 
         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
         self.to_out = nn.Sequential(
@@ -95,6 +97,9 @@ def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key
             mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
             dots.masked_fill_(mask, mask_value)
 
+        if exists(self.static_mask):
+            dots.masked_fill_(~self.static_mask[offset:offset + n, :offset + n], mask_value)
+
         attn = softmax(dots, dim=-1)
 
         out = attn @ v
@@ -126,7 +131,13 @@ def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1,
             nn.Dropout(dropout)
         )
 
-    def forward(self, x, mask = None, rotary_pos_emb = None):
+    def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
+        n0 = x.shape[1]
+        if exists(cache):
+            if cache_key in cache:
+                x = torch.cat([cache[cache_key], x], dim=-2)
+            cache[cache_key] = x
+
         b, n, _, h, img_size, kernel_size, dilation, seq_len, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, self.seq_len, x.device
         softmax = torch.softmax if not self.stable else stable_softmax
 
@@ -221,7 +232,7 @@ def forward(self, x, mask = None, rotary_pos_emb = None):
 
         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
         out = self.to_out(out)
-        return out[:, :n]
+        return out[:, n - n0:n]
 
 # sparse axial causal attention
 
```
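For reference, here is a minimal standalone sketch (not from this commit) of how the new static_mask interacts with the existing causal mask inside Attention.forward: the cache offset slices the static mask down to the current query window, and disallowed logits get the usual large negative fill. Only the slicing pattern mirrors the diff; all sizes, names, and data below are made up for illustration.

```python
import torch

# toy sizes; 3 cached tokens, 5 new queries (illustrative only)
heads, dim_head, seq_len = 2, 4, 8
offset, n = 3, 5
mask_value = -torch.finfo(torch.float32).max

q = torch.randn(1, heads, n, dim_head)            # queries for the new tokens only
k = torch.randn(1, heads, offset + n, dim_head)   # keys for cached + new tokens
v = torch.randn(1, heads, offset + n, dim_head)

dots = torch.einsum('bhid,bhjd->bhij', q, k) * dim_head ** -0.5

# causal mask, as in the existing code path
i, j = dots.shape[-2:]
causal = torch.ones(i, j).triu_(j - i + 1).bool()
dots.masked_fill_(causal, mask_value)

# static mask sliced to the current window, as added by this commit;
# here the first 3 positions play the role of always-visible text tokens
static_mask = torch.zeros(seq_len, seq_len, dtype = torch.bool)
static_mask[:, :3] = True
dots.masked_fill_(~static_mask[offset:offset + n, :offset + n], mask_value)

out = torch.einsum('bhij,bhjd->bhid', dots.softmax(dim = -1), v)
print(out.shape)  # torch.Size([1, 2, 5, 4])
```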

dalle_pytorch/dalle_pytorch.py (2 additions, 0 deletions)

```diff
@@ -344,6 +344,7 @@ def __init__(
         shared_attn_ids = None,
         shared_ff_ids = None,
         share_input_output_emb = False,
+        use_static_masks = False,
     ):
         super().__init__()
         assert isinstance(vae, (DiscreteVAE, OpenAIDiscreteVAE, VQGanVAE)), 'vae must be an instance of DiscreteVAE'
@@ -391,6 +392,7 @@ def __init__(
             rotary_emb = rotary_emb,
             shared_attn_ids = shared_attn_ids,
             shared_ff_ids = shared_ff_ids,
+            use_static_masks = use_static_masks,
        )
 
        self.stable = stable
```
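To exercise the new flag end to end, a hedged usage sketch follows. Only use_static_masks and the 'axial_row'/'axial_col' attention types come from this commit; the remaining DALLE and DiscreteVAE constructor arguments are assumed from the library's README and may differ across versions.

```python
import torch
from dalle_pytorch import DiscreteVAE, DALLE

# constructor arguments other than use_static_masks are assumed, not part of this commit
vae = DiscreteVAE(
    image_size = 64,
    num_layers = 3,
    num_tokens = 1024,
    codebook_dim = 256,
    hidden_dim = 64,
)

dalle = DALLE(
    dim = 256,
    vae = vae,
    num_text_tokens = 10000,
    text_seq_len = 128,
    depth = 4,
    heads = 8,
    dim_head = 32,
    attn_types = ('axial_row', 'axial_col'),  # attention types this commit targets
    use_static_masks = True,                  # new flag added by this commit
)

text = torch.randint(0, 10000, (1, 128))
images = torch.randn(1, 3, 64, 64)

loss = dalle(text, images, return_loss = True)
loss.backward()
```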

dalle_pytorch/transformer.py (32 additions, 3 deletions)

```diff
@@ -1,6 +1,6 @@
 from collections.abc import Iterable
 from functools import partial
-from itertools import islice, cycle
+from itertools import islice, cycle, product
 
 import torch
 from torch import nn, einsum
@@ -161,11 +161,15 @@ def __init__(
         rotary_emb = True,
         shared_attn_ids = None,
         shared_ff_ids = None,
+        use_static_masks = False,
     ):
         super().__init__()
         layers = nn.ModuleList([])
         sparse_layer = cast_tuple(sparse_attn, depth)
 
+        self.seq_len = seq_len
+        self.image_fmap_size = image_fmap_size
+
         attn_types = default(attn_types, ('full',))
         attn_types = cast_tuple(attn_types)
         attn_type_layer = islice(cycle(attn_types), depth)
@@ -182,9 +186,15 @@ def __init__(
             elif attn_type == 'sparse':
                 attn_class = SparseAttention
             elif attn_type == 'axial_row':
-                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 0, image_size = image_fmap_size, stable = stable)
+                if use_static_masks:
+                    attn_class = partial(Attention, stable = stable, static_mask = self._get_static_mask(attn_type))
+                else:
+                    attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 0, image_size = image_fmap_size, stable = stable)
             elif attn_type == 'axial_col':
-                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 1, image_size = image_fmap_size, stable = stable)
+                if use_static_masks:
+                    attn_class = partial(Attention, stable = stable, static_mask = self._get_static_mask(attn_type))
+                else:
+                    attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 1, image_size = image_fmap_size, stable = stable)
             elif attn_type == 'conv_like':
                 attn_class = partial(SparseConvCausalAttention, seq_len = seq_len, image_size = image_fmap_size, stable = stable)
             elif attn_type == 'mlp':
@@ -257,3 +267,22 @@ def forward(self, x, mask = None, rotary_pos_emb = None):
 
     def forward(self, x, **kwargs):
         return self.layers(x, rotary_pos_emb = self.pos_emb, **kwargs)
+
+    def _get_static_mask(self, attn_type):
+        img_seq_len = self.image_fmap_size ** 2
+        text_len = self.seq_len - img_seq_len
+
+        static_mask = torch.zeros(self.seq_len, self.seq_len, dtype=torch.bool)
+        static_mask[:, :text_len] = True
+        if attn_type == 'axial_row':
+            for row in range(self.image_fmap_size):
+                begin = text_len + row * self.image_fmap_size
+                end = text_len + (row + 1) * self.image_fmap_size
+                static_mask[begin:end, begin:end] = True
+        elif attn_type == 'axial_col':
+            for col in range(self.image_fmap_size):
+                begin = text_len + col
+                static_mask[begin::self.image_fmap_size, begin::self.image_fmap_size] = True
+        else:
+            raise ValueError(f'attention type "{attn_type}" can\'t be simulated with a static mask')
+        return static_mask
```
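The masks built by _get_static_mask are easiest to see on a toy example. The sketch below mirrors the logic above for a 4x4 image feature map and a made-up text length, so the block-diagonal (axial_row) and strided (axial_col) patterns can be printed and inspected; it is standalone and not part of the commit.

```python
import torch

# toy dimensions, chosen only for illustration
image_fmap_size = 4
img_seq_len = image_fmap_size ** 2   # 16 image tokens
text_len = 3                         # pretend text portion
seq_len = text_len + img_seq_len

def toy_static_mask(attn_type):
    mask = torch.zeros(seq_len, seq_len, dtype = torch.bool)
    mask[:, :text_len] = True                     # text tokens are visible to every position
    if attn_type == 'axial_row':
        for row in range(image_fmap_size):
            begin = text_len + row * image_fmap_size
            end = begin + image_fmap_size
            mask[begin:end, begin:end] = True     # block-diagonal: each token sees its own row
    elif attn_type == 'axial_col':
        for col in range(image_fmap_size):
            begin = text_len + col
            mask[begin::image_fmap_size, begin::image_fmap_size] = True  # strided: same column
    return mask

print(toy_static_mask('axial_row').int())
print(toy_static_mask('axial_col').int())
```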
