
Commit 4d431ac

Cache to_qkv and to_out in sparse attn, add debug prints
1 parent 1cd8e20 commit 4d431ac

File tree: 3 files changed (+65, −33 lines)


dalle_pytorch/attention.py

Lines changed: 22 additions & 12 deletions
@@ -6,6 +6,8 @@
 import torch.nn.functional as F
 from einops import rearrange, repeat

+from dalle_pytorch.cache import Cached
+
 from rotary_embedding_torch import apply_rotary_emb

 # helpers
@@ -102,14 +104,14 @@ def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1,

         self.stable = stable

-        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
+        self.to_qkv = Cached(nn.Linear(dim, inner_dim * 3, bias = False))

-        self.to_out = nn.Sequential(
+        self.to_out = Cached(nn.Sequential(
             nn.Linear(inner_dim, dim),
             nn.Dropout(dropout)
-        )
+        ))

-    def forward(self, x, mask = None, rotary_pos_emb = None):
+    def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
         b, n, _, h, img_size, kernel_size, dilation, seq_len, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, self.seq_len, x.device
         softmax = torch.softmax if not self.stable else stable_softmax

@@ -126,7 +128,7 @@ def forward(self, x, mask = None, rotary_pos_emb = None):

         # derive query / keys / values

-        qkv = self.to_qkv(x).chunk(3, dim = -1)
+        qkv = self.to_qkv(x, cache = cache, cache_key = f'{cache_key}_qkv').chunk(3, dim = -1)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)

         if exists(rotary_pos_emb):
@@ -203,11 +205,13 @@ def forward(self, x, mask = None, rotary_pos_emb = None):
         out = torch.cat((out_text, out_image), dim = 1)

         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
-        out = self.to_out(out)
+        out = self.to_out(out, cache = cache, cache_key = f'{cache_key}_out')
         return out[:, :n]

 # sparse axial causal attention

+from time import time
+
 class SparseAxialCausalAttention(nn.Module):
     def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head = 64, dropout = 0., stable = False, **kwargs):
         super().__init__()
@@ -222,14 +226,14 @@ def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head

         self.stable = stable

-        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
+        self.to_qkv = Cached(nn.Linear(dim, inner_dim * 3, bias = False))

-        self.to_out = nn.Sequential(
+        self.to_out = Cached(nn.Sequential(
             nn.Linear(inner_dim, dim),
             nn.Dropout(dropout)
-        )
+        ))

-    def forward(self, x, mask = None, rotary_pos_emb = None):
+    def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
         b, n, _, h, img_size, axis, seq_len, device = *x.shape, self.heads, self.image_size, self.axis, self.seq_len, x.device
         softmax = torch.softmax if not self.stable else stable_softmax

@@ -246,7 +250,10 @@ def forward(self, x, mask = None, rotary_pos_emb = None):

         # derive queries / keys / values

-        qkv = self.to_qkv(x).chunk(3, dim = -1)
+        t = time()
+        qkv = self.to_qkv(x, cache = cache, cache_key = f'{cache_key}_qkv').chunk(3, dim = -1)
+        print(f'Time 1: {time() - t:.5f} sec')
+        t = time()
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)

         if exists(rotary_pos_emb):
@@ -317,7 +324,10 @@ def forward(self, x, mask = None, rotary_pos_emb = None):
         out = torch.cat((out_text, out_image), dim = 1)

         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
-        out = self.to_out(out)
+        print(f'Time 2: {time() - t:.5f} sec')
+        t = time()
+        out = self.to_out(out, cache = cache, cache_key = f'{cache_key}_out')
+        print(f'Time 3: {time() - t:.5f} sec\n')
         return out[:, :n]

 # microsoft sparse attention CUDA kernel
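The effect of this change is that each attention layer now looks up two cache entries, one per wrapped projection, under keys derived from the cache_key it receives. A minimal sketch of that call pattern with a stand-in projection (the sizes and key names below are illustrative, not part of the library):

import torch
import torch.nn as nn

from dalle_pytorch.cache import Cached

# Stand-in for one of the wrapped projections; the sizes are illustrative.
to_qkv = Cached(nn.Linear(512, 512 * 3, bias = False))

cache = {}
x = torch.randn(1, 10, 512)

# Each attention layer derives two cache keys from the cache_key it receives,
# e.g. 'attn_0_qkv' and 'attn_0_out'.
out = to_qkv(x, cache = cache, cache_key = 'attn_0_qkv')

# Note: in this commit Cached.forward still returns early ('# dbg'), so the
# cache dict is not populated yet; with that line removed, cache['attn_0_qkv']
# would hold the projection of every position seen so far.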

dalle_pytorch/cache.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import torch
+import torch.nn as nn
+
+# helpers
+
+def exists(val):
+    return val is not None
+
+class Cached(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, *, cache=None, cache_key=None, **kwargs):
+        assert exists(cache) and exists(cache_key)
+
+        return self.fn(x, **kwargs) # dbg
+
+        if exists(cache) and cache_key in cache:
+            prefix = cache[cache_key]
+            assert prefix.shape[1] == x.shape[1] or prefix.shape[1] + 1 == x.shape[1], f'{prefix.shape[1]} {x.shape[1]} {cache_key} {cache.keys()}' # TODO: Change to <= for prod
+            suffix = self.fn(x[:, prefix.shape[1]:, :], **kwargs)
+            out = torch.cat([prefix, suffix], dim=1)
+        else:
+            out = self.fn(x, **kwargs)
+
+        if exists(cache):
+            cache[cache_key] = out
+        return out
+
+class FixCacheKey(nn.Module):
+    def __init__(self, cache_key, fn):
+        super().__init__()
+        self.cache_key = cache_key
+        self.fn = fn
+
+    def forward(self, x, *, cache=None, **kwargs):
+        return self.fn(x, cache=cache, cache_key=self.cache_key, **kwargs)
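The intended mechanism: Cached keeps the output computed for the prefix of the sequence and, once the '# dbg' early return above is removed, applies the wrapped module only to the positions past that prefix before concatenating; FixCacheKey pins the key so callers only need to pass a shared dict. A minimal sketch of that usage with a stand-in module (sizes and keys are illustrative):

import torch
import torch.nn as nn

from dalle_pytorch.cache import Cached, FixCacheKey

# Sketch, assuming the '# dbg' early return in Cached.forward has been removed.
ff = FixCacheKey('ff_0', Cached(nn.Linear(512, 512)))

cache = {}                                            # shared across decoding steps
x = torch.randn(1, 10, 512)

out1 = ff(x, cache = cache)                           # full pass; prefix would be stored under 'ff_0'
x = torch.cat((x, torch.randn(1, 1, 512)), dim = 1)   # one new token appended
out2 = ff(x, cache = cache)                           # only the new position would be recomputed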

dalle_pytorch/transformer.py

Lines changed: 5 additions & 21 deletions
@@ -9,6 +9,7 @@

 from dalle_pytorch.reversible import ReversibleSequence, SequentialSequence
 from dalle_pytorch.attention import Attention, SparseAttention, SparseConvCausalAttention, SparseAxialCausalAttention
+from dalle_pytorch.cache import Cached, FixCacheKey

 from rotary_embedding_torch import RotaryEmbedding, broadcat
 from g_mlp_pytorch import gMLPBlock
@@ -86,24 +87,6 @@ def __init__(self, dim, dropout = 0., mult = 4.):
     def forward(self, x):
         return self.net(x)

-class Cached(nn.Module):
-    def __init__(self, key, fn):
-        super().__init__()
-        self.key = key
-        self.fn = fn
-
-    def forward(self, x, cache=None, **kwargs):
-        if exists(cache) and self.key in cache:
-            prefix = cache[self.key]
-            suffix = self.fn(x[:, prefix.shape[1]:, :], **kwargs)
-            out = torch.cat([prefix, suffix], dim=1)
-        else:
-            out = self.fn(x, **kwargs)
-
-        if exists(cache):
-            cache[self.key] = out
-        return out
-
 # token shift classes

 class PreShiftToken(nn.Module):
@@ -217,7 +200,8 @@ def __init__(
                 ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout)
                 shared_ff_layers[ff_id] = ff

-            ff = Cached(f'ff_{ind}', ff)
+            attn = FixCacheKey(f'attn_{ind}', attn)
+            ff = FixCacheKey(f'ff_{ind}', Cached(ff))

             if shift_tokens:
                 attn, ff = map(lambda t: PreShiftToken(t, image_size = image_fmap_size, seq_len = seq_len), (attn, ff))
@@ -229,9 +213,9 @@ def __init__(

         execute_type = ReversibleSequence if reversible else SequentialSequence
         route_attn = ((True, False),) * depth
-        route_ffn = ((False, True),) * depth
+        route_all = ((True, True),) * depth
         attn_route_map = {'mask': route_attn, 'rotary_pos_emb': route_attn,
-                          'cache': route_ffn}
+                          'cache': route_all}

         self.layers = execute_type(layers, args_route = attn_route_map)

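The routing change means the shared cache dict is now delivered to both members of every (attention, feed-forward) pair instead of only the feed-forward block. A rough illustration of how an args_route map of this shape is interpreted; this is a sketch of the idea only, not the actual SequentialSequence code in dalle_pytorch.reversible:

# Illustration only; the real routing lives in dalle_pytorch.reversible.
depth = 2
route_attn = ((True, False),) * depth   # (send to attn, send to ff) for each layer
route_all  = ((True, True),) * depth

args_route = {'mask': route_attn, 'rotary_pos_emb': route_attn, 'cache': route_all}

def kwargs_for(layer_index, position, **kwargs):
    # position 0 = attention block, position 1 = feed-forward block
    return {name: value for name, value in kwargs.items()
            if name in args_route and args_route[name][layer_index][position]}

# With route_all, the feed-forward block now receives 'cache' but still not 'mask'.
print(kwargs_for(0, 1, mask = None, cache = {}))   # -> {'cache': {}}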