Commit 59cfc49: Add NonCached wrapper

1 parent adfce34

File tree (2 files changed: +33, -9 lines)

  dalle_pytorch/attention.py
  dalle_pytorch/transformer.py

2 files changed

+33
-9
lines changed

dalle_pytorch/attention.py

Lines changed: 2 additions & 8 deletions
@@ -122,13 +122,7 @@ def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1,
             nn.Dropout(dropout)
         )
 
-    def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
-        n0 = x.shape[1]
-        if exists(cache):
-            if cache_key in cache:
-                x = torch.cat([cache[cache_key], x], dim=-2)
-            cache[cache_key] = x
-
+    def forward(self, x, mask = None, rotary_pos_emb = None):
         b, n, _, h, img_size, kernel_size, dilation, seq_len, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, self.seq_len, x.device
         softmax = torch.softmax if not self.stable else stable_softmax
 
@@ -223,7 +217,7 @@ def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key
 
         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
         out = self.to_out(out)
-        return out[:, n - n0:n]
+        return out[:, :n]
 
 # sparse axial causal attention
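For context, a small sketch (not part of the commit): the removed lines used to prepend a cached prefix to the layer's input and then slice out only the outputs for the newly appended positions. The shapes and the stand-in computation below are illustrative assumptions; after this change the same reconstruct-then-slice step happens outside the layer, in the NonCached wrapper added to transformer.py below.

# illustrative sketch only; shapes and the stand-in computation are assumptions
import torch

cached_prefix = torch.randn(1, 6, 8)    # positions processed on earlier steps (batch, seq, dim)
new_tokens    = torch.randn(1, 2, 8)    # positions arriving on the current step

n0 = new_tokens.shape[1]                              # what the old forward called n0
x  = torch.cat([cached_prefix, new_tokens], dim = -2)
n  = x.shape[1]

out = x * 1.0                                         # stand-in for the attention computation

# the old in-layer slice and a plain suffix slice select the same positions
assert torch.equal(out[:, n - n0:n], out[:, -n0:])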

dalle_pytorch/transformer.py

Lines changed: 31 additions & 1 deletion
@@ -36,7 +36,33 @@ def forward(self, x):
         maxes = x.amax(dim = self.dim, keepdim = True)
         return x / maxes
 
+class NonCached(nn.Module):
+    """
+    A wrapper for layers that don't support the inference cache themselves.
+    Reconstructs the full sequence before the layer and
+    cuts the suffix of the outputs after the layer.
+    """
+
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, *, cache = None, cache_key = None, **kwargs):
+        n = x.shape[-2]
+        if exists(cache):
+            if cache_key in cache:
+                x = torch.cat([cache[cache_key], x], dim=-2)
+            cache[cache_key] = x
+
+        out = self.fn(x, **kwargs)
+
+        return out[:, -n:]
+
 class CachedAs(nn.Module):
+    """
+    A wrapper that defines a key for the inference cache.
+    """
+
     def __init__(self, cache_key, fn):
         super().__init__()
         self.cache_key = cache_key
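To illustrate how the new wrapper behaves, here is a usage sketch under the assumption that this commit is installed; the Doubler module and the 'toy' cache key are made up for illustration and stand in for a sparse attention layer that has no cache support of its own.

# illustrative sketch; Doubler and the 'toy' key are hypothetical
import torch
from torch import nn
from dalle_pytorch.transformer import NonCached

class Doubler(nn.Module):
    # toy layer: wants to see the full sequence, knows nothing about caches
    def forward(self, x):
        return x * 2

layer = NonCached(Doubler())
cache = {}

x1 = torch.randn(1, 4, 8)                         # first 4 positions
y1 = layer(x1, cache = cache, cache_key = 'toy')  # outputs for all 4 positions
assert y1.shape == (1, 4, 8)

x2 = torch.randn(1, 1, 8)                         # one newly generated position
y2 = layer(x2, cache = cache, cache_key = 'toy')  # the full 5-token sequence is rebuilt,
                                                  # only the last output is returned
assert y2.shape == (1, 1, 8)
assert cache['toy'].shape == (1, 5, 8)

When no cache is passed, the wrapper reduces to a plain pass-through over the tokens it is given.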
@@ -251,7 +277,11 @@ def __init__(
             ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout)
             shared_ff_layers[ff_id] = ff
 
-            attn = CachedAs(f'attn_{ind}', attn)
+            if isinstance(attn, Attention):
+                attn = CachedAs(f'attn_{ind}', attn)
+            else:
+                # at the moment, other Attention classes don't support cache
+                attn = NonCached(attn)
 
             if shift_tokens:
                 attn = CachedAs(f'preshift_attn_{ind}', PreShiftToken(attn, image_size = image_fmap_size, seq_len = seq_len))
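The dispatch above can be read as the small helper below; this is a sketch under the assumption that only the full Attention class manages the cache itself (via the key given to CachedAs), while the sparse convolutional and axial variants get the generic NonCached treatment. wrap_attention is a hypothetical name, not part of the library.

# illustrative sketch; wrap_attention is not part of dalle_pytorch
from dalle_pytorch.attention import Attention
from dalle_pytorch.transformer import CachedAs, NonCached

def wrap_attention(attn, ind):
    if isinstance(attn, Attention):
        # full causal attention reads and writes the shared cache itself,
        # under the key assigned here
        return CachedAs(f'attn_{ind}', attn)
    # other attention variants: rebuild the full sequence around the layer instead
    return NonCached(attn)

Since every layer gets a distinct key (attn_{ind}, preshift_attn_{ind}, ...), a single cache dict can be threaded through the whole stack during incremental decoding.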
