
Commit 1cd8e20

Add FFN caching
1 parent c9f462a

2 files changed (+27 lines, -3 lines)

dalle_pytorch/dalle_pytorch.py

Lines changed: 4 additions & 2 deletions
@@ -503,12 +503,13 @@ def generate_images(
         indices = indices[:, :num_img_tokens]
         out = torch.cat((out, indices), dim = -1)
 
+        cache = {}
         for cur_len in range(out.shape[1], total_len):
             is_image = cur_len >= text_seq_len
 
             text, image = out[:, :text_seq_len], out[:, text_seq_len:]
 
-            logits = self(text, image, mask = mask)[:, -1, :]
+            logits = self(text, image, mask = mask, cache = cache)[:, -1, :]
 
             filtered_logits = top_k(logits, thres = filter_thres)
             probs = F.softmax(filtered_logits / temperature, dim = -1)
@@ -536,6 +537,7 @@ def forward(
         text,
         image = None,
         mask = None,
+        cache = None,
         return_loss = False
     ):
         assert text.shape[-1] == self.text_seq_len, f'the length {text.shape[-1]} of the text tokens you passed in does not have the correct length ({self.text_seq_len})'
@@ -584,7 +586,7 @@ def forward(
             alpha = 0.1
             tokens = tokens * alpha + tokens.detach() * (1 - alpha)
 
-        out = self.transformer(tokens)
+        out = self.transformer(tokens, cache=cache)
 
         if self.stable:
             out = self.norm_by_max(out)
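
Why caching the feed-forward output over an unchanged prefix is exact: a position-wise block processes each sequence position independently, so running it on a prefix gives the same values as slicing the full-sequence output. A quick check of that identity, using an arbitrary MLP as a stand-in for the repo's FeedForward class (the module below is illustrative only, not the repo's code):

import torch
import torch.nn as nn

# arbitrary position-wise block standing in for the repo's FeedForward
f = nn.Sequential(nn.Linear(8, 32), nn.GELU(), nn.Linear(32, 8))

x = torch.randn(1, 7, 8)
with torch.no_grad():
    full = f(x)            # run on the whole sequence
    prefix = f(x[:, :4])   # run on the first four positions only

# position-wise, so the prefix outputs match: cached prefix + fresh suffix is exact
assert torch.allclose(full[:, :4], prefix, atol = 1e-6)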

dalle_pytorch/transformer.py

Lines changed: 23 additions & 1 deletion
@@ -86,6 +86,24 @@ def __init__(self, dim, dropout = 0., mult = 4.):
     def forward(self, x):
         return self.net(x)
 
+class Cached(nn.Module):
+    def __init__(self, key, fn):
+        super().__init__()
+        self.key = key
+        self.fn = fn
+
+    def forward(self, x, cache=None, **kwargs):
+        if exists(cache) and self.key in cache:
+            prefix = cache[self.key]
+            suffix = self.fn(x[:, prefix.shape[1]:, :], **kwargs)
+            out = torch.cat([prefix, suffix], dim=1)
+        else:
+            out = self.fn(x, **kwargs)
+
+        if exists(cache):
+            cache[self.key] = out
+        return out
+
 # token shift classes
 
 class PreShiftToken(nn.Module):
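
A minimal usage sketch of the new Cached wrapper, assuming this commit is installed so the class can be imported from dalle_pytorch.transformer (the nn.Linear stand-in and the 'ff_0' key are illustrative). The reuse is only valid when the wrapped module is position-wise and the prefix inputs are identical between calls, which is why the commit wraps the feed-forward block and not attention:

import torch
import torch.nn as nn
from dalle_pytorch.transformer import Cached   # class added in this commit

ff = nn.Linear(8, 8)                  # any position-wise module is safe to wrap
cached_ff = Cached('ff_0', ff)

cache = {}
x = torch.randn(2, 5, 8)
y1 = cached_ff(x, cache = cache)      # full pass; output stored under 'ff_0'

x2 = torch.cat([x, torch.randn(2, 1, 8)], dim = 1)
y2 = cached_ff(x2, cache = cache)     # only the new sixth position runs through ff

assert torch.allclose(y1, y2[:, :5])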
@@ -199,6 +217,8 @@ def __init__(
                 ff = FeedForward(dim, mult = ff_mult, dropout = ff_dropout)
                 shared_ff_layers[ff_id] = ff
 
+            ff = Cached(f'ff_{ind}', ff)
+
             if shift_tokens:
                 attn, ff = map(lambda t: PreShiftToken(t, image_size = image_fmap_size, seq_len = seq_len), (attn, ff))
 
@@ -209,7 +229,9 @@ def __init__(
 
         execute_type = ReversibleSequence if reversible else SequentialSequence
         route_attn = ((True, False),) * depth
-        attn_route_map = {'mask': route_attn, 'rotary_pos_emb': route_attn}
+        route_ffn = ((False, True),) * depth
+        attn_route_map = {'mask': route_attn, 'rotary_pos_emb': route_attn,
+                          'cache': route_ffn}
 
         self.layers = execute_type(layers, args_route = attn_route_map)
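
For orientation, each args_route entry holds one boolean per branch of each layer, so route_ffn = ((False, True),) * depth sends the cache kwarg only to the feed-forward half of every (attention, feed-forward) pair. Below is a simplified, standalone illustration of that routing convention; it is not the repo's actual SequentialSequence implementation, just a sketch of the idea the mapping assumes:

def route_kwargs(args_route, kwargs, depth):
    # one (attn_kwargs, ff_kwargs) pair per layer
    routed = [({}, {}) for _ in range(depth)]
    for name, routes in args_route.items():
        if name not in kwargs:
            continue
        for layer_idx, (to_attn, to_ff) in enumerate(routes):
            if to_attn:
                routed[layer_idx][0][name] = kwargs[name]
            if to_ff:
                routed[layer_idx][1][name] = kwargs[name]
    return routed

depth = 2
route_attn = ((True, False),) * depth
route_ffn = ((False, True),) * depth
args_route = {'mask': route_attn, 'cache': route_ffn}

per_layer = route_kwargs(args_route, {'mask': 'M', 'cache': {}}, depth)
# per_layer[0] == ({'mask': 'M'}, {'cache': {}}):
# the mask reaches only attention, the cache reaches only the feed-forward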
