
Commit 2b77018

Don't cache MLPs since we can just pass only last item
1 parent 1fd45ca commit 2b77018
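The reasoning behind the message: apart from attention, every module that was being cached here (the attention output projection, the feed-forward blocks, the final logits projection) is position-wise, so its output at the newest position does not depend on earlier positions. Once the caller passes only the last token, caching those outputs buys nothing. A minimal check of that equivalence (an illustrative sketch, not code from this repository):

import torch
import torch.nn as nn

torch.manual_seed(0)
# stand-in for any position-wise block (e.g. a transformer feed-forward layer)
ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64)).eval()

x = torch.randn(1, 10, 64)            # embeddings for the whole sequence so far
with torch.no_grad():
    full = ff(x)[:, -1:]              # run everything, keep only the last position
    last = ff(x[:, -1:])              # run only the last position
assert torch.allclose(full, last, atol=1e-6)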

File tree

4 files changed: +19 -34 lines changed


dalle_pytorch/attention.py

Lines changed: 7 additions & 15 deletions
@@ -57,22 +57,22 @@ def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropou
         self.causal = causal

         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
-        self.to_out = Cached(nn.Sequential(
+        self.to_out = nn.Sequential(
             nn.Linear(inner_dim, dim),
             nn.Dropout(dropout)
-        ))
+        )

     def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key = None):
         b, n, _, h, device = *x.shape, self.heads, x.device
         softmax = torch.softmax if not self.stable else stable_softmax

         qkv_key = f'{cache_key}_qkv'
         if exists(cache) and qkv_key in cache:
-            qkv = self.to_qkv(x[..., n - 1:n, :]).chunk(3, dim = -1)
+            qkv = self.to_qkv(x).chunk(3, dim = -1)
             q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)

             if exists(rotary_pos_emb):
-                q, k, v = apply_pos_emb(rotary_pos_emb[..., n - 1:n, :], (q, k, v))
+                q, k, v = apply_pos_emb(rotary_pos_emb[..., n - 1:n, :], (q, k, v)) # FIXME: Fix rotary index here

             q *= self.scale

@@ -105,18 +105,10 @@ def forward(self, x, mask = None, rotary_pos_emb = None, cache = None, cache_key

         attn = softmax(dots, dim=-1)

-        out_key = f'{cache_key}_out'
-        if exists(cache) and out_key in cache:
-            top = cache[out_key]
-            bottom = attn @ v
-            out = torch.cat([top, bottom], dim=-2)
-        else:
-            out = attn @ v
-        if exists(cache):
-            cache[out_key] = out
-
+        out = attn @ v
         out = rearrange(out, 'b h n d -> b n (h d)')
-        out = self.to_out(out, cache = cache, cache_key = f'{cache_key}_out_proj')
+        out = self.to_out(out)
+
         return out

 # sparse attention with convolutional pattern, as mentioned in the blog post. customizable kernel size and dilation
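Attention is the one sublayer that still needs the full history, which is why its qkv cache stays while the output-side cache above is removed. A minimal sketch of single-token attention against cached keys and values (illustrative names and shapes; the repository's qkv-cache handling outside these hunks differs in detail):

import torch
from einops import rearrange

def attend_one_token(x_new, to_qkv, cached_k, cached_v, heads, scale):
    # x_new: (b, 1, dim) -- only the newest token is passed through the projections
    qkv = to_qkv(x_new).chunk(3, dim = -1)
    q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = heads), qkv)

    # the history lives in the cache, not in recomputed activations
    k = torch.cat([cached_k, k], dim = -2)
    v = torch.cat([cached_v, v], dim = -2)

    dots = (q * scale) @ k.transpose(-1, -2)   # (b, h, 1, n) -- one query row, no causal mask needed
    attn = dots.softmax(dim = -1)
    out = attn @ v                             # (b, h, 1, d)
    return rearrange(out, 'b h n d -> b n (h d)'), k, v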

dalle_pytorch/cache.py

Lines changed: 1 addition & 11 deletions
@@ -12,17 +12,7 @@ def __init__(self, fn):
         self.fn = fn

     def forward(self, x, *, cache=None, cache_key=None, **kwargs):
-        if exists(cache) and cache_key in cache:
-            prefix = cache[cache_key]
-            assert prefix.shape[1] + 1 == x.shape[1], f'{prefix.shape[1]} {x.shape[1]} {cache_key} {cache.keys()}' # TODO: Change to <= for prod
-            suffix = self.fn(x[:, prefix.shape[1]:, :], **kwargs)
-            out = torch.cat([prefix, suffix], dim=1)
-        else:
-            out = self.fn(x, **kwargs)
-
-        if exists(cache):
-            cache[cache_key] = out
-        return out
+        return self.fn(x, **kwargs)

 class FixCacheKey(nn.Module):
     def __init__(self, cache_key, fn):
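With callers now responsible for slicing the input down to the newest token, the output-prefix bookkeeping that Cached used to perform is redundant for position-wise modules, and its forward collapses to a plain call. For reference, the strategy the deleted branch implemented, shown in isolation with illustrative names:

import torch

def prefix_cached_call(fn, x, cache, cache_key):
    # keep the module's previous output and recompute only the unseen tail
    if cache is not None and cache_key in cache:
        prefix = cache[cache_key]                    # outputs for already-processed positions
        suffix = fn(x[:, prefix.shape[1]:, :])       # recompute only the new positions
        out = torch.cat([prefix, suffix], dim = 1)
    else:
        out = fn(x)
    if cache is not None:
        cache[cache_key] = out
    return out

Passing only the last token achieves the same result for these modules without storing any of their activations, which is the trade this commit makes.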

dalle_pytorch/dalle_pytorch.py

Lines changed: 10 additions & 5 deletions
@@ -401,12 +401,12 @@ def __init__(

         self.to_logits = nn.Sequential(
             nn.LayerNorm(dim),
-            FixCacheKey('to_logits_linear', Cached(nn.Linear(dim, self.total_tokens))),
+            nn.Linear(dim, self.total_tokens),
         )

         if share_input_output_emb:
-            self.text_emb = SharedEmbedding(self.to_logits[1].fn.fn, 0, num_text_tokens)
-            self.image_emb = SharedEmbedding(self.to_logits[1].fn.fn, num_text_tokens, total_tokens)
+            self.text_emb = SharedEmbedding(self.to_logits[1], 0, num_text_tokens)
+            self.image_emb = SharedEmbedding(self.to_logits[1], num_text_tokens, total_tokens)
         else:
             self.text_emb = nn.Embedding(num_text_tokens, dim)
             self.image_emb = nn.Embedding(num_image_tokens, dim)

@@ -587,17 +587,22 @@ def forward(
             alpha = 0.1
             tokens = tokens * alpha + tokens.detach() * (1 - alpha)

+        if cache is not None and 'decoding' in cache:
+            tokens = tokens[:, -1:]
         out = self.transformer(tokens, cache=cache)

         if self.stable:
             out = self.norm_by_max(out)

-        out = self.to_logits[0](out)
-        logits = self.to_logits[1](out, cache=cache)
+        logits = self.to_logits(out)

         # mask logits to make sure text predicts text (except last token), and image predicts image

         logits_mask = self.logits_mask[:, :seq_len]
+        if cache is not None:
+            if 'decoding' in cache:
+                logits_mask = logits_mask[:, -1:]
+            cache['decoding'] = True
         max_neg_value = -torch.finfo(logits.dtype).max
         logits.masked_fill_(logits_mask, max_neg_value)

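The 'decoding' key turns the cache into a two-phase switch: the first forward call runs the full prompt and sets the flag; every later call slices both the token embeddings and the logits mask down to the last position. A standalone sketch of those mechanics, detached from the DALLE API (the stand-in computation and names are illustrative):

import torch

def forward_sketch(tokens, logits_mask, cache = None):
    if cache is not None and 'decoding' in cache:
        tokens = tokens[:, -1:]                  # later calls: only the newest position
    out = tokens * 2.0                           # stand-in for transformer + to_logits

    if cache is not None:
        if 'decoding' in cache:
            logits_mask = logits_mask[:, -1:]    # mask must match the sliced sequence
        cache['decoding'] = True                 # every call after the first is a decode step
    return out.masked_fill(logits_mask, float('-inf'))

cache = {}
tokens = torch.randn(1, 5, 1)
mask = torch.zeros(1, 5, 1, dtype = torch.bool)
full_pass = forward_sketch(tokens, mask, cache)    # first call: full prompt, full mask
decode_step = forward_sketch(tokens, mask, cache)  # second call: collapses to the last position
assert full_pass.shape[1] == 5 and decode_step.shape[1] == 1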

dalle_pytorch/transformer.py

Lines changed: 1 addition & 3 deletions
@@ -201,7 +201,6 @@ def __init__(
                 shared_ff_layers[ff_id] = ff

             attn = FixCacheKey(f'attn_{ind}', attn)
-            ff = FixCacheKey(f'ff_{ind}', Cached(ff))

             if shift_tokens:
                 attn, ff = map(lambda t: PreShiftToken(t, image_size = image_fmap_size, seq_len = seq_len), (attn, ff))

@@ -213,9 +212,8 @@

         execute_type = ReversibleSequence if reversible else SequentialSequence
         route_attn = ((True, False),) * depth
-        route_all = ((True, True),) * depth
         attn_route_map = {'mask': route_attn, 'rotary_pos_emb': route_attn,
-                          'cache': route_all}
+                          'cache': route_attn}

         self.layers = execute_type(layers, args_route = attn_route_map)
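Dropping route_all means the 'cache' keyword is now routed with route_attn, i.e. only the attention sublayer of each (attention, feed-forward) pair receives the cache, since the feed-forward path no longer takes one. A simplified sketch of how that per-layer routing works (illustrative, not dalle_pytorch's exact implementation):

def route_args(args_route, kwargs, depth):
    # args_route maps a kwarg name to ((to_attn, to_ff),) * depth
    routed = [({}, {}) for _ in range(depth)]
    for name, value in kwargs.items():
        for layer_idx, (to_attn, to_ff) in enumerate(args_route[name]):
            if to_attn:
                routed[layer_idx][0][name] = value
            if to_ff:
                routed[layer_idx][1][name] = value
    return routed

# With 'cache' mapped to route_attn = ((True, False),) * depth, the cache dict
# reaches every attention sublayer but none of the feed-forward sublayers.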
