Skip to content

Commit 86a4b42

Browse files
committed
Offer a way to merge multiple `Rank1EditModule`s. Concept ids in the merged module will be in the same order as the merge order.
1 parent 025bf03 commit 86a4b42

File tree

4 files changed

+38
-11
lines changed

4 files changed

+38
-11
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,14 @@ values = wrapped_to_values(
8282

8383
## Todo
8484

85-
- [ ] offer a way to combine separately learned concepts from multiple `Rank1EditModule` into one for inference
8685
- [ ] handle rank-1 update for multiple concepts
8786
- [x] handle training with multiple concepts
8887
- [ ] handle multiple concepts in one prompt at inference - summation of the sigmoid term + outputs
88+
- [ ] accept multiple concept indices
8989
- [ ] offer a magic function that automatically tries to wire up the cross attention by looking for appropriately named `nn.Linear` and auto-inferring which ones are keys or values
9090

91+
- [x] offer a way to combine separately learned concepts from multiple `Rank1EditModule` into one for inference
92+
- [x] offer function for merging `Rank1EditModule`s
9193
- [x] add the zero-shot masking of concept proposed in paper
9294
- [x] take care of the function that takes in the dataset and text encoder and precomputes the covariance matrix needed for the rank-1 update
9395
- [x] instead of having the researcher worry about different learning rates, offer the fractional gradient trick from other paper (to learn the concept embedding)

perfusion_pytorch/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from perfusion_pytorch.perfusion import (
22
Rank1EditModule,
3-
calculate_input_covariance
3+
calculate_input_covariance,
4+
loss_fn_weighted_by_mask,
5+
merge_rank1_edit_modules
46
)

perfusion_pytorch/perfusion.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from math import ceil
2+
from copy import deepcopy
3+
24
from beartype import beartype
3-
from beartype.typing import Union, List, Optional
5+
from beartype.typing import Union, List, Optional, Tuple
46

57
import torch
68
from torch import nn, einsum, Tensor, IntTensor, LongTensor, FloatTensor
@@ -163,7 +165,7 @@ def __init__(
163165

164166
self.is_key_proj = is_key_proj # will lock the output to the super-class, and turn off gradients
165167

166-
self.concept_output = nn.Parameter(torch.zeros(num_concepts, dim_output), requires_grad = not is_key_proj)
168+
self.concept_outputs = nn.Parameter(torch.zeros(num_concepts, dim_output), requires_grad = not is_key_proj)
167169

168170
# C in the paper, inverse precomputed
169171

@@ -173,7 +175,7 @@ def parameters(self):
173175
if not self.is_key_proj:
174176
return []
175177

176-
return [self.concept_output]
178+
return [self.concept_outputs]
177179

178180
@beartype
179181
def forward(
@@ -240,21 +242,21 @@ def forward(
240242
assert exists(superclass_output), 'text_enc_with_superclass must be passed in for the first batch'
241243

242244
# init concept output with superclass output - fixed for keys, learned for values
243-
self.concept_output[concept_id].data.copy_(superclass_output)
245+
self.concept_outputs[concept_id].data.copy_(superclass_output)
244246

245247
elif exists(superclass_output) and self.is_key_proj:
246248
# if text enc with superclass is passed in for more than 1 batch
247249
# just take the opportunity to exponentially average it a bit more for the keys, which have fixed concept output (to superclass)
248250

249-
ema_concept_output = self.concept_output * decay + superclass_output * (1. - decay)
250-
self.concept_output[concept_id].data.copy_(ema_concept_output)
251+
ema_concept_output = self.concept_outputs[concept_id] * decay + superclass_output * (1. - decay)
252+
self.concept_outputs[concept_id].data.copy_(ema_concept_output)
251253

252254
# if any in the batch is not initialized, initialize
253255

254256
if not initted:
255257
ema_concept_text_enc = concept_text_enc
256258
else:
257-
ema_concept_text_enc = self.ema_concept_text_enc[concept_id]
259+
ema_concept_text_enc = self.ema_concept_text_encs[concept_id]
258260

259261
# exponential moving average for concept input encoding
260262

@@ -270,7 +272,7 @@ def forward(
270272

271273
# make it easier to match with paper
272274

273-
i, o, W = self.ema_concept_text_encs[concept_id], self.concept_output[concept_id], weights
275+
i, o, W = self.ema_concept_text_encs[concept_id], self.concept_outputs[concept_id], weights
274276

275277
# main contribution eq (3)
276278

@@ -289,3 +291,24 @@ def forward(
289291
W_em_orthogonal_term = text_enc_output - (sim * concept_output / i_energy)
290292

291293
return W_em_orthogonal_term + sigmoid_term * rearrange(o, 'd -> 1 1 d')
294+
295+
# for merging trained Rank1EditModule(s) above

@beartype
def merge_rank1_edit_modules(
    *modules: Rank1EditModule
) -> Rank1EditModule:
    """
    Merge separately trained Rank1EditModule instances into a single module for inference.

    Concept ids in the merged module follow the order in which the modules are
    passed: the concepts of modules[0] come first, then modules[1], and so on.

    All modules must be initialized, share the same concept output dimension,
    and be uniformly key-projection or uniformly value-projection modules.
    """
    assert all([m.initted.item() for m in modules]), 'all modules must be initialized and ideally trained'
    assert len(set([m.concept_outputs.shape[-1] for m in modules])) == 1, 'concept output dimension must be the same'
    assert len(set([m.is_key_proj for m in modules])) == 1, 'all modules must be either for keys, or values. you cannot merge rank 1 edit modules of keys and values together'

    # use the first module as the template for all shared (non-concept-specific) state
    merged_module = deepcopy(modules[0])

    merged_module.num_concepts = sum([m.num_concepts for m in modules])

    # concatenate learned concept outputs along the concept dimension, in merge order,
    # so concept id i in the merged module maps to the i-th concept across the inputs
    # NOTE(review): only concept_outputs is merged here; per-concept EMA input encodings
    # (ema_concept_text_encs) from modules[1:] are not carried over - confirm intended
    concept_outputs = torch.cat(tuple(m.concept_outputs.data for m in modules), dim = 0)
    merged_module.concept_outputs = nn.Parameter(concept_outputs)

    return merged_module

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
setup(
44
name = 'perfusion-pytorch',
55
packages = find_packages(exclude=[]),
6-
version = '0.0.22',
6+
version = '0.0.23',
77
license='MIT',
88
description = 'Perfusion - Pytorch',
99
author = 'Phil Wang',

0 commit comments

Comments
 (0)