
Commit 5619963

feat(modules): Sep att-mat-comp type & att-method
1 parent bb50359 commit 5619963

5 files changed: +101 -24 lines changed

cellseg_models_pytorch/modules/base_modules.py

Lines changed: 43 additions & 1 deletion
@@ -4,9 +4,10 @@
 from .act import ACT_LOOKUP
 from .conv import CONV_LOOKUP
 from .norm import NORM_LOOKUP
+from .self_attention import SELFATT_LOOKUP
 from .upsample import UP_LOOKUP
 
-__all__ = ["Activation", "Norm", "Up", "Conv", "Identity"]
+__all__ = ["Activation", "Norm", "Up", "Conv", "Identity", "MultiHeadSelfAttention"]
 
 
 class Identity(nn.Module):
@@ -176,3 +177,44 @@ def __init__(self, name: str, **kwargs) -> None:
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass for the convolution function."""
         return self.conv(x)
+
+
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(self, name: str, **kwargs) -> None:
+        """Multi-head self-attention wrapper class.
+
+        Parameters
+        ----------
+        name : str
+            Name of the mhsa method.
+
+        Raises
+        ------
+        ValueError: if the mhsa method name is illegal.
+        """
+        super().__init__()
+
+        allowed = list(SELFATT_LOOKUP.keys())
+        if name not in allowed:
+            raise ValueError(
+                "Illegal multi-head attention method given. "
+                f"Allowed: {allowed}. Got: '{name}'"
+            )
+
+        try:
+            self.att = SELFATT_LOOKUP[name](**kwargs)
+        except Exception as e:
+            raise Exception(
+                "Encountered an error when trying to init self-attention function: "
+                f"MultiHeadSelfAttention(name='{name}'): {e.__class__.__name__}: {e}"
+            )
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Forward pass for the self-attention function."""
+        return self.att(query, key, value, **kwargs)
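For orientation, a minimal usage sketch of the new wrapper. The keyword arguments mirror the call site in `SelfAttention` below (head_dim, num_heads, how, slice_size); treat them as assumptions about what `SELFATT_LOOKUP["exact"]` accepts rather than a documented API.

from cellseg_models_pytorch.modules.base_modules import MultiHeadSelfAttention

# Wrap the "exact" attention method; all extra kwargs are forwarded as-is to
# SELFATT_LOOKUP["exact"] (kwargs copied from the SelfAttention call site in this commit).
mhsa = MultiHeadSelfAttention(
    name="exact", how="basic", head_dim=64, num_heads=8, slice_size=4
)

# An unknown method name fails fast and lists the allowed lookup keys.
try:
    MultiHeadSelfAttention(name="not-a-method")
except ValueError as err:
    print(err)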

cellseg_models_pytorch/modules/self_attention_modules.py

Lines changed: 28 additions & 14 deletions
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 
-from .self_attention import ExactSelfAttention
+from .base_modules import MultiHeadSelfAttention
 
 __all__ = ["SelfAttention", "SelfAttentionBlock"]
 
@@ -12,7 +12,8 @@ class SelfAttention(nn.Module):
     def __init__(
         self,
         query_dim: int,
-        self_attention: str = "basic",
+        name: str = "exact",
+        how: str = "basic",
         cross_attention_dim: int = None,
         num_heads: int = 8,
         head_dim: int = 64,
@@ -29,12 +30,16 @@ def __init__(
         ----------
         query_dim : int
             The number of channels in the query. Typically: num_heads*head_dim
-        self_attention : str, default="basic"
-            One of ("basic", "flash", "slice", "memeff").
+        name : str
+            Name of the attention method. One of ("exact", "linformer").
+        how : str, default="basic"
+            How to compute the self-attention matrix.
+            One of ("basic", "flash", "slice", "memeff", "slice-memeff").
             "basic": the normal O(N^2) self attention.
             "flash": the flash attention (by xformers library),
             "slice": batch sliced attention operation to save mem.
-            "memeff" xformers.memory_efficient_attention.
+            "memeff": xformers.memory_efficient_attention.
+            "slice-memeff": Combine slicing and memory_efficient_attention.
         cross_attention_dim : int, optional
             Number of channels in the context tensor. Cross attention combines
             asymmetrically two separate embeddings (context and input embeddings).
@@ -51,27 +56,29 @@ def __init__(
         slice_size : int, default=4
             Slice size for sliced self-attention. This is used only if
             `self_attention = "slice"`.
+        **kwargs:
+            Extra key-word arguments for the MHSA-module.
         """
         super().__init__()
         self.out_channels = query_dim
+        self.num_heads = num_heads
         proj_channels = head_dim * num_heads
 
         # cross attention dim
         if cross_attention_dim is None:
             cross_attention_dim = query_dim
 
-        self.scale = head_dim**-0.5
-        self.num_heads = num_heads
-
         self.to_q = nn.Linear(query_dim, proj_channels, bias=bias)
         self.to_k = nn.Linear(cross_attention_dim, proj_channels, bias=bias)
         self.to_v = nn.Linear(cross_attention_dim, proj_channels, bias=bias)
 
-        self.self_attn = ExactSelfAttention(
+        self.self_attn = MultiHeadSelfAttention(
+            name=name,
             head_dim=head_dim,
-            self_attention=self_attention,
             num_heads=self.num_heads,
+            how=how,
             slice_size=slice_size,
+            **kwargs,
         )
 
         self.to_out = nn.Linear(proj_channels, query_dim)
@@ -187,8 +194,9 @@ def forward(
 class SelfAttentionBlock(nn.Module):
     def __init__(
         self,
-        name: str,
+        how: str,
         query_dim: int,
+        name: str = "exact",
         cross_attention_dim: int = None,
         num_heads: int = 8,
         head_dim: int = 64,
@@ -206,11 +214,15 @@ def __init__(
         Parameters
         ----------
         name : str
-            One of ("basic", "flash", "slice", "memeff").
+            Name of the attention method. One of ("exact", "linformer").
+        how : str, default="basic"
+            How to compute the self-attention matrix.
+            One of ("basic", "flash", "slice", "memeff", "slice-memeff").
             "basic": the normal O(N^2) self attention.
             "flash": the flash attention (by xformers library),
             "slice": batch sliced attention operation to save mem.
-            "memeff" xformers.memory_efficient_attention.
+            "memeff": xformers.memory_efficient_attention.
+            "slice-memeff": Combine slicing and memory_efficient_attention.
         query_dim : int
             The number of channels in the query. Typically: num_heads*head_dim
         cross_attention_dim : int, optional
@@ -234,14 +246,16 @@ def __init__(
 
         self.norm = nn.LayerNorm(query_dim)
         self.att = SelfAttention(
-            self_attention=name,
+            name=name,
+            how=how,
            query_dim=query_dim,
            cross_attention_dim=cross_attention_dim,
            head_dim=head_dim,
            num_heads=num_heads,
            dropout=dropout,
            bias=bias,
            slice_size=slice_size,
+            **kwargs,
        )
 
     def forward(self, x: torch.Tensor, context: torch.Tensor = None) -> torch.Tensor:
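A rough sketch of the new split between attention method (`name`) and computation style (`how`), using the options documented above. The module path follows the file path in this diff, and the input shape is an illustrative assumption about the token layout.

import torch

from cellseg_models_pytorch.modules.self_attention_modules import SelfAttentionBlock

# "exact" picks the attention method, "basic" the way the attention matrix is
# computed (alternatives: "flash", "slice", "memeff", "slice-memeff"; the
# xformers-based styles require the xformers library).
block = SelfAttentionBlock(
    how="basic",
    query_dim=512,   # num_heads * head_dim
    name="exact",
    num_heads=8,
    head_dim=64,
)

x = torch.rand(2, 32 * 32, 512)  # (batch, tokens, query_dim) -- assumed layout
out = block(x)                   # self-attention over the tokens, same shape as x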

cellseg_models_pytorch/modules/transformers.py

Lines changed: 23 additions & 9 deletions
@@ -19,7 +19,8 @@ def __init__(
         head_dim: int = 64,
         cross_attention_dim: int = None,
         n_blocks: int = 2,
-        block_types: Tuple[str, ...] = ("basic", "basic"),
+        block_types: Tuple[str, ...] = ("exact", "exact"),
+        computation_types: Tuple[str, ...] = ("basic", "basic"),
         dropouts: Tuple[float, ...] = (0.0, 0.0),
         biases: Tuple[bool, ...] = (False, False),
         activation: str = "star_relu",
@@ -47,10 +48,14 @@ def __init__(
             set to None, no cross attention is applied.
         n_blocks : int, default=2
             Number of Multihead attention blocks in the transformer.
-        block_types : Tuple[str, ...], default=("basic", "basic")
+        block_types : Tuple[str, ...], default=("exact", "exact")
             The name of the SelfAttentionBlocks in the TransformerLayer.
-            Length of the tuple has to equal `n_blocks`
-            Allowed names: "basic". "slice", "flash", "memeff".
+            Length of the tuple has to equal `n_blocks`.
+            Allowed names: ("exact", "linformer").
+        computation_types : Tuple[str, ...], default=("basic", "basic")
+            The way of computing the attention matrices in the SelfAttentionBlocks
+            in the TransformerLayer. Length of the tuple has to equal `n_blocks`.
+            Allowed styles: "basic", "slice", "flash", "memeff", "slice_memeff".
         dropouts : Tuple[float, ...], default=(False, False)
             Dropout probabilities for the SelfAttention blocks.
         biases : bool, default=(True, True)
@@ -87,11 +92,13 @@ def __init__(
             cross_attention_dim=cross_attention_dim,
             n_blocks=n_blocks,
             block_types=block_types,
+            computation_types=computation_types,
             dropouts=dropouts,
             biases=biases,
             activation=activation,
             slice_size=slice_size,
             mlp_ratio=mlp_ratio,
+            **kwargs,
         )
 
         self.proj_out = nn.Conv2d(
@@ -140,7 +147,8 @@ def __init__(
         cross_attention_dim: int = None,
         activation: str = "star_relu",
         n_blocks: int = 2,
-        block_types: Tuple[str, ...] = ("basic", "basic"),
+        block_types: Tuple[str, ...] = ("exact", "exact"),
+        computation_types: Tuple[str, ...] = ("basic", "basic"),
         dropouts: Tuple[float, ...] = (0.0, 0.0),
         biases: Tuple[bool, ...] = (False, False),
         slice_size: int = 4,
@@ -171,10 +179,14 @@ def __init__(
             One of ("gelu", "geglu", "approximate_gelu", "star_relu").
         n_blocks : int, default=2
             Number of SelfAttentionBlocks used in this layer.
-        block_types : Tuple[str, ...], default=("basic", "basic")
+        block_types : Tuple[str, ...], default=("exact", "exact")
+            The name of the SelfAttentionBlocks in the TransformerLayer.
+            Length of the tuple has to equal `n_blocks`.
+            Allowed names: ("exact", "linformer").
+        computation_types : Tuple[str, ...], default=("basic", "basic")
             The name of the SelfAttentionBlocks in the TransformerLayer.
             Length of the tuple has to equal `n_blocks`
-            Allowed names: "basic". "slice", "flash".
+            Allowed styles: "basic", "slice", "flash", "memeff", "slice_memeff".
         dropouts : Tuple[float, ...], default=(False, False)
             Dropout probabilities for the SelfAttention blocks.
         biases : bool, default=(True, True)
@@ -186,7 +198,7 @@ def __init__(
             Multiplier that defines the out dimension of the final fc projection
             layer.
         **kwargs:
-            Arbitrary key-word arguments (e.g. for activation function.).
+            Arbitrary key-word arguments.
 
         Raises
         ------
@@ -213,13 +225,15 @@ def __init__(
 
             att_block = SelfAttentionBlock(
                 name=block_types[i],
+                how=computation_types[i],
                 query_dim=query_dim,
                 num_heads=num_heads,
                 head_dim=head_dim,
                 cross_attention_dim=cross_dim,
                 dropout=dropouts[i],
                 biases=biases[i],
                 slice_size=slice_size,
+                **kwargs,
             )
             self.tr_blocks[f"transformer_{block_types[i]}_{i + 1}"] = att_block
 
@@ -254,7 +268,7 @@ def forward(self, x: torch.Tensor, context: torch.Tensor = None) -> torch.Tensor
             con = None
             if i == n_blocks:
                 con = context
-                print("context: ", con.shape)
+
             x = tr_block(x, con)
 
         return self.mlp(x) + x
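As a sketch of how the two new tuples pair up per block: block_types[i] selects the attention method of block i and computation_types[i] selects how its attention matrix is computed. The class name TransformerLayer and the query_dim/num_heads/head_dim parameters are inferred from the docstrings and the loop body in this diff, so treat the exact signature as an assumption.

import torch

from cellseg_models_pytorch.modules.transformers import TransformerLayer

# Two attention blocks: both "exact" (could be "linformer"), the first computed
# the basic O(N^2) way, the second with batch-sliced attention to save memory.
layer = TransformerLayer(
    query_dim=512,                    # assumed: num_heads * head_dim
    num_heads=8,                      # assumed parameter name
    head_dim=64,                      # assumed parameter name
    n_blocks=2,
    block_types=("exact", "exact"),
    computation_types=("basic", "slice"),
    dropouts=(0.0, 0.0),
    biases=(False, False),
    slice_size=4,
)

x = torch.rand(2, 32 * 32, 512)  # (batch, tokens, query_dim) -- assumed layout
out = layer(x)                   # context is only passed to the last block

Separating the two tuples is what the commit title refers to: the attention-matrix computation style is now configured independently of the attention method itself.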
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+## Refactor
+
+- Added more verbose error messages for the abstract wrapper-modules in `modules.base_modules`
+- Added more verbose error catching for `xformers.ops.memory_efficient_attention`.
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+## Features
+
+- Add Linformer self-attention mechanism.
