@@ -1,6 +1,11 @@
import torch
import torch.nn as nn

+try:
+    from xformers.ops import memory_efficient_attention
+except ModuleNotFoundError:
+    pass
+

class ExactSelfAttention(nn.Module):
    def __init__(
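A side note on the guarded import added above: because the failure case is a bare `pass`, the name `memory_efficient_attention` is simply undefined when xformers is missing. A common alternative (a sketch only, not part of this diff) records availability in a flag so later code can check it explicitly:

```python
# Sketch of an optional-dependency import with an availability flag.
# `_HAS_XFORMERS` is a hypothetical name, not used by this repository.
try:
    from xformers.ops import memory_efficient_attention
    _HAS_XFORMERS = True
except ModuleNotFoundError:
    memory_efficient_attention = None
    _HAS_XFORMERS = False
```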
@@ -15,26 +20,37 @@ def __init__(
        Three variants:
        - basic self-attention implementation with torch.matmul O(N^2)
        - slice-attention - Computes the attention matrix in slices to save mem.
-        - flash attention from xformers package:
-            Citation..
+        - `xformers.ops.memory_efficient_attention` from the xformers package.

        Parameters
        ----------
        head_dim : int
            Out dim per attention head.
        self_attention : str, default="basic"
-            One of ("basic", "flash", "sliced"). Basic is the normal O(N^2)
-            self attention. "flash" is the flash attention (by xformes library),
-            "slice" is self attention implemented with sliced matmul operation
-            to save memory.
+            One of ("basic", "flash", "slice", "memeff").
+            "basic": the normal O(N^2) self-attention.
+            "flash": flash attention (from the xformers library).
+            "slice": batch-sliced attention operation to save memory.
+            "memeff": `xformers.ops.memory_efficient_attention`.
        num_heads : int, optional
            Number of heads. Used only if `self_attention = "slice"`.
        slice_size : int, optional
            The size of the slice. Used only if `self_attention = "slice"`.
+
+        Raises
+        ------
+        - ValueError:
+            - If an illegal self-attention method is given.
+            - If `self_attention` is set to `slice` but `num_heads` or `slice_size`
+              are not given proper integer values.
+            - If `self_attention` is set to `memeff` but CUDA is not available.
+        - ModuleNotFoundError:
+            - If `self_attention` is set to `memeff` and the `xformers` package
+              is not installed.
        """
        super().__init__()

-        allowed = ("basic", "flash", "slice")
+        allowed = ("basic", "flash", "slice", "memeff")
        if self_attention not in allowed:
            raise ValueError(
                f"Illegal exact self attention type given. Got: {self_attention}. "
@@ -59,6 +75,21 @@ def __init__(
                f"and `num_heads`: {num_heads}."
            )

+        if self_attention == "memeff":
+            try:
+                import xformers  # noqa: F401
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    "`self_attention` was set to `memeff`. The method requires the "
+                    "xformers package. See how to install xformers: "
+                    "https://github.com/facebookresearch/xformers"
+                )
+            if not torch.cuda.is_available():
+                raise ValueError(
+                    "`self_attention` was set to `memeff`. The method is implemented "
+                    "with `xformers.memory_efficient_attention`, which requires CUDA."
+                )
+
    def _attention(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> torch.Tensor:
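As context for the "basic" O(N^2) variant named in the docstring, here is a minimal sketch of plain scaled dot-product attention built from `torch.matmul`. It illustrates the general formula only; the repository's actual `_attention` body is not shown in this diff:

```python
import torch
import torch.nn.functional as F

def naive_self_attention(query, key, value):
    # query/key/value: (B, N, head_dim); the (N, N) score matrix
    # is what makes this O(N^2) in time and memory.
    scale = query.shape[-1] ** -0.5
    scores = torch.matmul(query, key.transpose(-2, -1)) * scale
    return torch.matmul(F.softmax(scores, dim=-1), value)
```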
@@ -161,7 +192,9 @@ def forward(
        torch.Tensor:
            The self-attention matrix. Same shape as inputs.
        """
-        if self.self_attention == "flash":
+        if self.self_attention == "memeff":
+            attn = memory_efficient_attention(query, key, value)
+        elif self.self_attention == "flash":
            raise NotImplementedError
        elif self.self_attention == "slice":
            attn = self._slice_attention(query, key, value)
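Finally, a minimal sketch of calling `xformers.ops.memory_efficient_attention` directly, as the new `memeff` branch does. Shapes, device, and dtype here are assumptions for illustration: xformers accepts batched (B, M, H, K) or (B, M, K) query/key/value tensors, and the fused kernels are generally run on a CUDA device, with half precision the most widely supported dtype:

```python
import torch
from xformers.ops import memory_efficient_attention

B, M, H, K = 2, 1024, 8, 64  # batch, sequence length, heads, head dim (illustrative)
q = torch.randn(B, M, H, K, device="cuda", dtype=torch.float16)
k = torch.randn(B, M, H, K, device="cuda", dtype=torch.float16)
v = torch.randn(B, M, H, K, device="cuda", dtype=torch.float16)

out = memory_efficient_attention(q, k, v)  # same shape as q: (B, M, H, K)
```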