@@ -22,7 +22,7 @@ def __init__(
         block_types: Tuple[str, ...] = ("basic", "basic"),
         dropouts: Tuple[float, ...] = (0.0, 0.0),
         biases: Tuple[bool, ...] = (False, False),
-        act: str = "geglu",
+        activation: str = "star_relu",
         num_groups: int = 32,
         slice_size: int = 4,
         mlp_ratio: int = 4,
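The new default, "star_relu", refers to the StarReLU activation from the MetaFormer line of work (s · ReLU(x)² + b with a learnable scale and bias). A minimal sketch for orientation, not necessarily the implementation this repository uses:

import torch
import torch.nn as nn

class StarReLU(nn.Module):
    """s * relu(x)**2 + b with learnable scale and bias (illustrative sketch only)."""

    def __init__(self, scale: float = 1.0, bias: float = 0.0) -> None:
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(scale))
        self.bias = nn.Parameter(torch.tensor(bias))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.scale * torch.relu(x) ** 2 + self.bias

act = StarReLU()
print(act(torch.randn(2, 4)).shape)  # torch.Size([2, 4])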
@@ -55,16 +55,16 @@ def __init__(
             Dropout probabilities for the SelfAttention blocks.
         biases : bool, default=(True, True)
             Include bias terms in the SelfAttention blocks.
-        act : str, default="geglu"
+        activation : str, default="star_relu"
             The activation function applied at the end of the transformer layer fc.
-            One of ("geglu", "approximate_gelu").
+            One of ("geglu", "approximate_gelu", "star_relu").
         num_groups : int, default=32
             Number of groups in the first group-norm op before the input is
             projected to be suitable for self-attention.
         slice_size : int, default=4
             Slice size for sliced self-attention. This is used only if
             `name = "slice"` for a SelfAttentionBlock.
-        fc_projection_mult : int, default=4
+        mlp_ratio : int, default=4
             Multiplier that defines the out dimension of the final fc projection
             layer.
         """
@@ -89,7 +89,7 @@ def __init__(
             block_types=block_types,
             dropouts=dropouts,
             biases=biases,
-            act=act,
+            activation=activation,
             slice_size=slice_size,
             mlp_ratio=mlp_ratio,
         )
@@ -166,9 +166,9 @@ def __init__(
             asymmetrically two separate embeddings (context and input embeddings).
             E.g. passage from transformer encoder to transformer decoder. If this is
             set to None, no cross attention is applied.
-        act : str, default="geglu"
+        activation : str, default="star_relu"
             The activation function applied at the end of the transformer layer fc.
-            One of ("geglu", "approximate_gelu").
+            One of ("gelu", "geglu", "approximate_gelu", "star_relu").
         n_blocks : int, default=2
             Number of SelfAttentionBlocks used in this layer.
         block_types : Tuple[str, ...], default=("basic", "basic")
@@ -182,9 +182,11 @@ def __init__(
         slice_size : int, default=4
             Slice size for sliced self-attention. This is used only if
             `name = "slice"` for a SelfAttentionBlock.
-        fc_projection_mult : int, default=4
+        mlp_proj : int, default=4
             Multiplier that defines the out dimension of the final fc projection
             layer.
+        **kwargs:
+            Arbitrary keyword arguments (e.g. for the activation function).

         Raises
         ------
@@ -227,6 +229,7 @@ def __init__(
             activation=activation,
             normalization="ln",
             norm_kwargs={"normalized_shape": query_dim},
+            activation_kwargs=kwargs,
         )

     def forward(self, x: torch.Tensor, context: torch.Tensor = None) -> torch.Tensor:
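The added activation_kwargs=kwargs line forwards any extra keyword arguments given to this layer on to the activation function. A minimal sketch of that pass-through pattern, assuming a hypothetical build_activation factory (not this repository's actual helper):

import torch.nn as nn

def build_activation(name: str, **activation_kwargs) -> nn.Module:
    """Hypothetical factory for illustration; not this repository's helper."""
    if name == "gelu":
        # e.g. approximate="tanh" selects the tanh-approximated GELU (PyTorch >= 1.12)
        return nn.GELU(**activation_kwargs)
    raise ValueError(f"Unsupported activation in this sketch: {name}")

# Extra keyword arguments flow through untouched, mirroring activation_kwargs=kwargs.
act = build_activation("gelu", approximate="tanh")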
@@ -251,7 +254,8 @@ def forward(self, x: torch.Tensor, context: torch.Tensor = None) -> torch.Tensor
             con = None
             if i == n_blocks:
                 con = context
-
+            if con is not None:
+                print("context: ", con.shape)
             x = tr_block(x, con)

         return self.mlp(x) + x
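To make the control flow of this hunk explicit: the external context embedding is handed only to the block where i == n_blocks, so earlier blocks run pure self-attention, and the output is a residual MLP projection. A toy sketch of that gating (assuming the loop counts blocks from 1, which is what i == n_blocks implies; the real block internals are omitted):

n_blocks = 2
tr_blocks = [f"block_{k}" for k in range(1, n_blocks + 1)]  # stand-ins for the real modules

for i, tr_block in enumerate(tr_blocks, start=1):  # 1-based so the last block satisfies i == n_blocks
    con = "context" if i == n_blocks else None
    print(f"{tr_block}: cross-attention context = {con}")
# block_1: cross-attention context = None
# block_2: cross-attention context = context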