Commit 5d712ff

feat: add layer scale + style, docs, t-hint fixes
1 parent 3a28df0 commit 5d712ff

File tree

5 files changed: +245 / -148 lines changed

cellseg_models_pytorch/decoders/decoder.py

Lines changed: 119 additions & 53 deletions
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
@@ -13,50 +13,77 @@ def __init__(
         self,
         enc_channels: Tuple[int, ...],
         out_channels: Tuple[int, ...] = (256, 128, 64, 32, 16),
-        style_channels: int = None,
-        n_conv_layers: Tuple[int, ...] = (1, 1, 1, 1, 1),
-        n_conv_blocks: Tuple[Tuple[int, ...], ...] = ((2,), (2,), (2,), (2,), (2,)),
-        long_skip: str = "unet",
-        n_transformers: Tuple[int, ...] = None,
-        n_transformer_blocks: Tuple[Tuple[int], ...] = ((1,), (1,), (1,), (1,), (1,)),
+        long_skip: Union[None, str, Tuple[str, ...]] = "unet",
+        n_conv_layers: Union[None, int, Tuple[int, ...]] = 1,
+        n_transformers: Union[None, int, Tuple[int, ...]] = None,
+        n_conv_blocks: Union[int, Tuple[Tuple[int, ...], ...]] = 2,
+        n_transformer_blocks: Union[int, Tuple[Tuple[int], ...]] = 1,
         stage_params: Optional[Tuple[Dict, ...]] = None,
+        style_channels: int = None,
         **kwargs,
     ) -> None:
         """Build a generic U-net-like decoder.

+        I.e. stack decoder stages that are composed as follows:
+
+        DecoderStage:
+            - UpSample(up_method)
+            - LongSkip(long_skip_method)
+            - ConvLayer (optional)
+                - ConvBlock(conv_block_method)
+            - TransformerLayer (optional)
+                - TransformerBlock(transformer_block_method)
+
         Parameters
         ----------
         enc_channels : Tuple[int, ...]
             Number of channels at each encoder layer.
         out_channels : Tuple[int, ...], default=(256, 128, 64, 32, 16)
             Number of channels at each decoder layer output.
-        style_channels : int, default=None
-            Number of style vector channels. If None, style vectors are ignored.
-        n_conv_layers : Tuple[int, ...], default=(1, 1, 1, 1, 1)
-            The number of conv layers inside each of the decoder stages.
-        n_conv_blocks : Tuple[Tuple[int, ...], ...] =((2, ),(2, ),(2, ),(2, ),(2, ))
-            The number of blocks inside each conv-layer at each decoder stage.
-        long_skip : str, default="unet"
-            long skip method to be used. One of: "unet", "unetpp", "unet3p",
-            "unet3p-lite", None
-        n_transformers : Tuple[int, ...], optional, default=None
-            The number of transformer layers inside each of the decoder stages.
-        n_transformer_blocks : Tuple[Tuple[int]] = ((1, ),(1, ),(1, ),(1, ),(1, ))
+        long_skip : Union[None, str, Tuple[str, ...]], default="unet"
+            Long skip method to be used. The argument can be given as a tuple, where
+            each value indicates the long-skip method for each stage of the decoder,
+            allowing the mixing of long-skip methods in the decoder.
+            Allowed: "cross-attn", "unet", "unetpp", "unet3p", "unet3p-lite", None
+        n_conv_layers : Union[None, int, Tuple[int, ...]], default=1
+            The number of convolution layers inside each of the decoder stages. The
+            argument can be given as a tuple, where each value indicates the number
+            of conv-layers inside each stage of the decoder, allowing the mixing of
+            different sized layers inside the stages of the decoder. If set to None,
+            no conv-layers will be included in the decoder.
+        n_transformers : Union[None, int, Tuple[int, ...]], optional
+            The number of transformer layers inside each of the decoder stages. The
+            argument can be given as a tuple, where each value indicates the number
+            of transformer-layers inside each stage of the decoder, allowing the
+            mixing of different sized layers inside the stages of the decoder. If
+            set to None, no transformer layers will be included in the decoder.
+        n_conv_blocks : Union[int, Tuple[Tuple[int, ...], ...]], default=2
+            The number of blocks inside each conv-layer at each decoder stage. The
+            argument can be given as a nested tuple, where each value indicates the
+            number of `ConvBlock`s inside a single `ConvLayer`, allowing different
+            sized blocks inside each conv-layer in the decoder.
+        n_transformer_blocks : Union[int, Tuple[Tuple[int], ...]], default=1
             The number of transformer blocks inside each transformer-layer at each
-            decoder stage.
+            decoder stage. The argument can be given as a nested tuple, where each
+            value indicates the number of `SelfAttention`s inside a single
+            `TransformerLayer`, allowing different sized transformer blocks inside
+            each transformer-layer in the decoder.
         stage_params : Optional[Tuple[Dict, ...]], default=None
             The keyword args for each of the distinct decoder stages. Includes the
             parameters for the long skip connections, convolutional layers of the
             decoder and transformer layers themselves. See the `DecoderStage`
             documentation for more info.
+        style_channels : int, default=None
+            Number of style vector channels. If None, style vectors are ignored.
+            If `n_conv_layers` is None, this is ignored since style vectors are
+            applied inside `ConvBlock`s.

         Raises
         ------
         ValueError:
             If there is a mismatch between encoder and decoder channel lengths.
         """
         super().__init__()
-        self.long_skip = long_skip

         if not len(out_channels) == len(enc_channels):
             raise ValueError(
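
The reworked signature above accepts either a scalar or a per-stage tuple for the layer- and block-level arguments. A minimal usage sketch (the class name `Decoder` and the channel tuples below are illustrative assumptions, not shown in this diff):

    # Hypothetical instantiation; scalars are broadcast to every stage,
    # tuples are applied stage by stage.
    decoder = Decoder(
        enc_channels=(512, 256, 128, 64, 32),
        out_channels=(256, 128, 64, 32, 16),
        long_skip="unet",               # one long-skip method for every stage
        n_conv_layers=(1, 1, 1, 2, 2),  # per-stage conv-layer counts
        n_conv_blocks=2,                # broadcast to ((2,), (2,), (2,), (2, 2), (2, 2))
        n_transformers=None,            # no transformer layers in any stage
    )
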
@@ -70,66 +97,105 @@ def __init__(

         # scaling factor assumed to be 2 for the spatial dims and the input
         # has to be divisible by 32. 256 used here just for convenience.
-        depth = len(out_channels)
-        out_dims = [256 // 2**i for i in range(depth)][::-1]
+        self.depth = len(out_channels)
+        out_dims = [256 // 2**i for i in range(self.depth)][::-1]

-        # Build decoder
-        for i in range(depth - 1):
-            # number of conv layers
-            n_clayers = None
-            if n_conv_layers is not None:
-                n_clayers = n_conv_layers[i]
-
-            # number of conv blocks inside each layer
-            n_cblocks = None
-            if n_conv_blocks is not None:
-                n_cblocks = n_conv_blocks[i]
-
-            # number of transformer layers
-            n_tr_layers = None
-            if n_transformers is not None:
-                n_tr_layers = n_transformers[i]
-
-            # number of transformer blocks inside transformer layers
-            n_tr_blocks = None
-            if n_transformer_blocks is not None:
-                n_tr_blocks = n_transformer_blocks[i]
+        # set layer-level tuple-args
+        self.long_skips = self._layer_tuple(long_skip)
+        n_conv_layers = self._layer_tuple(n_conv_layers)
+        n_transformers = self._layer_tuple(n_transformers)

+        # set block-level tuple-args
+        n_conv_blocks = self._block_tuple(n_conv_blocks, n_conv_layers)
+        n_transformer_blocks = self._block_tuple(n_transformer_blocks, n_transformers)
+
+        # Build decoder
+        for i in range(self.depth - 1):
             decoder_block = DecoderStage(
                 stage_ix=i,
                 dec_channels=tuple(out_channels),
                 dec_dims=tuple(out_dims),
                 skip_channels=skip_channels,
+                long_skip=self._tup_arg(self.long_skips, i),
+                n_conv_layers=self._tup_arg(n_conv_layers, i),
+                n_conv_blocks=self._tup_arg(n_conv_blocks, i),
+                n_transformers=self._tup_arg(n_transformers, i),
+                n_transformer_blocks=self._tup_arg(n_transformer_blocks, i),
                 style_channels=style_channels,
-                long_skip=long_skip,
-                n_conv_layers=n_clayers,
-                n_conv_blocks=n_cblocks,
-                n_transformers=n_tr_layers,
-                n_transformer_blocks=n_tr_blocks,
                 **stage_params[i] if stage_params is not None else {"k": None},
             )
             self.add_module(f"decoder_stage{i + 1}", decoder_block)

         self.out_channels = decoder_block.out_channels

+    def _tup_arg(self, tup: Tuple[Any, ...], ix: int) -> Union[None, int, str]:
+        """Return None if given tuple-arg is None, else, return the value at ix."""
+        ret = None
+        if tup is not None:
+            ret = tup[ix]
+        return ret
+
+    def _layer_tuple(
+        self, arg: Union[None, str, int, Tuple[Any, ...]]
+    ) -> Union[None, Tuple[Any, ...]]:
+        """Return a non-nested tuple or None for layer-related arguments."""
+        ret = None
+        if isinstance(arg, (list, tuple)):
+            ret = tuple(arg)
+        elif isinstance(arg, (str, int)):
+            ret = tuple([arg] * self.depth)
+        elif arg is None:
+            ret = ret
+        else:
+            raise ValueError(
+                f"Given arg: {arg} should be None, str, int or a Tuple of ints or strs."
+            )
+
+        return ret
+
+    def _block_tuple(
+        self,
+        arg: Union[int, None, Tuple[Tuple[int, ...], ...]],
+        n_layers: Tuple[int, ...],
+    ) -> Union[None, Tuple[Tuple[int, ...], ...]]:
+        """Return a nested tuple or None for block-related arguments."""
+        ret = None
+        if isinstance(arg, (list, tuple)):
+            if not all([isinstance(a, (tuple, list)) for a in arg]):
+                raise ValueError(
+                    f"Given arg: {arg} should be a nested sequence. Got: {arg}."
+                )
+            ret = tuple(arg)
+        elif isinstance(arg, int):
+            if n_layers is not None:
+                ret = tuple([tuple([arg] * i) for i in n_layers])
+            else:
+                ret = None
+        elif arg is None:
+            ret = ret
+        else:
+            raise ValueError(f"Given arg: {arg} should be None, int or a nested tuple.")
+
+        return ret
+
     def forward_features(
         self, features: Tuple[torch.Tensor], style: torch.Tensor = None
     ) -> List[torch.Tensor]:
         """Forward pass of the decoder. Returns all the decoder stage feats."""
         head = features[0]
         skips = features[1:]
-        extra_skips = [head] if self.long_skip == "unet3p" else []
+        extra_skips = [head] if self.long_skips[0] == "unet3p" else []
         ret_feats = []

         x = head
-        for decoder_stage in self.values():
+        for i, decoder_stage in enumerate(self.values()):
             x, extra = decoder_stage(
                 x, skips=skips, extra_skips=extra_skips, style=style
             )

-            if self.long_skip == "unetpp":
+            if self.long_skips[i] == "unetpp":
                 extra_skips = extra
-            elif self.long_skip == "unet3p":
+            elif self.long_skips[i] == "unet3p":
                 extra_skips.append(x)

             ret_feats.append(x)
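
The new `_layer_tuple` and `_block_tuple` helpers normalize scalar arguments into per-stage (and per-layer) tuples before the stages are built. A self-contained sketch of the same broadcasting rules, simplified to the scalar case (the names and the depth value here are illustrative, not part of the library API):

    from typing import Any, Optional, Tuple, Union

    DEPTH = 5  # illustrative number of decoder stages, i.e. len(out_channels)

    def layer_tuple(arg: Union[None, str, int, Tuple[Any, ...]]) -> Optional[Tuple[Any, ...]]:
        """Broadcast a scalar layer-level arg to one value per decoder stage."""
        if arg is None:
            return None
        if isinstance(arg, (list, tuple)):
            return tuple(arg)
        return tuple([arg] * DEPTH)

    def block_tuple(arg: Optional[int], n_layers: Optional[Tuple[int, ...]]) -> Optional[Tuple[Tuple[int, ...], ...]]:
        """Broadcast a scalar block-level arg to one inner tuple per layer."""
        if arg is None or n_layers is None:
            return None
        return tuple(tuple([arg] * n) for n in n_layers)

    print(layer_tuple("unet"))             # ('unet', 'unet', 'unet', 'unet', 'unet')
    print(layer_tuple((1, 1, 1, 2, 2)))    # (1, 1, 1, 2, 2) – per-stage tuples pass through
    print(block_tuple(2, layer_tuple(1)))  # ((2,), (2,), (2,), (2,), (2,))
    print(block_tuple(1, None))            # None – no layers requested, no blocks built
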

cellseg_models_pytorch/decoders/decoder_stage.py

Lines changed: 40 additions & 33 deletions
@@ -16,33 +16,34 @@ def __init__(
         dec_channels: Tuple[int, ...],
         dec_dims: Tuple[int, ...],
         skip_channels: Tuple[int, ...],
-        style_channels: int = None,
-        n_conv_layers: int = 1,
-        n_conv_blocks: Tuple[int, ...] = (2,),
-        short_skips: Tuple[str, ...] = ("residual",),
-        expand_ratios: Tuple[float, float] = ((1.0, 1.0),),
-        block_types: Tuple[Tuple[str, ...], ...] = (("basic", "basic"),),
-        normalizations: Tuple[Tuple[str, ...], ...] = (("bn", "bn"),),
-        activations: Tuple[Tuple[str, ...], ...] = (("relu", "relu"),),
-        convolutions: Tuple[Tuple[str, ...], ...] = (("conv", "conv"),),
-        attentions: Tuple[Tuple[str, ...], ...] = ((None, "se"),),
-        preactivates: Tuple[Tuple[bool, ...], ...] = ((False, False),),
-        preattends: Tuple[Tuple[bool, ...], ...] = ((False, False),),
-        use_styles: Tuple[Tuple[bool, ...], ...] = ((False, False),),
-        kernel_sizes: Tuple[Tuple[int, ...]] = ((3, 3),),
-        groups: Tuple[Tuple[int, ...]] = ((1, 1),),
-        biases: Tuple[Tuple[bool, ...]] = ((False, False),),
-        layer_residual: bool = False,
-        upsampling: str = "fixed-unpool",
         long_skip: str = "unet",
         merge_policy: str = "sum",
         skip_params: Optional[Dict[str, Any]] = None,
+        upsampling: str = "fixed-unpool",
+        n_conv_layers: Optional[int] = 1,
+        style_channels: Optional[int] = None,
+        layer_residual: Optional[bool] = False,
+        n_conv_blocks: Optional[Tuple[int, ...]] = (2,),
+        short_skips: Optional[Tuple[str, ...]] = ("residual",),
+        expand_ratios: Optional[Tuple[float, float]] = ((1.0, 1.0),),
+        block_types: Optional[Tuple[Tuple[str, ...], ...]] = (("basic", "basic"),),
+        normalizations: Optional[Tuple[Tuple[str, ...], ...]] = (("bn", "bn"),),
+        activations: Optional[Tuple[Tuple[str, ...], ...]] = (("relu", "relu"),),
+        convolutions: Optional[Tuple[Tuple[str, ...], ...]] = (("conv", "conv"),),
+        attentions: Optional[Tuple[Tuple[str, ...], ...]] = ((None, "se"),),
+        preactivates: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
+        preattends: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
+        use_styles: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
+        kernel_sizes: Optional[Tuple[Tuple[int, ...]]] = ((3, 3),),
+        groups: Optional[Tuple[Tuple[int, ...]]] = ((1, 1),),
+        biases: Optional[Tuple[Tuple[bool, ...]]] = ((False, False),),
         n_transformers: Optional[int] = None,
         n_transformer_blocks: Optional[Tuple[int, ...]] = (1,),
         transformer_blocks: Optional[Tuple[Tuple[str, ...], ...]] = (("exact",),),
         transformer_computations: Optional[Tuple[Tuple[str, ...], ...]] = (("basic",),),
         transformer_biases: Optional[Tuple[Tuple[bool, ...], ...]] = ((False,),),
         transformer_dropouts: Optional[Tuple[Tuple[float, ...], ...]] = ((0.0,),),
+        transformer_layer_scales: Optional[Tuple[Tuple[bool, ...], ...]] = ((False,),),
         transformer_params: Optional[List[Dict[str, Any]]] = None,
         **kwargs,
     ) -> None:
@@ -67,15 +68,28 @@ def __init__(
         skip_channels : Tuple[int, ...]
             List of the number of channels in the encoder skip tensors. Ignored if
             `long_skip` == None.
+        long_skip : str, default="unet"
+            Long skip method to be used.
+            Allowed: "cross-attn", "unet", "unetpp", "unet3p", "unet3p-lite", None
+        merge_policy : str, default="sum"
+            The long skip merge policy. One of: "sum", "cat"
+        skip_params : Optional[Dict]
+            Extra keyword arguments for the skip-connection module. These depend
+            on the skip module. Refer to specific skip modules for more info.
+        upsampling : str, default="fixed-unpool"
+            Name of the upsampling method.
+        n_conv_layers : int, default=1
+            The number of conv layers inside one decoder stage.
         style_channels : int, default=None
             Number of style vector channels. If None, style vectors are ignored.
             Also, ignored if `n_conv_layers` is None.
-        n_conv_layers : int, default=1
-            The number of conv layers inside one decoder stage.
+        layer_residual : bool, optional, default=False
+            Apply a layer level residual short skip at each layer. I.e. x + layer(x).
+            Ignored if `n_conv_layers` is None.
         n_conv_blocks : Tuple[int, ...], default=(2,)
             Number of conv-blocks inside each conv layer. The tuple-length has to
             match `n_conv_layers`. Ignored if `n_conv_layers` is None.
-        short_skips : str, default=("residual", )
+        short_skips : str, optional, default=("residual", )
             The short skip methods used inside the conv layers. Ignored if
             `n_conv_layers` is None.
         expand_ratios : Tuple[float, ...], default=((1.0, 1.0),):
@@ -122,18 +136,6 @@ def __init__(
             Include bias terms in the convolution blocks.
             The tuple-length has to match `n_conv_layers`. Ignored if
             `n_conv_layers` is None.
-        upsampling : str, default="fixed-unpool"
-            Name of the upsampling method.
-        long_skip : str, default="unet"
-            long skip method to be used. One of: "unet", "unetpp", "unet3p",
-            "unet3p-lite", None,
-        merge_policy : str, default="sum"
-            The long skip merge policy. One of: "sum", "cat"
-        layer_residual : bool, default=False
-            Apply a layer level residual skip at each layer. I.e x + layer(x)
-        skip_params : Optional[Dict]
-            Extra keyword arguments for the skip-connection module. These depend
-            on the skip module. Refer to specific skip modules for more info.
         n_transformers : int, optional
             Number of self-attention transformers applied after the conv-layer.
             If this is None, no transformers will be added.
@@ -156,6 +158,9 @@ def __init__(
         transformer_dropouts : Tuple[Tuple[float, ...], ...], default=((0.0,),)
             Dropout probabilities in the transformer layers. Ignored if
             `n_transformers` is None.
+        transformer_layer_scales : Tuple[Tuple[bool, ...], ...], default=((False,),)
+            Flags for whether to use layer scales in the transformer layers. Ignored
+            if `n_transformers` is None.
         transformer_params : List[Dict[str, Any]]
             Extra keyword arguments for the transformer layers. Refer to
             `Transformer2D` module for more info. Ignored if `n_transformers`
@@ -211,6 +216,7 @@ def __init__(
                 n_transformer_blocks,
                 transformer_biases,
                 transformer_dropouts,
+                transformer_layer_scales,
             ),
         )

@@ -287,6 +293,7 @@ def __init__(
                 computation_types=transformer_computations[i],
                 biases=transformer_biases[i],
                 dropouts=transformer_dropouts[i],
+                layer_scales=transformer_layer_scales[i],
                 **transformer_params
                 if transformer_params is not None
                 else {"k": None},
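
For reference, the `transformer_layer_scales` flags added here toggle a layer-scale term in the transformer blocks. A minimal, generic sketch of the technique (CaiT-style learnable per-channel scaling of the residual branch; a sketch of the general idea, not the library's actual implementation):

    import torch
    import torch.nn as nn

    class LayerScale(nn.Module):
        """Learnable per-channel scaling of a residual branch (generic sketch)."""

        def __init__(self, dim: int, init_value: float = 1e-5) -> None:
            super().__init__()
            # small initial values keep the scaled branch close to identity at init
            self.gamma = nn.Parameter(init_value * torch.ones(dim))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # x: (B, N, dim) token features; scale each channel of the branch output
            return self.gamma * x

    # Inside a transformer block the residual update then becomes roughly
    #   x = x + layer_scale(attention(norm(x)))
    # instead of
    #   x = x + attention(norm(x))
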
