-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Tuple
 
 import torch
 import torch.nn as nn
 
-from .dino_vit import build_dinov2_encoder
-from .histo_encoder import build_histo_encoder
+from .encoder_upsampler import EncoderUpsampler
 from .timm_encoder import TimmEncoder
-from .unettr_encoder import EncoderUnetTR
-from .vit_det_SAM import build_sam_encoder
 
 __all__ = ["Encoder"]
 
 
-TR_ENCODERS = {
-    "histo_encoder_prostate_s": build_histo_encoder,
-    "histo_encoder_prostate_m": build_histo_encoder,
-    "sam_vit_l": build_sam_encoder,
-    "sam_vit_b": build_sam_encoder,
-    "sam_vit_h": build_sam_encoder,
-    "dinov2_vit_small": build_dinov2_encoder,
-    "dinov2_vit_base": build_dinov2_encoder,
-    "dinov2_vit_large": build_dinov2_encoder,
-    "dinov2_vit_giant": build_dinov2_encoder,
-}
-
-
 class Encoder(nn.Module):
     def __init__(
         self,
-        name: str,
-        pretrained: bool = False,
-        checkpoint_path: str = None,
-        in_channels: int = 3,
-        depth: int = 4,
-        out_indices: Tuple[int] = None,
-        unettr_kwargs: Dict[str, Any] = None,
-        **kwargs,
+        timm_encoder_name: str,
+        timm_encoder_out_indices: Tuple[int, ...],
+        pixel_decoder_out_channels: Tuple[int, ...],
+        timm_encoder_pretrained: bool = True,
+        timm_extra_kwargs: Dict[str, Any] = None,
     ) -> None:
-        """Wrap timm conv-based encoders and transformer-based encoders to one class.
-
-        NOTE: Refer to the docstring of the `TimmEncoder` and `EncoderUnetTR` for the
-        input key-word arguments (**kwargs).
+        """Wrap a timm encoder and an optional feature upsampler into one class.
 
         Parameters
         ----------
-        name : str
-            Name of the encoder. If the name is in `TR_ENCODERS.keys()`, a transformer
-            will be used. Otherwise, a timm encoder will be used.
-        pretrained : bool, optional, default=False
-            If True, load imagenet pretrained weights, by default False.
-        checkpoint_path : str, optional
-            Path to the weights of the encoder. If None, the encoder is initialized
-            with imagenet pre-trained weights if `enc_pretrain` argument is set to True
-            or with random weights if set to False. Defaults to None.
-        in_channels : int, optional
-            Number of input channels, by default 3.
-        depth : int, optional
-            Number of output features, by default 4. Ignored for transformer encoders.
-        out_indices : Tuple[int], optional
-            Indices of the output features, by default None. If None,
-            out_indices is set to range(len(depth)). Overrides the `depth` argument.
-        unettr_kwargs : Dict[str, Any]
-            Key-word arguments for the transformer encoder. These arguments are used
-            only if the encoder is transformer based. Refer to the docstring of the
-            `EncoderUnetTR`
-        **kwargs : Dict[str, Any]
-            Key-word arguments for any `timm` based encoder. These arguments are used
-            in `timm.create_model(**kwargs)` function call.
+        timm_encoder_name : str
+            Name of the timm encoder, passed to `timm.create_model`.
+        timm_encoder_out_indices : Tuple[int, ...]
+            Indices of the encoder stages whose features are returned.
+        pixel_decoder_out_channels : Tuple[int, ...]
+            Number of output channels at each upsampling stage. Used only
+            when the encoder is isotropic (ViT-like) and gets wrapped in an
+            `EncoderUpsampler`.
+        timm_encoder_pretrained : bool, optional, default=True
+            If True, load pretrained timm weights, by default True.
+        timm_extra_kwargs : Dict[str, Any], optional
+            Extra key-word arguments forwarded to the `timm.create_model` call.
         """
         super().__init__()
 
-        if name not in TR_ENCODERS.keys():
-            self.encoder = TimmEncoder(
-                name,
-                pretrained=pretrained,
-                checkpoint_path=checkpoint_path,
-                in_channels=in_channels,
-                depth=depth,
-                out_indices=out_indices,
-                **kwargs,
-            )
-        else:
-            self.encoder = EncoderUnetTR(
-                backbone=TR_ENCODERS[name](
-                    name,
-                    pretrained=pretrained,
-                    checkpoint_path=checkpoint_path,
-                ),
-                **unettr_kwargs if unettr_kwargs is not None else {},
+        # initialize the timm encoder
+        self.encoder = TimmEncoder(
+            timm_encoder_name,
+            pretrained=timm_encoder_pretrained,
+            out_indices=timm_encoder_out_indices,
+            extra_kwargs=timm_extra_kwargs or {},
+        )
+
+        # if every stage shares one reduction factor, the encoder is a ViT with
+        # isotropic features, so wrap it in an upsampler to build a feature pyramid
+        reductions = [finfo["reduction"] for finfo in self.encoder.feature_info]
+        if all(reduction == reductions[0] for reduction in reductions):
+            self.encoder = EncoderUpsampler(
+                backbone=self.encoder,
+                out_channels=pixel_decoder_out_channels,
             )
 
-        self.out_channels = self.encoder.out_channels
-        self.feature_info = self.encoder.feature_info
+        # reverse to deepest-first order, as expected by the decoder
+        self.out_channels = [f["num_chs"] for f in self.encoder.feature_info][::-1]
+        self.feature_info = self.encoder.feature_info[::-1]
 
-    def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+    def forward(
+        self, x: torch.Tensor
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         """Forward pass of the encoder and return all the features."""
-        return self.encoder(x)
+        output, feats = self.encoder(x)
+        return output, feats[::-1]
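
For reference, a minimal usage sketch of the new API. The import path, the timm model name, and the channel/shape values below are illustrative assumptions, not part of this commit, and the behavior of `TimmEncoder`/`EncoderUpsampler` is inferred from the diff:

import torch

from my_models.encoders import Encoder  # hypothetical import path

# A ViT backbone exposes isotropic features (the same reduction factor at
# every stage), so Encoder wraps it in an EncoderUpsampler automatically.
encoder = Encoder(
    timm_encoder_name="samvit_base_patch16",  # assumed timm model name
    timm_encoder_out_indices=(0, 1, 2, 3),
    pixel_decoder_out_channels=(256, 128, 64, 32),
    timm_encoder_pretrained=False,
)

x = torch.rand(1, 3, 1024, 1024)
output, feats = encoder(x)

# Features come back deepest-first, aligned with encoder.out_channels.
for feat, n_channels in zip(feats, encoder.out_channels):
    assert feat.shape[1] == n_channels

The `[::-1]` reversals in `__init__` and `forward` put both the channel list and the returned features in deepest-first order, which is presumably the order the downstream pixel decoder consumes the skip features in.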