Skip to content

Commit 372db65

Browse files
committed
feat(models): add param enc_out_indices to all model classes. Enables selecting the specific encoder features by their indices.
1 parent bc946c7 commit 372db65

File tree

5 files changed

+80
-80
lines changed

5 files changed

+80
-80
lines changed

cellseg_models_pytorch/models/cellpose/cellpose.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __init__(
3434
enc_name: str = "resnet50",
3535
enc_pretrain: bool = True,
3636
enc_freeze: bool = False,
37+
enc_out_indices: Tuple[int, ...] = None,
3738
upsampling: str = "fixed-unpool",
3839
long_skip: str = "unet",
3940
merge_policy: str = "sum",
@@ -95,6 +96,9 @@ def __init__(
9596
Whether to use imagenet pretrained weights in the encoder.
9697
enc_freeze : bool, default=False
9798
Freeze encoder weights for training.
99+
enc_out_indices : Tuple[int, ...], optional
100+
Indices of the output features from the encoder. If None, indices are
101+
set to `range(depth)`
98102
upsampling : str, default="fixed-unpool"
99103
The upsampling method. One of: "fixed-unpool", "bilinear", "nearest",
100104
"conv_transpose", "bicubic"
@@ -147,8 +151,17 @@ def __init__(
147151
self.aux_key = self._check_decoder_args(decoders, ("omnipose", "cellpose"))
148152
self.inst_key = inst_key
149153
self._check_head_args(heads, decoders)
154+
155+
if enc_out_indices is None:
156+
enc_out_indices = tuple(range(depth))
157+
150158
self._check_depth(
151-
depth, {"out_channels": out_channels, "layer_depths": layer_depths}
159+
depth,
160+
{
161+
"out_channels": out_channels,
162+
"layer_depths": layer_depths,
163+
"enc_out_indices": enc_out_indices,
164+
},
152165
)
153166

154167
self.enc_freeze = enc_freeze
@@ -177,23 +190,9 @@ def __init__(
177190
for d in decoders
178191
}
179192

180-
# set encoder
181-
# self.encoder = Encoder(
182-
# enc_name,
183-
# depth=depth,
184-
# pretrained=enc_pretrain,
185-
# checkpoint_path=kwargs.get("checkpoint_path", None),
186-
# unettr_kwargs={ # Only used for transformer encoders
187-
# "convolution": convolution,
188-
# "activation": activation,
189-
# "normalization": normalization,
190-
# "attention": attention,
191-
# },
192-
# **encoder_params if encoder_params is not None else {},
193-
# )
194193
self.encoder = Encoder(
195194
timm_encoder_name=enc_name,
196-
timm_encoder_out_indices=tuple(range(depth)),
195+
timm_encoder_out_indices=enc_out_indices,
197196
pixel_decoder_out_channels=out_channels,
198197
timm_encoder_pretrained=enc_pretrain,
199198
timm_extra_kwargs=encoder_params,

cellseg_models_pytorch/models/cellvit/cellvit.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,13 @@ def __init__(
2828
heads: Dict[str, Dict[str, int]],
2929
inst_key: str = "inst",
3030
out_channels: Tuple[int, ...] = (512, 256, 128, 64),
31-
encoder_out_channels: Tuple[int, ...] = (512, 512, 256, 128),
3231
layer_depths: Tuple[int, ...] = (3, 2, 2, 2),
3332
style_channels: int = None,
3433
enc_name: str = "sam_vit_b",
3534
enc_pretrain: bool = True,
3635
enc_freeze: bool = False,
36+
enc_out_channels: Tuple[int, ...] = None,
37+
enc_out_indices: Tuple[int, ...] = None,
3738
long_skip: str = "unet",
3839
merge_policy: str = "cat",
3940
short_skip: str = "basic",
@@ -74,8 +75,6 @@ def __init__(
7475
inst_key : str, default="inst"
7576
The key for the model output that will be used in the instance
7677
segmentation post-processing pipeline as the binary segmentation result.
77-
encoder_out_channels : Tuple[int, ...], default=(512, 512, 256, 128)
78-
Out channels for each SAM-UnetTR encoder stage.
7978
out_channels : Tuple[int, ...], default=(256, 256, 64, 64)
8079
Out channels for each decoder stage.
8180
layer_depths : Tuple[int, ...], default=(4, 4, 4, 4)
@@ -88,6 +87,11 @@ def __init__(
8887
Whether to use imagenet pretrained weights in the encoder.
8988
enc_freeze : bool, default=False
9089
Freeze encoder weights for training.
90+
enc_out_channels : Tuple[int, ...], default=None
91+
Out channels for each SAM-UnetTR encoder stage.
92+
enc_out_indices : Tuple[int, ...], default=None
93+
Indices of the output features from the encoder. If None,
94+
indices are set to `range(len(layer_depths))`.
9195
long_skip : str, default="unet"
9296
long skip method to be used. One of: "unet", "unetpp", "unet3p",
9397
"unet3p-lite", None
@@ -133,9 +137,23 @@ def __init__(
133137
self.out_size = out_size
134138
self.aux_key = self._check_decoder_args(decoders, ("hovernet",))
135139
self.inst_key = inst_key
136-
self.depth = 4
140+
self.depth = len(layer_depths)
137141
self._check_head_args(heads, decoders)
138-
self._check_depth(self.depth, {"out_channels": out_channels})
142+
143+
if enc_out_indices is None:
144+
enc_out_indices = tuple(range(self.depth))
145+
146+
if enc_out_channels is None:
147+
enc_out_channels = out_channels
148+
149+
self._check_depth(
150+
self.depth,
151+
{
152+
"out_channels": out_channels,
153+
"enc_out_indices": enc_out_indices,
154+
"enc_out_channels": enc_out_channels,
155+
},
156+
)
139157

140158
self.add_stem_skip = add_stem_skip
141159
self.enc_freeze = enc_freeze
@@ -175,21 +193,11 @@ def __init__(
175193
f"Allowed encoder for CellVit: {allowed}"
176194
)
177195

178-
# set encoder
179-
# self.encoder = EncoderUnetTR(
180-
# backbone=build_sam_encoder(name=enc_name, pretrained=enc_pretrain),
181-
# out_channels=encoder_out_channels,
182-
# up_method="conv_transpose",
183-
# convolution=convolution,
184-
# activation=activation,
185-
# normalization=normalization,
186-
# attention=attention,
187-
# )
188-
196+
# set encoders
189197
self.encoder = Encoder(
190198
timm_encoder_name=enc_name,
191-
timm_encoder_out_indices=tuple(range(len(encoder_out_channels))),
192-
pixel_decoder_out_channels=encoder_out_channels,
199+
timm_encoder_out_indices=enc_out_indices,
200+
pixel_decoder_out_channels=enc_out_channels,
193201
timm_encoder_pretrained=enc_pretrain,
194202
timm_extra_kwargs=encoder_params,
195203
)

cellseg_models_pytorch/models/cppnet/cppnet.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def __init__(
3535
enc_name: str = "resnet50",
3636
enc_pretrain: bool = True,
3737
enc_freeze: bool = False,
38+
enc_out_indices: Tuple[int, ...] = None,
3839
upsampling: str = "conv_transpose",
3940
long_skip: str = "unet",
4041
merge_policy: str = "cat",
@@ -94,6 +95,9 @@ def __init__(
9495
Whether to use imagenet pretrained weights in the encoder.
9596
enc_freeze : bool, default=False
9697
Freeze encoder weights for training.
98+
enc_out_indices : Optional[Tuple[int]], default=None
99+
Indices of the encoder output features. If None, these are set to
100+
`range(depth)`.
97101
upsampling : str, default="fixed-unpool"
98102
The upsampling method to be used. One of: "fixed-unpool", "nearest",
99103
"bilinear", "bicubic", "conv_transpose"
@@ -150,7 +154,14 @@ def __init__(
150154
self.aux_key = "stardist_refined"
151155
self.inst_key = inst_key
152156
self._check_head_args(heads, decoders)
153-
self._check_depth(depth, {"out_channels": out_channels})
157+
158+
if enc_out_indices is None:
159+
enc_out_indices = tuple(range(depth))
160+
161+
self._check_depth(
162+
depth,
163+
{"out_channels": out_channels, "enc_out_indices": enc_out_indices},
164+
)
154165

155166
self.add_stem_skip = add_stem_skip
156167
self.enc_freeze = enc_freeze
@@ -179,22 +190,9 @@ def __init__(
179190
}
180191

181192
# set encoder
182-
# self.encoder = Encoder(
183-
# enc_name,
184-
# depth=depth,
185-
# pretrained=enc_pretrain,
186-
# checkpoint_path=kwargs.get("checkpoint_path", None),
187-
# unettr_kwargs={ # Only used for transformer encoders
188-
# "convolution": convolution,
189-
# "activation": activation,
190-
# "normalization": normalization,
191-
# "attention": attention,
192-
# },
193-
# **encoder_params if encoder_params is not None else {},
194-
# )
195193
self.encoder = Encoder(
196194
timm_encoder_name=enc_name,
197-
timm_encoder_out_indices=tuple(range(depth)),
195+
timm_encoder_out_indices=enc_out_indices,
198196
pixel_decoder_out_channels=out_channels,
199197
timm_encoder_pretrained=enc_pretrain,
200198
timm_extra_kwargs=encoder_params,

cellseg_models_pytorch/models/hovernet/hovernet.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(
3333
enc_name: str = "resnet50",
3434
enc_pretrain: bool = True,
3535
enc_freeze: bool = False,
36+
enc_out_indices: Tuple[int, ...] = None,
3637
upsampling: str = "fixed-unpool",
3738
long_skip: str = "unet",
3839
merge_policy: str = "sum",
@@ -91,6 +92,9 @@ def __init__(
9192
Whether to use imagenet pretrained weights in the encoder.
9293
enc_freeze : bool, default=False
9394
Freeze encoder weights for training.
95+
enc_out_indices : Tuple[int, ...], optional
96+
Indices of the encoder output features. If None, indices are set to
97+
`range(depth)`.
9498
upsampling : str, default="fixed-unpool"
9599
The upsampling method to be used. One of: "fixed-unpool", "nearest",
96100
"bilinear", "bicubic", "conv_transpose"
@@ -143,7 +147,14 @@ def __init__(
143147
self.aux_key = self._check_decoder_args(decoders, ("hovernet",))
144148
self.inst_key = inst_key
145149
self._check_head_args(heads, decoders)
146-
self._check_depth(depth, {"out_channels": out_channels})
150+
151+
if enc_out_indices is None:
152+
enc_out_indices = tuple(range(depth))
153+
154+
self._check_depth(
155+
depth,
156+
{"out_channels": out_channels, "enc_out_indices": enc_out_indices},
157+
)
147158

148159
self.add_stem_skip = add_stem_skip
149160
self.enc_freeze = enc_freeze
@@ -172,23 +183,9 @@ def __init__(
172183
}
173184

174185
# set encoder
175-
# self.encoder = Encoder(
176-
# enc_name,
177-
# depth=depth,
178-
# pretrained=enc_pretrain,
179-
# checkpoint_path=kwargs.get("checkpoint_path", None),
180-
# unettr_kwargs={ # Only used for transformer encoders
181-
# "convolution": convolution,
182-
# "activation": activation,
183-
# "normalization": normalization,
184-
# "attention": attention,
185-
# },
186-
# **encoder_params if encoder_params is not None else {},
187-
# )
188-
189186
self.encoder = Encoder(
190187
timm_encoder_name=enc_name,
191-
timm_encoder_out_indices=tuple(range(depth)),
188+
timm_encoder_out_indices=enc_out_indices,
192189
pixel_decoder_out_channels=out_channels,
193190
timm_encoder_pretrained=enc_pretrain,
194191
timm_extra_kwargs=encoder_params,

cellseg_models_pytorch/models/stardist/stardist.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(
2828
enc_name: str = "resnet50",
2929
enc_pretrain: bool = True,
3030
enc_freeze: bool = False,
31+
enc_out_indices: Tuple[int, ...] = None,
3132
upsampling: str = "fixed-unpool",
3233
long_skip: str = "unet",
3334
merge_policy: str = "cat",
@@ -91,6 +92,9 @@ def __init__(
9192
Whether to use imagenet pretrained weights in the encoder.
9293
enc_freeze : bool, default=False
9394
Freeze encoder weights for training.
95+
enc_out_indices : Tuple[int, ...], optional
96+
Indices of the encoder output features. If None, indices are set to
97+
`range(depth)`.
9498
upsampling : str, default="fixed-unpool"
9599
The upsampling method. One of: "fixed-unpool", "nearest", "bilinear",
96100
"bicubic", "conv_transpose"
@@ -147,7 +151,14 @@ def __init__(
147151
self.inst_key = inst_key
148152
self._check_head_args(extra_convs, decoders)
149153
self._check_head_args(heads, self._get_inner_keys(extra_convs))
150-
self._check_depth(depth, {"out_channels": out_channels})
154+
155+
if enc_out_indices is None:
156+
enc_out_indices = tuple(range(depth))
157+
158+
self._check_depth(
159+
depth,
160+
{"out_channels": out_channels, "enc_out_indices": enc_out_indices},
161+
)
151162

152163
self.enc_freeze = enc_freeze
153164
use_style = style_channels is not None
@@ -177,22 +188,9 @@ def __init__(
177188
}
178189

179190
# set encoder
180-
# self.encoder = Encoder(
181-
# enc_name,
182-
# depth=depth,
183-
# pretrained=enc_pretrain,
184-
# checkpoint_path=kwargs.get("checkpoint_path", None),
185-
# unettr_kwargs={ # Only used for transformer encoders, ignored otherwise
186-
# "convolution": convolution,
187-
# "activation": activation,
188-
# "normalization": normalization,
189-
# "attention": attention,
190-
# },
191-
# **encoder_params if encoder_params is not None else {},
192-
# )
193191
self.encoder = Encoder(
194192
timm_encoder_name=enc_name,
195-
timm_encoder_out_indices=tuple(range(depth)),
193+
timm_encoder_out_indices=enc_out_indices,
196194
pixel_decoder_out_channels=out_channels,
197195
timm_encoder_pretrained=enc_pretrain,
198196
timm_extra_kwargs=encoder_params,

0 commit comments

Comments
 (0)