fix(models): modify seg models to use the new encoder API

okunator · okunator · commit a4387576c0bd · 2024-07-02T17:23:41.000+03:00
diff --git a/cellseg_models_pytorch/models/base/_base_model.py b/cellseg_models_pytorch/models/base/_base_model.py
@@ -17,15 +17,15 @@ def forward_features(
 
         NOTE: Returns both encoder and decoder features, not style.
         """
-        feats = self.forward_encoder(x)
+        enc_output, feats = self.forward_encoder(x)
         style = self.forward_style(feats[0])
         dec_feats = self.forward_dec_features(feats, style)
 
         # final input resolution skip connection
         if self.add_stem_skip:
             dec_feats = self.forward_stem_skip(x, dec_feats)
 
-        return feats, dec_feats
+        return enc_output, feats, dec_feats
 
     def forward_stem_skip(
         self, x: torch.Tensor, dec_feats: Dict[str, torch.Tensor]
@@ -38,12 +38,14 @@ def forward_stem_skip(
 
         return dec_feats
 
-    def forward_encoder(self, x: torch.Tensor) -> List[torch.Tensor]:
+    def forward_encoder(
+        self, x: torch.Tensor
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
         """Forward the model encoder."""
         self._check_input_shape(x)
-        feats = self.encoder(x)
+        output, feats = self.encoder(x)
 
-        return feats
+        return output, feats
 
     def forward_style(self, feat: torch.Tensor) -> torch.Tensor:
         """Forward the style domain adaptation layer.
diff --git a/cellseg_models_pytorch/models/base/_multitask_unet.py b/cellseg_models_pytorch/models/base/_multitask_unet.py
@@ -37,7 +37,6 @@ def __init__(
         out_size: Optional[int] = None,
         stem_params: Dict[str, Any] = None,
         encoder_params: Optional[Dict] = None,
-        unettr_kwargs: Optional[Dict] = None,
         **kwargs,
     ) -> None:
         """Create a universal multi-task (2D) unet.
@@ -52,9 +51,9 @@ def __init__(
                 Names of the decoder branches (has to match `decoders`) mapped to dicts
                  of output name - number of output classes. E.g.
                 {"cellpose": {"type": 4, "cellpose": 2}, "sem": {"sem": 5}}
-            out_channels : Tuple[int, ...]
+            out_channels : Dict[str, Dict[str, int]]
                 Out channels for each decoder stage.
-            long_skips : Dict[str, str]
+            long_skips : Dict[str, Union[str, Tuple[str, ...]]]
                 Dictionary mapping decoder branch-names to tuples defining the long skip
                 method to be used inside each of the decoder stages.
                 Allowed: "cross-attn", "unet", "unetpp", "unet3p", "unet3p-lite", None
@@ -118,13 +117,20 @@ def __init__(
         self.add_stem_skip = add_stem_skip
 
         # set encoder
+        # self.encoder = Encoder(
+        #     enc_name,
+        #     depth=depth,
+        #     pretrained=enc_pretrain,
+        #     checkpoint_path=kwargs.get("checkpoint_path", None),
+        #     unettr_kwargs=unettr_kwargs,
+        #     **encoder_params if encoder_params is not None else {},
+        # )
         self.encoder = Encoder(
-            enc_name,
-            depth=depth,
-            pretrained=enc_pretrain,
-            checkpoint_path=kwargs.get("checkpoint_path", None),
-            unettr_kwargs=unettr_kwargs,
-            **encoder_params if encoder_params is not None else {},
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(depth)),
+            pixel_decoder_out_channels=tuple(out_channels.values())[0],
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -202,7 +208,7 @@ def from_yaml(cls, yaml_path: str) -> nn.Module:
 
     def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
         """Forward pass of Multi-task U-net."""
-        feats = self.forward_encoder(x)
+        _, feats = self.forward_encoder(x)
         style = self.forward_style(feats[0])
         dec_feats = self.forward_dec_features(feats, style)
         out = self.forward_heads(dec_feats)
diff --git a/cellseg_models_pytorch/models/cellpose/cellpose.py b/cellseg_models_pytorch/models/cellpose/cellpose.py
@@ -178,18 +178,25 @@ def __init__(
         }
 
         # set encoder
+        # self.encoder = Encoder(
+        #     enc_name,
+        #     depth=depth,
+        #     pretrained=enc_pretrain,
+        #     checkpoint_path=kwargs.get("checkpoint_path", None),
+        #     unettr_kwargs={  # Only used for transformer encoders
+        #         "convolution": convolution,
+        #         "activation": activation,
+        #         "normalization": normalization,
+        #         "attention": attention,
+        #     },
+        #     **encoder_params if encoder_params is not None else {},
+        # )
         self.encoder = Encoder(
-            enc_name,
-            depth=depth,
-            pretrained=enc_pretrain,
-            checkpoint_path=kwargs.get("checkpoint_path", None),
-            unettr_kwargs={  # Only used for transformer encoders
-                "convolution": convolution,
-                "activation": activation,
-                "normalization": normalization,
-                "attention": attention,
-            },
-            **encoder_params if encoder_params is not None else {},
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(depth)),
+            pixel_decoder_out_channels=out_channels,
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -286,7 +293,7 @@ def forward(
             returns also the encoder features in a list, decoder features as a dict
             mapping decoder names to outputs and the final head outputs dict.
         """
-        feats, dec_feats = self.forward_features(x)
+        _, feats, dec_feats = self.forward_features(x)
         out = self.forward_heads(dec_feats)
 
         if return_feats:
diff --git a/cellseg_models_pytorch/models/cellvit/cellvit.py b/cellseg_models_pytorch/models/cellvit/cellvit.py
@@ -5,8 +5,7 @@
 
 from cellseg_models_pytorch.decoders import UnetDecoder
 from cellseg_models_pytorch.decoders.long_skips import StemSkip
-from cellseg_models_pytorch.encoders import EncoderUnetTR
-from cellseg_models_pytorch.encoders.vit_det_SAM import build_sam_encoder
+from cellseg_models_pytorch.encoders import Encoder
 from cellseg_models_pytorch.modules.misc_modules import StyleReshape
 
 from ..base._base_model import BaseMultiTaskSegModel
@@ -47,6 +46,7 @@ def __init__(
         add_stem_skip: Optional[bool] = True,
         out_size: Optional[int] = None,
         skip_params: Optional[Dict] = None,
+        encoder_params: Optional[Dict] = None,
         **kwargs,
     ) -> None:
         """Create a CellVit model.
@@ -163,21 +163,35 @@ def __init__(
             for d in decoders
         }
 
-        if enc_name not in ("sam_vit_b", "sam_vit_l", "sam_vit_h"):
+        allowed = (
+            "samvit_base_patch16",
+            "samvit_base_patch16_224",
+            "samvit_huge_patch16",
+            "samvit_large_patch16",
+        )
+        if enc_name not in allowed:
             raise ValueError(
                 f"Wrong encoder name. Got: {enc_name}. "
-                "Allowed encoder for CellVit: sam_vit_b, sam_vit_l, sam_vit_h."
+                f"Allowed encoder for CellVit: {allowed}"
             )
 
         # set encoder
-        self.encoder = EncoderUnetTR(
-            backbone=build_sam_encoder(name=enc_name, pretrained=enc_pretrain),
-            out_channels=encoder_out_channels,
-            up_method="conv_transpose",
-            convolution=convolution,
-            activation=activation,
-            normalization=normalization,
-            attention=attention,
+        # self.encoder = EncoderUnetTR(
+        #     backbone=build_sam_encoder(name=enc_name, pretrained=enc_pretrain),
+        #     out_channels=encoder_out_channels,
+        #     up_method="conv_transpose",
+        #     convolution=convolution,
+        #     activation=activation,
+        #     normalization=normalization,
+        #     attention=attention,
+        # )
+
+        self.encoder = Encoder(
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(len(encoder_out_channels))),
+            pixel_decoder_out_channels=encoder_out_channels,
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -275,7 +289,7 @@ def forward(
             returns also the encoder features in a list, decoder features as a dict
             mapping decoder names to outputs and the final head outputs dict.
         """
-        feats, dec_feats = self.forward_features(x)
+        _, feats, dec_feats = self.forward_features(x)
         out = self.forward_heads(dec_feats)
 
         if return_feats:
diff --git a/cellseg_models_pytorch/models/cppnet/cppnet.py b/cellseg_models_pytorch/models/cppnet/cppnet.py
@@ -179,18 +179,25 @@ def __init__(
         }
 
         # set encoder
+        # self.encoder = Encoder(
+        #     enc_name,
+        #     depth=depth,
+        #     pretrained=enc_pretrain,
+        #     checkpoint_path=kwargs.get("checkpoint_path", None),
+        #     unettr_kwargs={  # Only used for transformer encoders
+        #         "convolution": convolution,
+        #         "activation": activation,
+        #         "normalization": normalization,
+        #         "attention": attention,
+        #     },
+        #     **encoder_params if encoder_params is not None else {},
+        # )
         self.encoder = Encoder(
-            enc_name,
-            depth=depth,
-            pretrained=enc_pretrain,
-            checkpoint_path=kwargs.get("checkpoint_path", None),
-            unettr_kwargs={  # Only used for transformer encoders
-                "convolution": convolution,
-                "activation": activation,
-                "normalization": normalization,
-                "attention": attention,
-            },
-            **encoder_params if encoder_params is not None else {},
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(depth)),
+            pixel_decoder_out_channels=out_channels,
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -358,7 +365,7 @@ def forward(
             returns also the encoder features in a list, decoder features as a dict
             mapping decoder names to outputs and the final head outputs dict.
         """
-        feats, dec_feats = self.forward_features(x)
+        _, feats, dec_feats = self.forward_features(x)
         out = self.forward_heads(dec_feats)
 
         # cppnet specific
diff --git a/cellseg_models_pytorch/models/hovernet/hovernet.py b/cellseg_models_pytorch/models/hovernet/hovernet.py
@@ -172,18 +172,26 @@ def __init__(
         }
 
         # set encoder
+        # self.encoder = Encoder(
+        #     enc_name,
+        #     depth=depth,
+        #     pretrained=enc_pretrain,
+        #     checkpoint_path=kwargs.get("checkpoint_path", None),
+        #     unettr_kwargs={  # Only used for transformer encoders
+        #         "convolution": convolution,
+        #         "activation": activation,
+        #         "normalization": normalization,
+        #         "attention": attention,
+        #     },
+        #     **encoder_params if encoder_params is not None else {},
+        # )
+
         self.encoder = Encoder(
-            enc_name,
-            depth=depth,
-            pretrained=enc_pretrain,
-            checkpoint_path=kwargs.get("checkpoint_path", None),
-            unettr_kwargs={  # Only used for transformer encoders
-                "convolution": convolution,
-                "activation": activation,
-                "normalization": normalization,
-                "attention": attention,
-            },
-            **encoder_params if encoder_params is not None else {},
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(depth)),
+            pixel_decoder_out_channels=out_channels,
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -281,7 +289,7 @@ def forward(
             returns also the encoder features in a list, decoder features as a dict
             mapping decoder names to outputs and the final head outputs dict.
         """
-        feats, dec_feats = self.forward_features(x)
+        _, feats, dec_feats = self.forward_features(x)
         out = self.forward_heads(dec_feats)
 
         if return_feats:
diff --git a/cellseg_models_pytorch/models/stardist/stardist.py b/cellseg_models_pytorch/models/stardist/stardist.py
@@ -177,18 +177,25 @@ def __init__(
         }
 
         # set encoder
+        # self.encoder = Encoder(
+        #     enc_name,
+        #     depth=depth,
+        #     pretrained=enc_pretrain,
+        #     checkpoint_path=kwargs.get("checkpoint_path", None),
+        #     unettr_kwargs={  # Only used for transformer encoders, ignored otherwise
+        #         "convolution": convolution,
+        #         "activation": activation,
+        #         "normalization": normalization,
+        #         "attention": attention,
+        #     },
+        #     **encoder_params if encoder_params is not None else {},
+        # )
         self.encoder = Encoder(
-            enc_name,
-            depth=depth,
-            pretrained=enc_pretrain,
-            checkpoint_path=kwargs.get("checkpoint_path", None),
-            unettr_kwargs={  # Only used for transformer encoders, ignored otherwise
-                "convolution": convolution,
-                "activation": activation,
-                "normalization": normalization,
-                "attention": attention,
-            },
-            **encoder_params if encoder_params is not None else {},
+            timm_encoder_name=enc_name,
+            timm_encoder_out_indices=tuple(range(depth)),
+            pixel_decoder_out_channels=out_channels,
+            timm_encoder_pretrained=enc_pretrain,
+            timm_extra_kwargs=encoder_params,
         )
 
         # get the reduction factors for the encoder
@@ -315,7 +322,7 @@ def forward(
             returns also the encoder features in a list, decoder features as a dict
             mapping decoder names to outputs and the final head outputs dict.
         """
-        feats, dec_feats = self.forward_features(x)
+        _, feats, dec_feats = self.forward_features(x)
 
         if return_feats:
             ret_dec_feats = dec_feats.copy()
diff --git a/changelog.d/20240702_171210_oskari.lehtonen.md b/changelog.d/20240702_171210_oskari.lehtonen.md
@@ -0,0 +1,15 @@
+## Breaking changes
+- Drop support for Python 3.9.
+
+## Chore
+- Update timm version to above 1.0.0.
+
+## Features
+- Image encoders are now provided exclusively by timm models.
+
+## Removed
+- The original SAM and DINOv2 image-encoder implementations were removed from this repo. Both are now available as timm models.
+
+## Examples
+- Updated example notebooks.
+- Added new example notebooks utilizing UNI and CONCH encoders from the huggingface model hub.