-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Tuple
 
 import torch
 import torch.nn as nn
 
-from .dino_vit import build_dinov2_encoder
-from .histo_encoder import build_histo_encoder
+from .encoder_upsampler import EncoderUpsampler
 from .timm_encoder import TimmEncoder
-from .unettr_encoder import EncoderUnetTR
-from .vit_det_SAM import build_sam_encoder
 
 __all__ = ["Encoder"]
 
 
-TR_ENCODERS = {
-    "histo_encoder_prostate_s": build_histo_encoder,
-    "histo_encoder_prostate_m": build_histo_encoder,
-    "sam_vit_l": build_sam_encoder,
-    "sam_vit_b": build_sam_encoder,
-    "sam_vit_h": build_sam_encoder,
-    "dinov2_vit_small": build_dinov2_encoder,
-    "dinov2_vit_base": build_dinov2_encoder,
-    "dinov2_vit_large": build_dinov2_encoder,
-    "dinov2_vit_giant": build_dinov2_encoder,
-}
-
-
 class Encoder(nn.Module):
     def __init__(
         self,
-        name: str,
-        pretrained: bool = False,
-        checkpoint_path: str = None,
-        in_channels: int = 3,
-        depth: int = 4,
-        out_indices: Tuple[int] = None,
-        unettr_kwargs: Dict[str, Any] = None,
-        **kwargs,
+        timm_encoder_name: str,
+        timm_encoder_out_indices: Tuple[int, ...],
+        pixel_decoder_out_channels: Tuple[int, ...],
+        timm_encoder_pretrained: bool = True,
+        timm_extra_kwargs: Dict[str, Any] = None,
     ) -> None:
-        """Wrap timm conv-based encoders and transformer-based encoders to one class.
-
-        NOTE: Refer to the docstring of the `TimmEncoder` and `EncoderUnetTR` for the
-        input key-word arguments (**kwargs).
+        """Wrap a timm encoder and an optional feature upsampler into one class.
 
         Parameters
         ----------
-        name : str
-            Name of the encoder. If the name is in `TR_ENCODERS.keys()`, a transformer
-            will be used. Otherwise, a timm encoder will be used.
-        pretrained : bool, optional, default=False
-            If True, load imagenet pretrained weights, by default False.
-        checkpoint_path : str, optional
-            Path to the weights of the encoder. If None, the encoder is initialized
-            with imagenet pre-trained weights if `enc_pretrain` argument is set to True
-            or with random weights if set to False. Defaults to None.
-        in_channels : int, optional
-            Number of input channels, by default 3.
-        depth : int, optional
-            Number of output features, by default 4. Ignored for transformer encoders.
-        out_indices : Tuple[int], optional
-            Indices of the output features, by default None. If None,
-            out_indices is set to range(len(depth)). Overrides the `depth` argument.
-        unettr_kwargs : Dict[str, Any]
-            Key-word arguments for the transformer encoder. These arguments are used
-            only if the encoder is transformer based. Refer to the docstring of the
-            `EncoderUnetTR`
-        **kwargs : Dict[str, Any]
-            Key-word arguments for any `timm` based encoder. These arguments are used
-            in `timm.create_model(**kwargs)` function call.
+        timm_encoder_name : str
+            Name of the timm encoder, passed to `timm.create_model`.
+        timm_encoder_out_indices : Tuple[int, ...]
+            Indices of the encoder stages whose features are returned.
+        pixel_decoder_out_channels : Tuple[int, ...]
+            Number of output channels at each upsampling stage. Used only
+            when the encoder is isotropic (ViT-like) and gets wrapped in an
+            `EncoderUpsampler`.
+        timm_encoder_pretrained : bool, optional, default=True
+            If True, load pretrained timm weights, by default True.
+        timm_extra_kwargs : Dict[str, Any], optional
+            Extra key-word arguments forwarded to the `timm.create_model` call.
         """
         super().__init__()
 
-        if name not in TR_ENCODERS.keys():
-            self.encoder = TimmEncoder(
-                name,
-                pretrained=pretrained,
-                checkpoint_path=checkpoint_path,
-                in_channels=in_channels,
-                depth=depth,
-                out_indices=out_indices,
-                **kwargs,
-            )
-        else:
-            self.encoder = EncoderUnetTR(
-                backbone=TR_ENCODERS[name](
-                    name,
-                    pretrained=pretrained,
-                    checkpoint_path=checkpoint_path,
-                ),
-                **unettr_kwargs if unettr_kwargs is not None else {},
+        # initialize the timm encoder
+        self.encoder = TimmEncoder(
+            timm_encoder_name,
+            pretrained=timm_encoder_pretrained,
+            out_indices=timm_encoder_out_indices,
+            extra_kwargs=timm_extra_kwargs or {},
+        )
+
+        # if every stage shares one reduction factor, the encoder is a ViT with
+        # isotropic features, so wrap it in an upsampler to build a feature pyramid
+        reductions = [finfo["reduction"] for finfo in self.encoder.feature_info]
+        if all(reduction == reductions[0] for reduction in reductions):
+            self.encoder = EncoderUpsampler(
+                backbone=self.encoder,
+                out_channels=pixel_decoder_out_channels,
             )
 
-        self.out_channels = self.encoder.out_channels
-        self.feature_info = self.encoder.feature_info
+        # reverse to deepest-first order, as expected by the decoder
+        self.out_channels = [f["num_chs"] for f in self.encoder.feature_info][::-1]
+        self.feature_info = self.encoder.feature_info[::-1]
 
-    def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, ...]]:
+    def forward(
+        self, x: torch.Tensor
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
         """Forward pass of the encoder and return all the features."""
-        return self.encoder(x)
+        output, feats = self.encoder(x)
+        return output, feats[::-1]
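
For reference, a minimal usage sketch of the new API. The import path, the timm model name, and the channel/shape values below are illustrative assumptions, not part of this commit, and the behavior of `TimmEncoder`/`EncoderUpsampler` is inferred from the diff:

import torch

from my_models.encoders import Encoder  # hypothetical import path

# A ViT backbone exposes isotropic features (the same reduction factor at
# every stage), so Encoder wraps it in an EncoderUpsampler automatically.
encoder = Encoder(
    timm_encoder_name="samvit_base_patch16",  # assumed timm model name
    timm_encoder_out_indices=(0, 1, 2, 3),
    pixel_decoder_out_channels=(256, 128, 64, 32),
    timm_encoder_pretrained=False,
)

x = torch.rand(1, 3, 1024, 1024)
output, feats = encoder(x)

# Features come back deepest-first, aligned with encoder.out_channels.
for feat, n_channels in zip(feats, encoder.out_channels):
    assert feat.shape[1] == n_channels

The `[::-1]` reversals in `__init__` and `forward` put both the channel list and the returned features in deepest-first order, which is presumably the order the downstream pixel decoder consumes the skip features in.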