@@ -1164,8 +1164,10 @@ def __init__(
1164
1164
1165
1165
# For segmentation and detection, extract intermediate output
1166
1166
if self .fork_feat :
1167
- # add a norm layer for each output
1168
- self .out_indices = [0 , 2 , 4 , 6 ]
1167
+ # Add a norm layer for each output. self.stages differs slightly from self.network in the
1168
+ # original code: here the PatchEmbed layer is part of self.stages, whereas in the original
1169
+ # code it was part of self.network. So we do not need to skip out indices.
1170
+ self .out_indices = [0 , 1 , 2 , 3 ]
1169
1171
for i_emb , i_layer in enumerate (self .out_indices ):
1170
1172
if i_emb == 0 and os .environ .get ("FORK_LAST3" , None ):
1171
1173
"""For RetinaNet, `start_level=1`. The first norm layer will not used.
@@ -1416,4 +1418,4 @@ def fastvit_ma36(pretrained=False, **kwargs):
1416
1418
pos_embs = (None , None , None , partial (RepConditionalPosEnc , spatial_shape = (7 , 7 ))),
1417
1419
token_mixers = ("repmixer" , "repmixer" , "repmixer" , "attention" )
1418
1420
)
1419
- return _create_fastvit ('fastvit_ma36' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
1421
+ return _create_fastvit ('fastvit_ma36' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
0 commit comments