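Handle stochastic depth: compute drop-path rate schedules on CPU by passing device="cpu" to torch.linspace across timm models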

gau-nernst · gau-nernst · commit 3b0d330456a5 · 2024-12-02T09:02:35.000+08:00
diff --git a/timm/models/beit.py b/timm/models/beit.py
@@ -326,7 +326,7 @@ def __init__(
         else:
             self.rel_pos_bias = None
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             Block(
                 dim=embed_dim,
diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py
@@ -1113,7 +1113,7 @@ def create_byob_stages(
     feature_info = []
     block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks]
     depths = [sum([bc.d for bc in stage_bcs]) for stage_bcs in block_cfgs]
-    dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+    dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
     dilation = 1
     net_stride = stem_feat['reduction']
     prev_chs = stem_feat['num_chs']
diff --git a/timm/models/convit.py b/timm/models/convit.py
@@ -292,7 +292,7 @@ def __init__(
             self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
             trunc_normal_(self.pos_embed, std=.02)
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             Block(
                 dim=embed_dim,
diff --git a/timm/models/convnext.py b/timm/models/convnext.py
@@ -328,7 +328,7 @@ def __init__(
             stem_stride = 4
 
         self.stages = nn.Sequential()
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         stages = []
         prev_chs = dims[0]
         curr_stride = stem_stride
diff --git a/timm/models/crossvit.py b/timm/models/crossvit.py
@@ -351,7 +351,7 @@ def __init__(
         self.pos_drop = nn.Dropout(p=pos_drop_rate)
 
         total_depth = sum([sum(x[-2:]) for x in depth])
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth, device="cpu")]  # stochastic depth decay rule
         dpr_ptr = 0
         self.blocks = nn.ModuleList()
         for idx, block_cfg in enumerate(depth):
diff --git a/timm/models/cspnet.py b/timm/models/cspnet.py
@@ -569,7 +569,7 @@ def create_csp_stages(
     cfg_dict = asdict(cfg.stages)
     num_stages = len(cfg.stages.depth)
     cfg_dict['block_dpr'] = [None] * num_stages if not drop_path_rate else \
-        [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.stages.depth)).split(cfg.stages.depth)]
+        [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.stages.depth), device="cpu").split(cfg.stages.depth)]
     stage_args = [dict(zip(cfg_dict.keys(), values)) for values in zip(*cfg_dict.values())]
     block_kwargs = dict(
         act_layer=cfg.act_layer,
diff --git a/timm/models/davit.py b/timm/models/davit.py
@@ -554,7 +554,7 @@ def __init__(
         self.stem = Stem(in_chans, embed_dims[0], norm_layer=norm_layer)
         in_chs = embed_dims[0]
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         stages = []
         for stage_idx in range(num_stages):
             out_chs = embed_dims[stage_idx]
diff --git a/timm/models/edgenext.py b/timm/models/edgenext.py
@@ -342,7 +342,7 @@ def __init__(
 
         curr_stride = 4
         stages = []
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         in_chs = dims[0]
         for i in range(4):
             stride = 2 if curr_stride == 2 or i > 0 else 1
diff --git a/timm/models/efficientformer.py b/timm/models/efficientformer.py
@@ -385,7 +385,7 @@ def __init__(
         # stochastic depth decay rule
         self.num_stages = len(depths)
         last_stage = self.num_stages - 1
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         downsamples = downsamples or (False,) + (True,) * (self.num_stages - 1)
         stages = []
         self.feature_info = []
diff --git a/timm/models/efficientformer_v2.py b/timm/models/efficientformer_v2.py
@@ -542,7 +542,7 @@ def __init__(
         stride = 4
 
         num_stages = len(depths)
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         downsamples = downsamples or (False,) + (True,) * (len(depths) - 1)
         mlp_ratios = to_ntuple(num_stages)(mlp_ratios)
         stages = []
diff --git a/timm/models/eva.py b/timm/models/eva.py
@@ -472,7 +472,7 @@ def __init__(
         else:
             self.rope = None
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         block_fn = EvaBlockPostNorm if use_post_norm else EvaBlock
         self.blocks = nn.ModuleList([
             block_fn(
diff --git a/timm/models/fastvit.py b/timm/models/fastvit.py
@@ -1142,7 +1142,7 @@ def __init__(
         # Build the main stages of the network architecture
         prev_dim = embed_dims[0]
         scale = 1
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(layers)).split(layers)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(layers), device="cpu").split(layers)]
         stages = []
         for i in range(len(layers)):
             downsample = downsamples[i] or prev_dim != embed_dims[i]
diff --git a/timm/models/focalnet.py b/timm/models/focalnet.py
@@ -378,7 +378,7 @@ def __init__(
         )
         in_dim = embed_dim[0]
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu")]  # stochastic depth decay rule
         layers = []
         for i_layer in range(self.num_layers):
             out_dim = embed_dim[i_layer]
diff --git a/timm/models/gcvit.py b/timm/models/gcvit.py
@@ -419,7 +419,7 @@ def __init__(
             norm_layer=norm_layer
         )
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         stages = []
         for i in range(num_stages):
             last_stage = i == num_stages - 1
diff --git a/timm/models/hgnet.py b/timm/models/hgnet.py
@@ -447,7 +447,7 @@ def __init__(
         stages = []
         self.feature_info = []
         block_depths = [c[3] for c in stages_cfg]
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(block_depths)).split(block_depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(block_depths), device="cpu").split(block_depths)]
         for i, stage_config in enumerate(stages_cfg):
             in_chs, mid_chs, out_chs, block_num, downsample, light_block, kernel_size, layer_num = stage_config
             stages += [HighPerfGpuStage(
diff --git a/timm/models/hiera.py b/timm/models/hiera.py
@@ -516,7 +516,7 @@ def __init__(
         # Transformer blocks
         cur_stage = 0
         depth = sum(stages)
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         self.blocks = nn.ModuleList()
         self.feature_info = []
         for i in range(depth):
diff --git a/timm/models/hieradet_sam2.py b/timm/models/hieradet_sam2.py
@@ -325,7 +325,7 @@ def __init__(
         self.pos_embed = nn.Parameter(torch.zeros(1, embed_dim, *self.global_pos_size))
         self.pos_embed_window = nn.Parameter(torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0]))
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         cur_stage = 0
         self.blocks = nn.Sequential()
         self.feature_info = []
diff --git a/timm/models/inception_next.py b/timm/models/inception_next.py
@@ -282,7 +282,7 @@ def __init__(
             norm_layer(dims[0])
         )
 
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         prev_chs = dims[0]
         curr_stride = 4
         dilation = 1
diff --git a/timm/models/mambaout.py b/timm/models/mambaout.py
@@ -337,7 +337,7 @@ def __init__(
             norm_layer=norm_layer,
         )
         prev_dim = dims[0]
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         cur = 0
         curr_stride = 4
         self.stages = nn.Sequential()
diff --git a/timm/models/metaformer.py b/timm/models/metaformer.py
@@ -523,7 +523,7 @@ def __init__(
 
         stages = []
         prev_dim = dims[0]
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         for i in range(self.num_stages):
             stages += [MetaFormerStage(
                 prev_dim,
diff --git a/timm/models/nest.py b/timm/models/nest.py
@@ -344,7 +344,7 @@ def __init__(
 
         # Build up each hierarchical level
         levels = []
-        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         prev_dim = None
         curr_stride = 4
         for i in range(len(self.num_blocks)):
diff --git a/timm/models/nextvit.py b/timm/models/nextvit.py
@@ -498,7 +498,7 @@ def __init__(
         in_chs = out_chs = stem_chs[-1]
         stages = []
         idx = 0
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         for stage_idx in range(len(depths)):
             stage = NextStage(
                 in_chs=in_chs,
diff --git a/timm/models/pvt_v2.py b/timm/models/pvt_v2.py
@@ -312,7 +312,7 @@ def __init__(
             embed_dim=embed_dims[0],
         )
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         cur = 0
         prev_dim = embed_dims[0]
         stages = []
diff --git a/timm/models/swin_transformer.py b/timm/models/swin_transformer.py
@@ -645,7 +645,7 @@ def __init__(
             window_size = (window_size,) * self.num_layers
         assert len(window_size) == self.num_layers
         mlp_ratio = to_ntuple(self.num_layers)(mlp_ratio)
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         layers = []
         in_dim = embed_dim[0]
         scale = 1
diff --git a/timm/models/swin_transformer_v2.py b/timm/models/swin_transformer_v2.py
@@ -656,7 +656,7 @@ def __init__(
         )
         grid_size = self.patch_embed.grid_size
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         layers = []
         in_dim = embed_dim[0]
         scale = 1
diff --git a/timm/models/swin_transformer_v2_cr.py b/timm/models/swin_transformer_v2_cr.py
@@ -686,7 +686,7 @@ def __init__(
         else:
             self.window_size = to_2tuple(window_size)
 
-        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu").split(depths)]
         stages = []
         in_dim = embed_dim
         in_scale = 1
diff --git a/timm/models/tiny_vit.py b/timm/models/tiny_vit.py
@@ -448,7 +448,7 @@ def __init__(
         )
 
         # stochastic depth rate rule
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu")]
 
         # build stages
         self.stages = nn.Sequential()
diff --git a/timm/models/tnt.py b/timm/models/tnt.py
@@ -240,7 +240,7 @@ def __init__(
         self.pixel_pos = nn.Parameter(torch.zeros(1, inner_dim, new_patch_size[0], new_patch_size[1]))
         self.pos_drop = nn.Dropout(p=pos_drop_rate)
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         blocks = []
         for i in range(depth):
             blocks.append(Block(
diff --git a/timm/models/twins.py b/timm/models/twins.py
@@ -326,7 +326,7 @@ def __init__(
 
         self.blocks = nn.ModuleList()
         self.feature_info = []
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths), device="cpu")]  # stochastic depth decay rule
         cur = 0
         for k in range(len(depths)):
             _block = nn.ModuleList([block_cls(
diff --git a/timm/models/visformer.py b/timm/models/visformer.py
@@ -202,7 +202,7 @@ def __init__(
         self.use_pos_embed = use_pos_embed
         self.grad_checkpointing = False
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]
         # stage 1
         if self.vit_stem:
             self.stem = None
diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
@@ -539,8 +539,7 @@ def __init__(
             self.patch_drop = nn.Identity()
         self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
 
-        with torch.device("cpu"):
-            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         self.blocks = nn.Sequential(*[
             block_fn(
                 dim=embed_dim,
diff --git a/timm/models/vision_transformer_relpos.py b/timm/models/vision_transformer_relpos.py
@@ -317,7 +317,7 @@ def __init__(
 
         self.cls_token = nn.Parameter(torch.zeros(1, self.num_prefix_tokens, embed_dim)) if class_token else None
 
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]  # stochastic depth decay rule
         self.blocks = nn.ModuleList([
             block_fn(
                 dim=embed_dim,
diff --git a/timm/models/vision_transformer_sam.py b/timm/models/vision_transformer_sam.py
@@ -450,7 +450,7 @@ def __init__(
             self.rope_window = None
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth, device="cpu")]
         self.blocks = nn.Sequential(*[
             block_fn(
                 dim=embed_dim,